In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [64]:
DATA_PATH = '~/UNI/sem3/DS/Project/data'
DATA_PREF = ['Fat_Supply_Quantity','Food_Supply_kcal','Food_Supply_Quantity_kg', 'Protein_Supply_Quantity']
DATA_DESCR = 'Supply_Food_Data_Descriptions'
TARGET_COLUMNS = ['Obesity','Undernourished', 'Confirmed',
'Deaths',
'Recovered',
'Active',
'Population',
'Unit (all except Population)']
def load_data(data_path=DATA_PATH, data_prefixes=DATA_PREF, data_description=DATA_DESCR):
    dfs = []
    target = None
    desc = pd.read_csv(os.path.join(data_path, data_description + '.csv')).set_index('Categories')
    

    for prefix in data_prefixes:
        df = pd.read_csv(os.path.join(data_path, prefix + '_Data.csv')).set_index('Country')
        
        target = df[TARGET_COLUMNS]
        df.drop(columns=TARGET_COLUMNS, inplace=True)
        
        df.columns = ['_'.join([i, prefix]) for i in df.columns]
        
        for cat in desc.index:
            new_cats = desc.loc[cat].str.split(pat=';')['Items']
            
            for new_cat in new_cats:
                df['_'.join([new_cat, prefix])] = df['_'.join([cat, prefix])]
        dfs.append(df)
    
    return pd.concat(dfs), target.drop(columns=TARGET_COLUMNS[-1])

In [65]:
data = load_data()

In [66]:
data[1]
    
    

Unnamed: 0_level_0,Obesity,Undernourished,Confirmed,Deaths,Recovered,Active,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Afghanistan,4.5,29.8,0.039969,0.000676,0.003491,0.035803,38042000.0
Albania,22.3,6.2,0.039783,0.001155,0.030511,0.008118,2858000.0
Algeria,26.6,3.9,0.021642,0.001504,0.013242,0.006895,43406000.0
Angola,6.8,25,0.000274,0.000013,0.000057,0.000204,31427000.0
Antigua and Barbuda,19.1,,0.026804,0.003093,0.019588,0.004124,97000.0
...,...,...,...,...,...,...,...
Venezuela (Bolivarian Republic of),25.2,21.2,0.005295,0.000049,0.001059,0.004187,28516000.0
Vietnam,2.1,9.3,0.000343,0.000000,0.000292,0.000051,95656000.0
Yemen,14.1,38.9,0.001108,0.000274,0.000048,0.000785,29162000.0
Zambia,6.5,46.7,0.005918,0.000039,0.004361,0.001517,17861000.0


In [122]:
def clean_data(data):
    
    threshold = 0.85
    #Dropping columns with missing value rate higher than threshold
    data = data[data.columns[data.isnull().mean() < threshold]]

    #Dropping rows with missing value rate higher than threshold
    data = data.loc[data.isnull().mean(axis=1) < threshold]
    for c in data.columns:
        #object -> float 
        if data[c].dtype == 'object':
            data[c] = data[c].str.replace("<","")
            if len(set(data[c])) <= 3:
                data.drop(columns=c, inplace=True)
            else:
                data[c] = data[c].astype('float64')

    #fill missing values with -1
    return data.fillna(-1)
            

In [123]:
new_X = clean_data(data[0])

In [127]:
new_X

Unnamed: 0_level_0,Alcoholic Beverages_Fat_Supply_Quantity,Animal Products_Fat_Supply_Quantity,Animal fats_Fat_Supply_Quantity,"Aquatic Products, Other_Fat_Supply_Quantity",Cereals - Excluding Beer_Fat_Supply_Quantity,Eggs_Fat_Supply_Quantity,"Fish, Seafood_Fat_Supply_Quantity",Fruits - Excluding Wine_Fat_Supply_Quantity,Meat_Fat_Supply_Quantity,Miscellaneous_Fat_Supply_Quantity,...,Cassava and products_Protein_Supply_Quantity,Cloves_Protein_Supply_Quantity,Cocoa Beans and products_Protein_Supply_Quantity,Coconut Oil_Protein_Supply_Quantity,Coconuts - Incl Copra_Protein_Supply_Quantity,Honey_Protein_Supply_Quantity,Infant food_Protein_Supply_Quantity,Nuts and products_Protein_Supply_Quantity,Onions_Protein_Supply_Quantity,Sugar beet_Protein_Supply_Quantity
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.0,21.6397,6.2224,0.0,8.0353,0.6859,0.0327,0.4246,6.1244,0.0163,...,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000
Albania,0.0,32.0002,3.4172,0.0,2.6734,1.6448,0.1445,0.6418,8.7428,0.0170,...,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000
Algeria,0.0,14.4175,0.8972,0.0,4.2035,1.2171,0.2008,0.5772,3.8961,0.0439,...,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000
Angola,0.0,15.3041,1.3130,0.0,6.5545,0.1539,1.4155,0.3488,11.0268,0.0308,...,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000
Antigua and Barbuda,0.0,27.7033,4.6686,0.0,3.2153,0.3872,1.5263,1.2177,14.3202,0.0898,...,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venezuela (Bolivarian Republic of),-1.0,-1.0000,-1.0000,-1.0,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,...,27.4545,27.4545,27.4545,27.4545,27.4545,27.4545,27.4545,27.4545,27.4545,27.4545
Vietnam,-1.0,-1.0000,-1.0000,-1.0,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,...,29.5617,29.5617,29.5617,29.5617,29.5617,29.5617,29.5617,29.5617,29.5617,29.5617
Yemen,-1.0,-1.0000,-1.0000,-1.0,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,...,39.9831,39.9831,39.9831,39.9831,39.9831,39.9831,39.9831,39.9831,39.9831,39.9831
Zambia,-1.0,-1.0000,-1.0000,-1.0,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,-1.0000,...,40.1117,40.1117,40.1117,40.1117,40.1117,40.1117,40.1117,40.1117,40.1117,40.1117


In [72]:
for c in new_X.columns:
    assert new_X[c].dtype == 'float64'
        


In [95]:
from sklearn.decomposition import PCA
def apply_PCA(data, n_components=100):
    pca = PCA(n_components=n_components)
    pca.fit(data.to_numpy())
    
    print (pca.explained_variance_ratio_)
    return pca.transform(data)
    
    

In [96]:
pca_X = apply_PCA(new_X)

[4.04687942e-01 3.19543511e-01 2.47788437e-01 1.49650834e-02
 6.56003631e-03 2.88641114e-03 1.56978900e-03 2.63688764e-04
 2.32130564e-04 2.28338531e-04 1.88099647e-04 1.76999000e-04
 1.21688710e-04 1.19188481e-04 8.95680123e-05 8.48106810e-05
 7.29160420e-05 5.36545038e-05 4.99881844e-05 3.28898689e-05
 3.14498963e-05 3.00583472e-05 2.35785568e-05 2.20236499e-05
 2.19603620e-05 2.02994819e-05 1.66440946e-05 1.46182756e-05
 1.32074025e-05 1.21235442e-05 1.09333057e-05 8.62974528e-06
 7.64057342e-06 7.30825747e-06 5.43758395e-06 5.07242143e-06
 4.20966278e-06 2.88445081e-06 2.63515025e-06 2.60529265e-06
 2.39181561e-06 2.18874965e-06 2.16779203e-06 1.64322511e-06
 1.58481042e-06 1.41394306e-06 1.21186011e-06 1.13761012e-06
 9.48127427e-07 9.30555567e-07 6.59651892e-07 4.97316696e-07
 4.68644971e-07 4.63799420e-07 4.43053409e-07 3.68187547e-07
 3.10559179e-07 3.09300382e-07 2.55377644e-07 2.42265583e-07
 2.36780087e-07 2.11101934e-07 2.10985464e-07 2.04886211e-07
 1.79082302e-07 1.455198

In [98]:
pca_X.describe()

AttributeError: 'numpy.ndarray' object has no attribute 'describe'