## Unsupervised Classification Analysis
### Marc Boulet
### Team Transparency, CSE 6242 Spring 2020


This notebook takes Team Transparency's merged dataset of country features and attempts to rank countries using unsupervised methods:
- k-means clustering  
- hierarchical clustering  
- DBSCAN  
- PCA of grouped attributes

In [37]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# from sklearn.cluster import KMeans
# from sklearn import metrics
# from sklearn.cluster import DBSCAN
# from sklearn.cluster import AgglomerativeClustering
# from scipy.cluster.hierarchy import linkage, dendrogram

In [104]:
# Load the merged country/year feature table from disk.
# `merged` keeps the data exactly as read; `df` is an independent working copy
# so later cells can transform it without touching the raw frame.
# NOTE(review): the first CSV column is read in as 'Unnamed: 0' (visible in the
# head() output) -- it looks like a saved row index; confirm whether it should
# be excluded (e.g. via index_col=0) before it is averaged with the features.
merged_csv_path = '../data/converted/merged.csv'
merged = pd.read_csv(merged_csv_path)
df = merged.copy(deep=True)
df.head()

Unnamed: 0,year,gii_human_capital,country_iso,gii_domestic_credit,mobile_subscriptions,broadband_subscriptions_per100,gii_ict_services_imports,diversity_ethnicFractionalization,diversity_linguisticFractionalization,diversity_religiousFractionalization,...,gii_wikipedia_edits,gii_ict_access,gii_stem_assessment,poverty_ratio,foreign_investment,total_hospital_density_per_100k,gii_university_industry,gii_innovation_output,gdp_per_capta_usd,global_innovation_index
0,1960,,ABW,,0.0,,,,,,...,,,,,,,,,,
1,1961,,ABW,,,,,,,,...,,,,,,,,,,
2,1962,,ABW,,,,,,,,...,,,,,,,,,,
3,1963,,ABW,,,,,,,,...,,,,,,,,,,
4,1964,,ABW,,,,,,,,...,,,,,,,,,,


### Data conditioning  

In [105]:
# (disabled) remove all years prior to 2000 -- the filter below is commented out, so all years are kept
#df.drop(df[df.year < 2000].index, inplace = True)

In [106]:
# Collapse the per-year rows into a single row per country by averaging every
# column over all available years (outlier years are deliberately kept so that
# their effect shows up in the mean), then discard the averaged 'year' column.
df_mean = (
    df.groupby('country_iso', as_index=False)
      .mean()
      .drop(columns=['year'])
)

In [111]:
# Impute missing values with the median of each feature, taken across countries.
# numeric_only=True restricts the median to numeric columns: df_mean still
# carries the string 'country_iso' column, and DataFrame.median() raises a
# TypeError on non-numeric columns in pandas >= 2.0 (older versions skipped
# them with a deprecation warning). Columns that are absent from the median
# Series are simply left untouched by fillna.
feature_medians = df_mean.median(axis=0, numeric_only=True)
# Reassign instead of mutating in place so re-running the cell is harmless.
df_mean = df_mean.fillna(feature_medians)

In [114]:
# number of nulls per feature column (axis=0 sums down each column; use axis=1 for per-country counts)
#df_mean.isnull().sum(axis=0)

In [103]:
# Persist the per-country table as CSV; index=False keeps the row index out of the file.
output_csv_path = '../data/converted/df_mean.csv'
df_mean.to_csv(output_csv_path, index=False)

### k-means clustering

### divide features into cultural and non-cultural factors

In [91]:
# Columns treated as non-cultural factors. 'country_iso' is carried along as
# the country identifier next to the feature columns.
non_cultural_columns = [
    'country_iso',
    'gii_domestic_credit',
    'mobile_subscriptions',
    'broadband_subscriptions_per100',
    'gii_ict_services_imports',
    'diversity_ethnicFractionalization',
    'diversity_linguisticFractionalization',
    'diversity_religiousFractionalization',
    'literacy_rate',
    'electrification',
    'rural_population',
    'school_enrollment_tertiary',
    'gii_rule_of_law',
    'population',
    'mortality_rate',
    'secure_internet_servers',
    'gii_institutions',
    'gii_top_level_domains',
    'gii_research_talent_in_business',
    'life_expectancy',
    'ease_of_business',
    'gii_ict_access',
    'gii_stem_assessment',
    'poverty_ratio',
    'foreign_investment',
    'total_hospital_density_per_100k',
    'gii_university_industry',
    'gdp_per_capta_usd',  # (sic) spelling matches the column name in the data
]
non_cultural = df_mean[non_cultural_columns]
# Display (rows, columns) as a quick sanity check on the selection.
non_cultural.shape

(271, 28)

In [92]:
# Columns treated as cultural factors. 'country_iso' is carried along as the
# country identifier next to the feature columns.
cultural_columns = [
    'country_iso',
    'gii_human_capital',
    'gii_scientific_publications',
    'rd_in_gdp',
    'creative_svc_audiovisual',
    'creative_svc_other_personal_cultural_recreational',
    'creative_svc_advertising_mktresearch_polling',
    'creative_svc_architectural_engineering_technical',
    'creative_svc_personal_cultural_recreational',
    'creative_svc_research',
    'gii_creative_services',
    'gii_patent_applications',
    'feature_films_produced',
    'gii_patent_families',
    'gii_mobile_apps',
    'cultural_occupation',
    'gii_creative_goods',
    'gii_wikipedia_edits',
    'gii_innovation_output',
    'global_innovation_index',
]
cultural = df_mean[cultural_columns]
# Display (rows, columns) as a quick sanity check on the selection.
cultural.shape

(271, 20)

In [None]:
noncu