In [283]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, median_absolute_error, r2_score
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, ElasticNetCV
from sklearn.feature_selection import RFE
from scipy.stats import mannwhitneyu

In [68]:
merged = pd.read_csv('../../data/converted/merged.csv')
df = merged.copy()
# remove all data prior to 2000 - considering up to 19 years of data before the gii score
df.drop(df[df.year < 2000].index, inplace = True)

In [69]:
# calculate feature mean over all years available (using mean to include outlier effects) 
df_mean = df.groupby('country_iso', as_index=False).mean()
# remove year column
df_mean.drop(['year'], axis = 1, inplace = True)

In [70]:
df_gii = df_mean.dropna(subset=['gii_innovation_output'])

In [77]:
df_creative = df_gii[['country_iso',
                    'gii_human_capital',
                    'gii_scientific_publications',
                    'rd_in_gdp',
                    'creative_svc_audiovisual',
                    'creative_svc_other_personal_cultural_recreational',
                    'creative_svc_advertising_mktresearch_polling',
                    'creative_svc_architectural_engineering_technical',
                    'creative_svc_personal_cultural_recreational',
                    'creative_svc_research',
                    'gii_creative_services', 
                    'gii_patent_applications',
                    'feature_films_produced',
                    'gii_patent_families',
                    'gii_mobile_apps',
                    'cultural_occupation',
                    'gii_creative_goods',
                    'gii_wikipedia_edits',
                    'gii_innovation_output',
                    ]]

In [86]:
creative_svc = df_creative.columns.str.contains('creative_svc')
df_creative.columns[creative_svc]

Index(['creative_svc_audiovisual',
       'creative_svc_other_personal_cultural_recreational',
       'creative_svc_advertising_mktresearch_polling',
       'creative_svc_architectural_engineering_technical',
       'creative_svc_personal_cultural_recreational', 'creative_svc_research'],
      dtype='object')

In [87]:
df_clean = df_creative.dropna(subset=df_creative.columns[creative_svc])

In [143]:
df_predict = df_gii[df_gii.columns[(df_gii.count()/df_gii.index.size)>=0.95].values]

In [144]:
df_predict.count()

country_iso                              129
gii_human_capital                        129
gii_domestic_credit                      126
mobile_subscriptions                     129
broadband_subscriptions_per100           129
gii_ict_services_imports                 128
diversity_ethnicFractionalization        127
diversity_linguisticFractionalization    127
diversity_religiousFractionalization     127
electrification                          129
gii_scientific_publications              128
rural_population                         126
school_enrollment_tertiary               123
gii_rule_of_law                          129
population                               129
mortality_rate                           128
secure_internet_servers                  129
gii_institutions                         129
gii_top_level_domains                    129
gii_patent_applications                  126
gii_patent_families                      123
gii_creative_goods                       128
life_expec

In [145]:
df_predict = df_predict.fillna(df_predict.median(axis = 0))

In [146]:
df_predict.count()

country_iso                              129
gii_human_capital                        129
gii_domestic_credit                      129
mobile_subscriptions                     129
broadband_subscriptions_per100           129
gii_ict_services_imports                 129
diversity_ethnicFractionalization        129
diversity_linguisticFractionalization    129
diversity_religiousFractionalization     129
electrification                          129
gii_scientific_publications              129
rural_population                         129
school_enrollment_tertiary               129
gii_rule_of_law                          129
population                               129
mortality_rate                           129
secure_internet_servers                  129
gii_institutions                         129
gii_top_level_domains                    129
gii_patent_applications                  129
gii_patent_families                      129
gii_creative_goods                       129
life_expec

In [147]:
target = df_predict[['gii_innovation_output']]
df_predict.drop(columns=['gii_innovation_output', 'global_innovation_index', 'country_iso'], inplace=True)

In [148]:
X_train, X_test, y_train, y_test = train_test_split(df_predict, target, test_size=0.25, random_state=42)

In [149]:
clf = RandomForestRegressor(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [150]:
y_predict = clf.predict(X_test)

In [151]:
mean_absolute_error(y_test, y_predict)

4.327769150842988

In [154]:
target['gii_innovation_output'].describe()

count    129.000000
mean      27.278295
std       12.683121
min        6.400000
25%       17.700000
50%       23.500000
75%       35.600000
max       63.500000
Name: gii_innovation_output, dtype: float64

In [155]:
y_predict_insample = clf.predict(X_train)

In [156]:
mean_absolute_error(y_train, y_predict_insample)

3.5440551313355235

In [158]:
median_absolute_error(y_train, y_predict_insample)

3.2027055569947356

In [159]:
median_absolute_error(y_test, y_predict)

3.9441501370984895

In [163]:
r2_score(y_train, y_predict_insample)

0.8744593939458895

In [174]:
features = pd.Series(clf.feature_importances_, df_predict.columns)

In [176]:
features.sort_values(ascending=False)

gii_patent_families                      0.480469
broadband_subscriptions_per100           0.331990
secure_internet_servers                  0.061532
gii_university_industry                  0.046316
gii_human_capital                        0.028398
gii_wikipedia_edits                      0.017147
gii_patent_applications                  0.014874
mobile_subscriptions                     0.010941
gii_creative_goods                       0.008333
gii_ict_services_imports                 0.000000
rural_population                         0.000000
diversity_ethnicFractionalization        0.000000
diversity_linguisticFractionalization    0.000000
diversity_religiousFractionalization     0.000000
gii_domestic_credit                      0.000000
electrification                          0.000000
gii_scientific_publications              0.000000
gdp_per_capta_usd                        0.000000
school_enrollment_tertiary               0.000000
gii_rule_of_law                          0.000000


In [235]:
from sklearn.preprocessing import StandardScaler
linear = Lasso()
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

linear.fit(X_train_scaled, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [236]:
y_predict_insample = linear.predict(scaler.transform(X_train))

In [237]:
median_absolute_error(y_train, y_predict_insample)

2.5651159352986284

In [238]:
y_predict = linear.predict(scaler.transform(X_test))
median_absolute_error(y_test, y_predict)

4.184744461907044

In [239]:
features = pd.Series(linear.coef_, df_predict.columns)

In [240]:
features.sort_values(ascending=False)

gii_human_capital                        2.040268
secure_internet_servers                  1.645089
gii_creative_goods                       1.627713
gii_scientific_publications              1.449664
broadband_subscriptions_per100           1.301935
gii_university_industry                  1.275459
gii_top_level_domains                    1.110268
gii_wikipedia_edits                      1.050898
gii_patent_applications                  1.027994
gii_rule_of_law                          0.626620
electrification                          0.553245
gii_patent_families                      0.385486
population                               0.310550
gii_ict_services_imports                 0.173078
gii_domestic_credit                      0.151501
mortality_rate                          -0.000000
gii_institutions                         0.000000
life_expectancy                          0.000000
school_enrollment_tertiary               0.000000
rural_population                        -0.000000


In [241]:
linear.n_iter_

119

In [246]:
X = df_predict
y = target
model = LinearRegression()
#Initializing RFE model
rfe = RFE(model, 7)
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  
#Fitting the data to model
model.fit(X_rfe,y)
print(rfe.support_)
print(rfe.ranking_)

[False False False  True  True  True  True False False False False False
  True False False False False False False False  True False False False
 False False  True False]
[ 9 16 17  1  1  1  1 15  5  4 13 18  1 22 11 19  8 14  6  2  1 12 20 10
  3  7  1 21]


  y = column_or_1d(y, warn=True)


In [247]:
#no of features
nof_list=np.arange(1,13)            
high_score=0
#Variable to store the optimum features
nof=0           
score_list =[]
for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)
    model = LinearRegression()
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe,y_train)
    score = model.score(X_test_rfe,y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

Optimum number of features: 8
Score with 8 features: 0.797479


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [248]:
cols = list(X.columns)
model = LinearRegression()
#Initializing RFE model
rfe = RFE(model, 8)             
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  
#Fitting the data to model
model.fit(X_rfe,y)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

Index(['broadband_subscriptions_per100', 'gii_ict_services_imports',
       'diversity_ethnicFractionalization',
       'diversity_linguisticFractionalization', 'gii_rule_of_law',
       'gii_patent_families', 'gii_creative_goods', 'gii_university_industry'],
      dtype='object')


  y = column_or_1d(y, warn=True)


In [249]:
model.coef_

array([[ 0.64250925,  0.96005721, -4.17234294,  2.90313337,  1.68583918,
         0.61572189,  1.19037247,  2.15837899]])

In [250]:
rfe.support_

array([False, False, False,  True,  True,  True,  True, False, False,
       False, False, False,  True, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False,  True,
       False])

In [255]:
predict = model.predict(X[selected_features_rfe])

In [257]:
median_absolute_error(y, predict)

2.9186681280471625

In [275]:
df_prediction = df_gii[['country_iso', 'gii_innovation_output']]

In [278]:
df_prediction.prediction = predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [280]:
df_prediction

Unnamed: 0,country_iso,gii_innovation_output,prediction
3,ALB,18.3,21.008432
7,ARE,26.7,31.228595
8,ARG,21.6,23.478069
9,ARM,28.6,20.765648
12,AUS,36.3,37.625650
...,...,...,...
261,VNM,33.9,27.027742
266,YEM,6.4,12.623476
267,ZAF,22.3,21.693349
268,ZMB,12.7,15.822434


In [282]:
median_absolute_error(df_prediction.gii_innovation_output, df_prediction.prediction)

2.9186681280471625

In [288]:
df_prediction = df_prediction.set_index('country_iso')

In [293]:
predict_rank = df_prediction.sort_values('prediction', ascending=False).index
orig_rank = df_prediction.sort_values('gii_innovation_output', ascending=False).index

In [301]:
mannwhitneyu(orig_rank, predict_rank)

MannwhitneyuResult(statistic=8320.5, pvalue=0.499667155919561)

In [302]:
df_prediction.to_csv(')

Unnamed: 0_level_0,gii_innovation_output,prediction
country_iso,Unnamed: 1_level_1,Unnamed: 2_level_1
ALB,18.3,21.008432
ARE,26.7,31.228595
ARG,21.6,23.478069
ARM,28.6,20.765648
AUS,36.3,37.625650
...,...,...
VNM,33.9,27.027742
YEM,6.4,12.623476
ZAF,22.3,21.693349
ZMB,12.7,15.822434


In [303]:
df_prediction.to_csv('linear_regression.csv')