In [1]:
import pandas as pd
import sklearn as sfs
import matplotlib.pyplot as plt
import numpy as np
import sys
sys.path.append('..')
from data.dataset import DATASET as dt

In [2]:
dt.head()

Unnamed: 0,city,latitude,longitude,population,country_code,c40,risk0,risk1,risk2,risk3,...,SDG 6.4.1. Services Water Use Efficiency,SDG 6.4.1. Water Use Efficiency,SDG 6.4.2. Water Stress,Seasonal variability (WRI),Total internal renewable water resources per capita,Total population with access to safe drinking-water (JMP),Total renewable water resources per capita,Total water withdrawal per capita,Urban population with access to safe drinking-water (JMP),country
0,Aalborg,57.0337,9.9166,122219.0,DNK,False,,,,,...,558.335628,368.612902,20.040562,1.3,1046.705025,100.0,1046.705025,129.285516,100.0,Denmark
1,Aarhus,56.1572,10.2107,237551.0,DNK,False,,,,,...,558.335628,368.612902,20.040562,1.3,1046.705025,100.0,1046.705025,129.285516,100.0,Denmark
2,Copenhagen,55.6786,12.5635,1085000.0,DNK,False,,2.0,,2.0,...,558.335628,368.612902,20.040562,1.3,1046.705025,100.0,1046.705025,129.285516,100.0,Denmark
3,Esbjerg,55.467,8.45,72205.0,DNK,False,,,,,...,558.335628,368.612902,20.040562,1.3,1046.705025,100.0,1046.705025,129.285516,100.0,Denmark
4,Frederikshavn,57.4337,10.5333,24103.0,DNK,False,,2.0,,,...,558.335628,368.612902,20.040562,1.3,1046.705025,100.0,1046.705025,129.285516,100.0,Denmark


# Dataset splitting

In [3]:
import random

def data_splitting(dt, risk, seed=None):
    """Split ``dt`` into labelled and unlabelled rows for one risk column.

    Rows whose ``risk`` value is present form the training set; every other
    row (i.e. the rows whose label is missing) forms the "test" set.

    Parameters
    ----------
    dt : pandas.DataFrame
        Full dataset. Every column whose name contains ``"risk"`` is treated
        as a label column and is removed from the returned features.
    risk : str
        Name of the label column to split on.
    seed : int, optional
        Seed for the placeholder test labels. With the default ``None`` the
        module-level ``random`` generator is used, exactly as before.

    Returns
    -------
    train : pandas.DataFrame
        Numeric/boolean feature columns of the labelled rows.
    test : pandas.DataFrame
        The same feature columns for the rows without a label.
    y_train : pandas.Series
        Values of ``risk`` for the labelled rows.
    y_test : list of int
        Random integers in ``range(4)``, one per test row. These are
        placeholders, NOT ground truth: a score computed against them does
        not measure model quality.
    """
    labelled = dt[risk].notnull()
    y_train = dt.loc[labelled, risk]  # response variable

    # Features are all non-risk columns, in their original order.
    risk_cols = dt.filter(like='risk').columns
    candidate_cols = dt.columns.difference(risk_cols, sort=False)

    # Keep only numeric/boolean columns: the remaining ones are descriptive.
    # "bool" is listed explicitly so the selection matches what the private
    # `_get_numeric_data` helper used to return.
    feature_cols = dt[candidate_cols].select_dtypes(include=["number", "bool"]).columns
    # "c40" is a descriptive flag as well; tolerate frames that do not have it.
    feature_cols = feature_cols.drop("c40", errors="ignore")

    train = dt.loc[labelled, feature_cols]
    # Test set: every row that did not end up in the training set.
    test = dt.loc[~dt.index.isin(train.index), feature_cols]

    rng = random if seed is None else random.Random(seed)
    y_test = [rng.randrange(4) for _ in range(len(test))]
    return train, test, y_train, y_test

# Feature selection

## LASSO

In [4]:
from sklearn.linear_model import LassoCV

def lasso(X_train, y):
    """Select features with a cross-validated LASSO fit.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; its column names label the returned coefficients.
    y : array-like
        Response variable aligned with the rows of ``X_train``.

    Returns
    -------
    pandas.Series
        The non-zero LASSO coefficients, indexed by feature name. Empty when
        the fit shrinks every coefficient to zero.
    """
    # Fit on the argument, not on whatever `train` happens to be in the
    # notebook's global namespace.
    fitted = LassoCV().fit(X_train, y)
    coef = pd.Series(fitted.coef_, index=X_train.columns)
    return coef[coef != 0]

In [5]:
# Run the LASSO selection once per risk column and print the coefficients
# that survive the shrinkage.
risks = dt.filter(like='risk').columns.tolist()
for risk in risks:
    train, test, y_train, y_test = data_splitting(dt, risk)
    selected = lasso(train, y_train)
    print(selected)

population   -1.616826e-08
dtype: float64
population   -8.298024e-09
dtype: float64
Series([], dtype: float64)
Series([], dtype: float64)
population   -4.960183e-08
dtype: float64
population   -2.300692e-08
dtype: float64
Series([], dtype: float64)


To predict `risk0`, `risk1`, `risk4` and `risk5`, the LASSO method selects only `population` as an important regressor. For the other risks LASSO retains no feature at all, so they cannot be predicted accurately on the basis of the given features.

## Boosting

In [6]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV, KFold
# Default-configured regressor; boosting_reg passes it to GridSearchCV as the estimator to tune.
model = xgb.XGBRegressor()

### Feature selection
Gradient-boosting regression, with a cross-validated grid search to tune the hyperparameters.

In [7]:
def boosting_reg(train, y_train, risk, accuracy_scores):
    """Tune and fit a boosted regressor, then rank its features.

    A 10-fold grid search over the module-level ``model`` picks the
    hyperparameters (which are printed); a fresh ``XGBRegressor`` is then
    fitted with them on the whole training set.

    Parameters
    ----------
    train : pandas.DataFrame
        Training features; column names are used to label the importances.
    y_train : array-like
        Training response.
    risk : str
        Key under which the score is stored in ``accuracy_scores``.
    accuracy_scores : dict
        Updated in place: ``accuracy_scores[risk]`` becomes
        ``[<score on the training data>, 0]``.

    Returns
    -------
    tuple
        ``(fitted regressor, up to 10 feature names with positive importance
        in decreasing order of importance, accuracy_scores)``.
    """
    search_space = {
        "colsample_bytree": [0.1, 0.5, 1.0],
        "min_child_weight": [1.0, 1.2],
        'max_depth': [7, 9],
        'n_estimators': [500],
        "alpha": [10, 12, 15],
    }
    reg_cv = GridSearchCV(model, cv=KFold(n_splits=10), param_grid=search_space)
    reg_cv.fit(train, y_train)
    print(reg_cv.best_params_)

    # Refit a fresh regressor with the winning hyperparameters.
    gbm = xgb.XGBRegressor(**reg_cv.best_params_)
    gbm.fit(train, y_train)
    # Second slot is a placeholder that boosting_clas overwrites later.
    accuracy_scores[risk] = [gbm.score(train, y_train), 0]

    importances = gbm.feature_importances_
    ranked = [
        train.columns[position]
        for position in np.argsort(importances)[::-1]
        if importances[position] > 0
    ]
    return gbm, ranked[:10], accuracy_scores

### Classification

In [8]:
def boosting_clas(gbm, test, y_test, risk, accuracy_scores):
    """Score a fitted booster on the test rows and predict their risk.

    Parameters
    ----------
    gbm : fitted estimator
        Must provide ``predict(X)`` and ``score(X, y)``.
    test : pandas.DataFrame
        Feature rows to predict.
    y_test : array-like
        Labels that ``gbm.score`` is evaluated against.
    risk : str
        Key of the entry in ``accuracy_scores`` to update.
    accuracy_scores : dict
        Updated in place: slot 1 of ``accuracy_scores[risk]`` receives the
        test score. A missing entry is created as ``[None, <test score>]``.

    Returns
    -------
    tuple
        ``(predictions, accuracy_scores)``.
    """
    # Plain predict() keeps predictions consistent with score(), which also
    # calls predict() with default arguments. The previous
    # iteration_range=(0, gbm.best_iteration) has an exclusive upper bound, so
    # it silently dropped the last tree, and it relied on `best_iteration`,
    # which is not guaranteed to exist when no early stopping was used.
    # NOTE(review): per the xgboost docs the scikit-learn wrapper picks the
    # best iteration on its own when early stopping is enabled - confirm for
    # the installed version.
    predictions = gbm.predict(test)
    accuracy_scores.setdefault(risk, [None, None])[1] = gbm.score(test, y_test)
    return predictions, accuracy_scores

### Generation of a reduced dataset filled with predictions

In [9]:
# For every risk column: fit a boosted regressor on the labelled rows, collect
# its top-ranked features, and fill the unlabelled rows with rounded predictions.
risks = list(dt.filter(like='risk').columns)
accuracy_scores = dict()  # risk -> [score on training rows, score on test rows]
best_predictors = set()  # union of the top features over all risks
filled_dataset = dt.copy()  # work on a copy so `dt` keeps its missing labels
for risk in risks:
    train, test, y_train, y_test = data_splitting(dt,risk)
    gbm, features, accuracy_scores = boosting_reg(train, y_train, risk, accuracy_scores)
    best_predictors.update(features)
    # NOTE(review): y_test comes from data_splitting, which fills it with random
    # integers, so the test-side score stored here is not a real quality measure.
    predictions, accuracy_scores = boosting_clas(gbm, test, y_test, risk, accuracy_scores)
    # Boolean mask over dt's rows selecting exactly the rows that were predicted.
    test_index = dt.index.isin(test.index)
    filled_dataset.loc[test_index, risk] = predictions.round()

{'alpha': 15, 'colsample_bytree': 0.1, 'max_depth': 9, 'min_child_weight': 1.0, 'n_estimators': 500}
{'alpha': 12, 'colsample_bytree': 0.1, 'max_depth': 9, 'min_child_weight': 1.2, 'n_estimators': 500}
{'alpha': 15, 'colsample_bytree': 0.1, 'max_depth': 7, 'min_child_weight': 1.0, 'n_estimators': 500}
{'alpha': 10, 'colsample_bytree': 0.1, 'max_depth': 9, 'min_child_weight': 1.0, 'n_estimators': 500}
{'alpha': 10, 'colsample_bytree': 1.0, 'max_depth': 9, 'min_child_weight': 1.2, 'n_estimators': 500}
{'alpha': 15, 'colsample_bytree': 0.1, 'max_depth': 7, 'min_child_weight': 1.0, 'n_estimators': 500}
{'alpha': 10, 'colsample_bytree': 1.0, 'max_depth': 7, 'min_child_weight': 1.0, 'n_estimators': 500}


In [10]:
# xgb.plot_importance(xg_reg, max_num_features = 15)
# plt.show()

In [20]:
fname = "boost_model.json"
# gbm.load_model(fname)
# gbm.save_model(fname)

In [21]:
best_predictors

{'Dependency ratio',
 'Exports and imports (% of GDP)',
 'Foreign direct investment, net inflows (% of GDP)',
 'Gross enrolment ratio, lower secondary, female (%)',
 'Gross enrolment ratio, lower secondary, male (%)',
 'Gross enrolment ratio, pre-primary, both sexes (%)',
 'Gross enrolment ratio, pre-primary, male (%)',
 'Gross enrolment ratio, primary, gender parity index (GPI)',
 'Gross enrolment ratio, secondary, gender parity index (GPI)',
 'Gross enrolment ratio, upper secondary, both sexes (%)',
 'Gross intake ratio to Grade 1 of primary education, female (%)',
 'Industrial water withdrawal as % of total water withdrawal',
 'Inequality-adjusted education index',
 'Interannual variability (WRI)',
 'Internet users (per 100 people)',
 'Labor force, female (% of total labor force)',
 'Life expectancy at birth, female (years)',
 'MDG 7.5. Freshwater withdrawal as % of total renewable water resources',
 'Mortality rate, under-5 (per 1,000 live births)',
 'Official entrance age to lower

In [22]:
accuracy_scores

{'risk0': [0.18915455667098258, -0.4752705173644811],
 'risk1': [0.260631555092647, -0.12187337229118933],
 'risk2': [0.18236664235533662, -0.24174019440342476],
 'risk3': [0.4022502318732596, -0.16045538395397685],
 'risk4': [0.45794451853249785, -0.23728466726676234],
 'risk5': [-9.104151493066759e-05, -0.7872117327277512],
 'risk6': [0.11572841802066092, -0.9599352484991102]}

In [23]:
# Keep the identifying columns plus every predictor collected in
# `best_predictors`; all other columns are dropped.
get_back = ["city", "country"]
to_drop = set(dt.columns).difference(best_predictors, get_back)
reduced_dataset = dt[dt.columns.drop(to_drop)]

In [24]:
reduced_dataset.head()

Unnamed: 0,city,latitude,longitude,population,Population with at least some secondary education (% ages 25 and older),Total population (millions),Sex ratio at birth (male to female births),"Foreign direct investment, net inflows (% of GDP)",Population ages 15?64 (millions),Inequality-adjusted education index,...,"Prevalence of HIV, total (% of population ages 15-49)","Primary completion rate, both sexes (%)","Primary completion rate, female (%)","Unemployment, male (% of male labor force) (modeled ILO estimate)",Dependency ratio,Industrial water withdrawal as % of total water withdrawal,Interannual variability (WRI),MDG 7.5. Freshwater withdrawal as % of total renewable water resources,SDG 6.4.2. Water Stress,country
0,Aalborg,57.0337,9.9166,122219.0,90.949,5.772,1.06,-1.859,3.673,0.894,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
1,Aarhus,56.1572,10.2107,237551.0,90.949,5.772,1.06,-1.859,3.673,0.894,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
2,Copenhagen,55.6786,12.5635,1085000.0,90.949,5.772,1.06,-1.859,3.673,0.894,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
3,Esbjerg,55.467,8.45,72205.0,90.949,5.772,1.06,-1.859,3.673,0.894,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
4,Frederikshavn,57.4337,10.5333,24103.0,90.949,5.772,1.06,-1.859,3.673,0.894,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark


In [25]:
# reduced_dataset.to_csv('reduced_dataset.csv',index=False)

In [26]:
# Apply the same column reduction to the filled dataset, but take the risk
# columns off the drop list so they are kept.
to_drop = to_drop.difference(dt.filter(like='risk').columns)
filled_red_dataset = filled_dataset[dt.columns.drop(to_drop)]

In [27]:
# filled_red_dataset.to_csv('filled_red_dataset.csv',index=False)

In [28]:
filled_red_dataset.head()

Unnamed: 0,city,latitude,longitude,population,risk0,risk1,risk2,risk3,risk4,risk5,...,"Prevalence of HIV, total (% of population ages 15-49)","Primary completion rate, both sexes (%)","Primary completion rate, female (%)","Unemployment, male (% of male labor force) (modeled ILO estimate)",Dependency ratio,Industrial water withdrawal as % of total water withdrawal,Interannual variability (WRI),MDG 7.5. Freshwater withdrawal as % of total renewable water resources,SDG 6.4.2. Water Stress,country
0,Aalborg,57.0337,9.9166,122219.0,0.0,2.0,2.0,2.0,0.0,0.0,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
1,Aarhus,56.1572,10.2107,237551.0,0.0,2.0,2.0,2.0,0.0,0.0,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
2,Copenhagen,55.6786,12.5635,1085000.0,0.0,2.0,2.0,2.0,0.0,0.0,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
3,Esbjerg,55.467,8.45,72205.0,0.0,2.0,2.0,2.0,0.0,0.0,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
4,Frederikshavn,57.4337,10.5333,24103.0,0.0,2.0,2.0,2.0,0.0,0.0,...,-1.764496,98.799316,98.29921,5.9,0.0,4.439347,1.0,12.351667,20.040562,Denmark
