In [72]:
import numpy as np
import pandas as pd
import sklearn as ktl
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn import linear_model

In [2]:
bol = pd.read_csv("/Users/hendrixperalta/Desktop/bolivia/data/GeoDS4Bolivia.csv")
bol = bol.rename(columns={"asdf_id":"id"})

#list(bol.columns)
offset = 1000000  
sat = pd.read_csv("data/egdp/satelite_data.csv")
sat["lnEGDPpc2012"] = np.log((sat["egdp2012"]/ bol["pop2012"])+offset)


df = pd.merge(bol, sat, on="id", how="outer")

df["total_land2012"] = df["agr_land2012"] + df["urb_land2012"]
df["perUrb_land"] = df["urb_land2012"] / df["total_land2012"]

# **Regression models**

### Linear model

In [71]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

model = LinearRegression()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 29.896196696846875
R^2 Score: 0.4217531583397731


### Ridge  model

In [73]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

rid_model = linear_model.Ridge()
rid_model.fit(x_train, y_train)
rid_model.score(x_test, y_test)

0.44095785688351774

# **Categorical analysis**

In [44]:
quantile_labels = ["low", 'medium-low', 'medium-high', 'high']
df["imds_quantile"] = pd.qcut(df["imds"], q=4, labels=quantile_labels)
#df["imds_quantile"]

In [45]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds_quantile']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

#clf.get_params()
clf.fit(x_train,y_train);

y_preds = clf.predict(x_test)

In [46]:
#clf.get_params()

In [47]:
#evaluate the model
print(f" Train data score: {clf.score(x_train, y_train)*100:.2f}%")
print(f" Test data score: {clf.score(x_test, y_test)*100:.2f}%")

 Train data score: 100.00%
 Test data score: 47.06%


In [48]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

        high       0.67      0.71      0.69        14
         low       0.50      0.71      0.59        17
 medium-high       0.20      0.23      0.21        13
  medium-low       0.50      0.29      0.37        24

    accuracy                           0.47        68
   macro avg       0.47      0.49      0.46        68
weighted avg       0.48      0.47      0.46        68



In [49]:
print(confusion_matrix(y_test, y_preds))

[[10  1  2  1]
 [ 0 12  3  2]
 [ 3  3  3  4]
 [ 2  8  7  7]]


In [50]:
accuracy_score(y_test, y_preds)

0.47058823529411764

In [51]:
# improve model

np.random.seed(33)
for i in range(10, 100, 10):
    print(f"Trying model with {i} stimators")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model acurracy is {clf.score(x_test, y_test)*100:.2f}%")

Trying model with 10 stimators
Model acurracy is 39.71%
Trying model with 20 stimators
Model acurracy is 41.18%
Trying model with 30 stimators
Model acurracy is 44.12%
Trying model with 40 stimators
Model acurracy is 44.12%
Trying model with 50 stimators
Model acurracy is 45.59%
Trying model with 60 stimators
Model acurracy is 47.06%
Trying model with 70 stimators
Model acurracy is 42.65%
Trying model with 80 stimators
Model acurracy is 42.65%
Trying model with 90 stimators
Model acurracy is 38.24%


In [52]:
#pickle.dump(clf, open("random_forest_model.pkl", "wb"))

#load_model = pickle.load(open("random_forest_model.pkl", "rb"))
#load_model.score(x_test, y_test)

## Iterate regression models

In [53]:
y_variables = list(df.columns)
del y_variables[195:424]
del y_variables[10:13]
del y_variables[0:8]

In [66]:
#y_variables

In [63]:
model_results = {}
    
#x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
#x = df[["ln_t400NTLpc2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
x = df[[ "ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'land_temp2012']].fillna(0)

for y_variable in y_variables:
    y = df[y_variable].fillna(0)
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

    model = LinearRegression()
    model.fit(x_train,y_train)

    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_results[y_variable] = r2

res_90 = {key: value for key, value in model_results.items() if value > 0.9}
res_80 = {key: value for key, value in model_results.items() if (value > 0.8 and value < 0.90)}
res_neg = {key: value for key, value in model_results.items() if value < 0}

#    print(f"For the variable {y_variable} Mean Squared Error:", mse)
#    print("R^2 Score:", r2)


In [64]:
res_90

{'sdg1_1_pubn_abs': 0.9146682096187452,
 'sdg2_2_wow_abs': 0.9458431981785772,
 'sdg4_1_fhs_abs': 0.9717899515091252,
 'sdg4_c_uts_abs': 0.944975205773535,
 'sdg8_5_ompr_abs': 0.9620865866563659,
 'sdg8_5_ofpr_abs': 0.9971758003758835,
 'sdg8_6_mlm_abs': 0.9095187207014701,
 'sdg8_6_wlm_abs': 0.9375242143310277,
 'sdg8_10_dbb_abs': 0.9124798028402046,
 'sdg9_5_eutf_abs': 0.922623205282282,
 'sdg11_1_ho_abs': 0.9002560342764016,
 'sdg16_6_aob_abs': 0.9516336271597378}

In [65]:
res_80

{'sdg2_2_cm_abs': 0.8528898126709923,
 'sdg3_2_fb_abs': 0.812640109718576,
 'sdg3_2_ffb_abs': 0.8092096337038548,
 'sdg3_7_bpw_abs': 0.8985835888868896,
 'sdg4_1_mhs_abs': 0.8598807471536004,
 'sdg4_c_uti_abs': 0.8458772318593852,
 'sdg7_3_tee_abs': 0.8705765674896884,
 'sdg9_c_hf_abs': 0.8448418904158453,
 'sdg9_c_tr_abs': 0.8171716168302303,
 'sdg17_5_tpi_abs': 0.8145462698027061}

In [62]:
res_neg

{'sdg2_4_td': -0.02667136590448682,
 'sdg3_3_di': -0.010382070649102904,
 'sdg3_3_imr': -0.015754645651214316,
 'sdg4_1_ssdrf': -0.16143027838705826,
 'sdg5_1_gpsd': -0.17814339873319462,
 'sdg6_2_sc': -0.07706637817898754,
 'sdg6_3_wwt': -3.287312992238123,
 'sdg9_5_eutf': -0.00011441522939326099,
 'sdg16_1_rhr': -0.19999848214355542,
 'sdg17_5_pipc': -0.017261130039780692,
 'sdg1_2_dd_abs': -0.009914493000567015,
 'sdg2_4_tm_abs': -0.19752795485258434,
 'sdg3_3_cd_abs': -0.7904091854170101,
 'sdg3_3_pd_abs': -0.3330334090440388,
 'sdg3_3_mc_abs': -0.5711461012155026,
 'sdg6_2_bsc_abs': -3.8033791902836445,
 'sdg13_2_tco2_abs': -0.2722820680480842,
 'sdg13_2_ad_abs': -0.36974000361541215,
 'sdg15_5_rl_abs': -0.14532560453378562,
 'sdg8_6_mlm_norm': -0.09238309332760886,
 'sdg9_5_eutf_norm': -0.07457952654411026,
 'sdg13_2_tco2e_norm': -0.06046627291112494,
 'sdg15_5_blr_norm': -0.03777328440006156,
 'sdg17_5_pipc_norm': -0.03496605557323651}

In [20]:
#y_variables.index('perUrb_land')

## Getting data ready

2. handling missing values
3. converting non-num in to num variables (encoding)