In [1]:
import numpy as np
import pandas as pd
import sklearn as ktl
import pickle
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB

In [2]:
bol = pd.read_csv("/Users/hendrixperalta/Desktop/bolivia/data/GeoDS4Bolivia.csv")
bol = bol.rename(columns={"asdf_id":"id"})

#list(bol.columns)
offset = 1000000  
sat = pd.read_csv("data/egdp/satelite_data.csv")
sat["lnEGDPpc2012"] = np.log((sat["egdp2012"]/ bol["pop2012"])+offset)


df = pd.merge(bol, sat, on="id", how="outer")

df["total_land2012"] = df["agr_land2012"] + df["urb_land2012"]
df["perUrb_land"] = df["urb_land2012"] / df["total_land2012"]

# **Regression models**

### Linear model

In [3]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

model = LinearRegression()
model.fit(x_train,y_train)
lr = model.score(x_test, y_test)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

#print("Mean Squared Error:", mse)
#print("R^2 Score:", r2)

### Ridge  model

In [4]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

rid_model = linear_model.Ridge()
rid_model.fit(x_train, y_train)
ri = rid_model.score(x_test, y_test)

### Random Forest Regressor model

In [5]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

rf_model = RandomForestRegressor()
rf_model.fit(x_train, y_train)
rf = rf_model.score(x_test, y_test)

### Results 

In [6]:
print("R2 for the regressor models")
print(f" Linear model: {lr*100:.2f}%")
print(f" Ridger model: {ri*100:.2f}%")
print(f" Random Forest model: {rf*100:.2f}%")

R2 for the regressor models
 Linear model: 42.18%
 Ridger model: 44.10%
 Random Forest model: 41.86%


# **Categorical analysis**

In [7]:
quantile_labels = ["low", 'medium-low', 'medium-high', 'high']
df["imds_quantile"] = pd.qcut(df["imds"], q=4, labels=quantile_labels)
#df["imds_quantile"]

In [8]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds_quantile']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

clf = RandomForestClassifier()
#clf.get_params()
clf.fit(x_train,y_train);

y_preds = clf.predict(x_test)

#evaluate the model
#print(f" Train data score: {clf.score(x_train, y_train)*100:.2f}%")
print(f" Test data score: {clf.score(x_test, y_test)*100:.2f}%")
rf_cl = clf.score(x_test, y_test)

 Test data score: 45.59%


In [9]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

        high       0.57      0.59      0.58        22
         low       0.41      0.50      0.45        14
 medium-high       0.36      0.33      0.34        15
  medium-low       0.43      0.35      0.39        17

    accuracy                           0.46        68
   macro avg       0.44      0.44      0.44        68
weighted avg       0.45      0.46      0.45        68



In [10]:
print(confusion_matrix(y_test, y_preds))

[[13  2  5  2]
 [ 1  7  3  3]
 [ 5  2  5  3]
 [ 4  6  1  6]]


In [11]:
accuracy_score(y_test, y_preds)

0.45588235294117646

In [12]:
# improve model

np.random.seed(33)
for i in range(10, 100, 10):
    print(f"Trying model with {i} stimators")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model acurracy is {clf.score(x_test, y_test)*100:.2f}%")

Trying model with 10 stimators
Model acurracy is 41.18%
Trying model with 20 stimators
Model acurracy is 39.71%
Trying model with 30 stimators
Model acurracy is 41.18%
Trying model with 40 stimators
Model acurracy is 41.18%
Trying model with 50 stimators
Model acurracy is 41.18%
Trying model with 60 stimators
Model acurracy is 42.65%
Trying model with 70 stimators
Model acurracy is 45.59%
Trying model with 80 stimators
Model acurracy is 39.71%
Trying model with 90 stimators
Model acurracy is 39.71%


In [13]:
#pickle.dump(clf, open("random_forest_model.pkl", "wb"))

#load_model = pickle.load(open("random_forest_model.pkl", "rb"))
#load_model.score(x_test, y_test)

### Linear SVC Classifier

In [14]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds_quantile']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

sgd_cla = svm.LinearSVC(dual="auto", max_iter=10000)
sgd_cla.fit(x_train, y_train)
sgd_cl = sgd_cla.score(x_test, y_test)

### Naive Bayes Classifier

In [15]:
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'pm25_2012', 'land_temp2012']].fillna(0)
y = df['imds_quantile']

np.random.seed(15)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

naive_cla = GaussianNB()
naive_cla.fit(x_train, y_train)
naive_cl = naive_cla.score(x_test, y_test)

### Results 

In [16]:
print("R2 for the classifier models")
print(f" Random Forest model: {rf_cl*100:.2f}%")
print(f" Linear SVC model: {sgd_cl*100:.2f}%")
print(f" Naive Bayes model: {naive_cl*100:.2f}%")

R2 for the classifier models
 Random Forest model: 45.59%
 Linear SVC model: 41.18%
 Naive Bayes model: 30.88%


## Iterate regression models

In [17]:
y_variables = list(df.columns)
del y_variables[195:424]
del y_variables[10:13]
del y_variables[0:8]

In [18]:
sdy_variables

NameError: name 'sdy_variables' is not defined

In [22]:
xs = [ "ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'land_temp2012']
for x in xs:
    df.fillna({x:0}, inplace=True)

In [43]:
model_results = {}

for y_variable in y_variables:
    filtered_df = df.dropna()

    x = filtered_df[[ "ln_t400NTLpc2012", "tr400_pop2012", 'lnEGDPpc2012', 'perUrb_land', 'land_temp2012']]
    y = filtered_df[y_variable]

    np.random.seed(15)

    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

    model = LinearRegression()
    model.fit(x_train,y_train)

    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_results[y_variable] = r2

res_90 = {key: value for key, value in model_results.items() if value > 0.9}
res_80 = {key: value for key, value in model_results.items() if (value > 0.8 and value < 0.9)}
res_70 = {key: value for key, value in model_results.items() if (value > 0.7 and value < 0.8)}

res_neg = {key: value for key, value in model_results.items() if value < 0}

#    print(f"For the variable {y_variable} Mean Squared Error:", mse)
#    print("R^2 Score:", r2)


In [44]:
res_90

{'sdg8_5_ompr_abs': 0.9341185593360726, 'sdg8_5_ofpr_abs': 0.9519549494760443}

In [45]:
res_80

{'sdg2_2_wow_abs': 0.8628762314870526,
 'sdg3_7_bpw_abs': 0.8112471053554561,
 'sdg4_1_mhs_abs': 0.8111023629414181,
 'sdg7_1_rec_abs': 0.8113053574601856,
 'sdg8_6_wlm_abs': 0.8947805687515471,
 'sdg9_c_tr_abs': 0.8119604879077338,
 'sdg11_1_ho_abs': 0.862092369505172,
 'sdg16_6_aob_abs': 0.8077023566105892}

In [46]:
res_70

{'sdg8_11_idi': 0.751441455051361,
 'sdg1_1_pubn_abs': 0.7654600460488963,
 'sdg3_2_fb_abs': 0.7617195025751081,
 'sdg3_2_ffb_abs': 0.7452169894765497,
 'sdg4_1_fhs_abs': 0.7357615851888066,
 'sdg4_c_uts_abs': 0.7456802513545902,
 'sdg8_6_mlm_abs': 0.704232058310163,
 'sdg17_5_tpi_abs': 0.7108509475228498,
 'sdg8_11_idi_norm': 0.7471981883618837}

In [47]:
res_neg

{'sdg3_3_cdir': -0.06148009131876453,
 'sdg3_3_di': -2.1739513773862713,
 'sdg3_3_imr': -1.1027612470542594,
 'sdg8_10_dbb': -0.013580210832742079,
 'sdg9_5_eutf': -0.06579332152606177,
 'sdg3_3_cd_abs': -3.553670745310866,
 'sdg3_3_pd_abs': -25.35935632078406,
 'sdg3_3_mc_abs': -42.40128390172487,
 'sdg4_4_heu_abs': -0.6996225782879184,
 'sdg6_1_wdc_abs': -0.1193598531096065,
 'sdg9_5_kcd_abs': -0.32280381986038376,
 'sdg3_3_cdir_norm': -0.12838879472530307,
 'sdg3_3_imr_norm': -0.6669060020043927,
 'sdg8_10_dbb_norm': -0.012441869920378101,
 'sdg9_5_cd_norm': -0.00640983934276429,
 'sdg9_5_eutf_norm': -0.06097259686322554,
 'index_sdg4': -0.06072464970814706}

In [None]:
#y_variables.index('perUrb_land')

## Getting data ready

2. handling missing values
3. converting non-num in to num variables (encoding)