In [1]:
import numpy as np
import pandas as pd
import sklearn as ktl
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pylab as plt

In [2]:
bol = pd.read_csv("/Users/hendrixperalta/Desktop/bolivia/data/GeoDS4Bolivia.csv")
bol = bol.rename(columns={"asdf_id":"id"})

#list(bol.columns)
sat = pd.read_csv("data/egdp/satelite_data.csv")
df = pd.merge(bol, sat, on="id", how="outer")

df["total_land2012"] = df["agr_land2012"] + df["urb_land2012"]
df["perUrb_land"] = df["urb_land2012"] / df["total_land2012"]

In [3]:
x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds']

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

model = LinearRegression()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 32.865024127297296
R^2 Score: 0.3491410136778478


# **Categorical analysis**

In [5]:
quantile_labels = ["low", 'medium-low', 'medium-high', 'high']
df["imds_quantile"] = pd.qcut(df["imds"], q=4, labels=quantile_labels)
#df["imds_quantile"]

In [6]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds_quantile']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

#clf.get_params()
clf.fit(x_train,y_train);

y_preds = clf.predict(x_test)

In [7]:
#clf.get_params()

In [8]:
#evaluate the model
print(f" Train data score: {clf.score(x_train, y_train)*100:.2f}%")
print(f" Test data score: {clf.score(x_test, y_test)*100:.2f}%")

 Train data score: 100.00%
 Test data score: 34.31%


In [9]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

        high       0.57      0.61      0.59        28
         low       0.22      0.56      0.32        16
 medium-high       0.26      0.25      0.26        24
  medium-low       0.38      0.09      0.14        34

    accuracy                           0.34       102
   macro avg       0.36      0.38      0.33       102
weighted avg       0.38      0.34      0.32       102



In [10]:
print(confusion_matrix(y_test, y_preds))

[[17  4  5  2]
 [ 1  9  6  0]
 [ 8  7  6  3]
 [ 4 21  6  3]]


In [11]:
accuracy_score(y_test, y_preds)

0.3431372549019608

In [12]:
# improve model

np.random.seed(33)
for i in range(10, 100, 10):
    print(f"Trying model with {i} stimators")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model acurracy is {clf.score(x_test, y_test)*100:.2f}%")

Trying model with 10 stimators
Model acurracy is 34.31%
Trying model with 20 stimators
Model acurracy is 36.27%
Trying model with 30 stimators
Model acurracy is 32.35%
Trying model with 40 stimators
Model acurracy is 34.31%
Trying model with 50 stimators
Model acurracy is 34.31%
Trying model with 60 stimators
Model acurracy is 33.33%
Trying model with 70 stimators
Model acurracy is 33.33%
Trying model with 80 stimators
Model acurracy is 35.29%
Trying model with 90 stimators
Model acurracy is 33.33%


In [13]:
#pickle.dump(clf, open("random_forest_model.pkl", "wb"))

#load_model = pickle.load(open("random_forest_model.pkl", "rb"))
#load_model.score(x_test, y_test)

## Iterate regression models

In [14]:
y_variables = list(df.columns)
del y_variables[195:419]
del y_variables[10:13]
del y_variables[0:8]

#y_variables

In [15]:
model_results = {}
    
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'egdp2012', 'perUrb_land']].fillna(0)

for y_variable in y_variables:
    y = df[y_variable].fillna(0)
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

    model = LinearRegression()
    model.fit(x_train,y_train)

    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_results[y_variable] = r2

res_90 = {key: value for key, value in model_results.items() if value > 0.9}
res_80 = {key: value for key, value in model_results.items() if (value > 0.8 and value < 0.90)}
#    print(f"For the variable {y_variable} Mean Squared Error:", mse)
#    print("R^2 Score:", r2)


In [16]:
res_90

{'sdg2_2_wow_abs': 0.9516215699996706,
 'sdg3_3_vih_abs': 0.9235017472347284,
 'sdg4_1_fhs_abs': 0.9566842984332434,
 'sdg4_4_heu_abs': 0.9397847049863787,
 'sdg8_4_rem_abs': 0.9061578006649388,
 'sdg8_5_ompr_abs': 0.9723028202255143,
 'sdg8_5_ofpr_abs': 0.9948774198143541,
 'sdg8_6_wlm_abs': 0.9377420841181799,
 'sdg16_6_aob_abs': 0.9291600333773501}

In [17]:
res_80

{'sdg2_2_cm_abs': 0.8450838965192334,
 'sdg3_1_udhf_ab': 0.8720049450484189,
 'sdg3_7_bpw_abs': 0.8245027533870777,
 'sdg4_1_mhs_abs': 0.8046036019823462,
 'sdg8_6_mlm_abs': 0.8747228681926947,
 'sdg8_10_dbb_abs': 0.8922257665087397,
 'sdg11_1_ho_abs': 0.8378041647789507}

In [18]:
#y_variables.index('perUrb_land')

## Getting data ready

2. handling missing values
3. converting non-num in to num variables (encoding)