In [1]:
import numpy as np
import pandas as pd
import sklearn as ktl
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pylab as plt

In [2]:
bol = pd.read_csv("/Users/hendrixperalta/Desktop/bolivia/data/GeoDS4Bolivia.csv")
bol = bol.rename(columns={"asdf_id":"id"})

#list(bol.columns)
sat = pd.read_csv("data/egdp/satelite_data.csv")
df = pd.merge(bol, sat, on="id", how="outer")

df["total_land2012"] = df["agr_land2012"] + df["urb_land2012"]
df["perUrb_land"] = df["urb_land2012"] / df["total_land2012"]

In [3]:
x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds']

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

model = LinearRegression()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 27.805959177247253
R^2 Score: 0.4576267210326871


## Iterate regression models

In [5]:
y_variables = list(df.columns)
del y_variables[195:418]
del y_variables[10:13]
del y_variables[0:8]



In [20]:
#y_variables

In [7]:
model_results = {}
    

x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'egdp2012', 'perUrb_land']].fillna(0)

for y_variable in y_variables:
    y = df[y_variable].fillna(0)
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

    model = LinearRegression()
    model.fit(x_train,y_train)

    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_results[y_variable] = r2

res_90 = {key: value for key, value in model_results.items() if value > 0.9}
res
#    print(f"For the variable {y_variable} Mean Squared Error:", mse)
#    print("R^2 Score:", r2)

In [8]:
res_90


{'sdg1_1_dtl_abs': 0.951865228762999,
 'sdg2_2_wow_abs': 0.9398237052059338,
 'sdg3_1_udhf_ab': 0.9294290277330215,
 'sdg4_1_mhs_abs': 0.9928815108531338,
 'sdg4_4_heu_abs': 0.9624975551024469,
 'sdg8_5_ompr_abs': 0.9801671883481403,
 'sdg8_5_ofpr_abs': 0.9966704122883959,
 'sdg8_6_mlm_abs': 0.9772321302304099,
 'sdg8_6_wlm_abs': 0.9853541371092804,
 'sdg8_10_dbb_abs': 0.923882000060269,
 'sdg11_1_ho_abs': 0.9375358194655883,
 'sdg16_6_aob_abs': 0.9537581464230923,
 'sdg17_5_tpi_abs': 0.9604753066648657}

In [9]:
#y_variables.index('perUrb_land')

# **Categorical analysis**

In [10]:
quantile_labels = ["low", 'medium-low', 'medium-high', 'high']
df["imds_quantile"] = pd.qcut(df["imds"], q=4, labels=quantile_labels)

In [11]:
df["imds_quantile"]

0      medium-high
1       medium-low
2       medium-low
3             high
4              low
          ...     
334           high
335     medium-low
336    medium-high
337           high
338           high
Name: imds_quantile, Length: 339, dtype: category
Categories (4, object): ['low' < 'medium-low' < 'medium-high' < 'high']

In [12]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds_quantile']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

#clf.get_params()
clf.fit(x_train,y_train);

y_preds = clf.predict(x_test)

In [13]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [14]:
#evaluate the model
print(f" Train data score: {clf.score(x_train, y_train)*100:.2f}%")
print(f" Test data score: {clf.score(x_test, y_test)*100:.2f}%")

 Train data score: 100.00%
 Test data score: 38.24%


In [15]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

        high       0.52      0.52      0.52        23
         low       0.42      0.65      0.52        26
 medium-high       0.32      0.27      0.29        26
  medium-low       0.18      0.11      0.14        27

    accuracy                           0.38       102
   macro avg       0.36      0.39      0.37       102
weighted avg       0.35      0.38      0.36       102



In [16]:
print(confusion_matrix(y_test, y_preds))

[[12  3  6  2]
 [ 2 17  1  6]
 [ 6  7  7  6]
 [ 3 13  8  3]]


In [17]:
accuracy_score(y_test, y_preds)

0.38235294117647056

In [18]:
# improve model

np.random.seed(33)
for i in range(10, 100, 10):
    print(f"Trying model with {i} stimators")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model acurracy is {clf.score(x_test, y_test)*100:.2f}%")

Trying model with 10 stimators
Model acurracy is 40.20%
Trying model with 20 stimators
Model acurracy is 40.20%
Trying model with 30 stimators
Model acurracy is 33.33%
Trying model with 40 stimators
Model acurracy is 41.18%
Trying model with 50 stimators
Model acurracy is 34.31%
Trying model with 60 stimators
Model acurracy is 35.29%
Trying model with 70 stimators
Model acurracy is 36.27%
Trying model with 80 stimators
Model acurracy is 36.27%
Trying model with 90 stimators
Model acurracy is 40.20%


In [19]:
#pickle.dump(clf, open("random_forest_model.pkl", "wb"))

load_model = pickle.load(open("random_forest_model.pkl", "rb"))
load_model.score(x_test, y_test)

0.7843137254901961

## Getting data ready

2. handling missing values
3. converting non-num in to num variables (encoding)