In [1]:
import numpy as np
import pandas as pd
import sklearn as ktl
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
bol = pd.read_csv("/Users/hendrixperalta/Desktop/bolivia/data/GeoDS4Bolivia.csv")
bol = bol.rename(columns={"asdf_id":"id"})

#list(bol.columns)
offset = 1000000  
sat = pd.read_csv("data/egdp/satelite_data.csv")
sat["lnEGDPpc2012"] = np.log((sat["egdp2012"]/ bol["pop2012"])+offset)


df = pd.merge(bol, sat, on="id", how="outer")

df["total_land2012"] = df["agr_land2012"] + df["urb_land2012"]
df["perUrb_land"] = df["urb_land2012"] / df["total_land2012"]

In [3]:
x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

model = LinearRegression()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 24.076492608736643
R^2 Score: 0.1401438966952928


# **Categorical analysis**

In [4]:
quantile_labels = ["low", 'medium-low', 'medium-high', 'high']
df["imds_quantile"] = pd.qcut(df["imds"], q=4, labels=quantile_labels)
#df["imds_quantile"]

In [5]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds_quantile']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

#clf.get_params()
clf.fit(x_train,y_train);

y_preds = clf.predict(x_test)

In [6]:
#clf.get_params()

In [7]:
#evaluate the model
print(f" Train data score: {clf.score(x_train, y_train)*100:.2f}%")
print(f" Test data score: {clf.score(x_test, y_test)*100:.2f}%")

 Train data score: 100.00%
 Test data score: 50.00%


In [8]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

        high       0.77      0.59      0.67        17
         low       0.50      0.56      0.53        18
 medium-high       0.35      0.44      0.39        16
  medium-low       0.47      0.41      0.44        17

    accuracy                           0.50        68
   macro avg       0.52      0.50      0.50        68
weighted avg       0.52      0.50      0.51        68



In [9]:
print(confusion_matrix(y_test, y_preds))

[[10  2  3  2]
 [ 0 10  5  3]
 [ 3  3  7  3]
 [ 0  5  5  7]]


In [10]:
accuracy_score(y_test, y_preds)

0.5

In [11]:
# improve model

np.random.seed(33)
for i in range(10, 100, 10):
    print(f"Trying model with {i} stimators")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model acurracy is {clf.score(x_test, y_test)*100:.2f}%")

Trying model with 10 stimators
Model acurracy is 44.12%
Trying model with 20 stimators
Model acurracy is 50.00%
Trying model with 30 stimators
Model acurracy is 41.18%
Trying model with 40 stimators
Model acurracy is 52.94%
Trying model with 50 stimators
Model acurracy is 47.06%
Trying model with 60 stimators
Model acurracy is 47.06%
Trying model with 70 stimators
Model acurracy is 45.59%
Trying model with 80 stimators
Model acurracy is 44.12%
Trying model with 90 stimators
Model acurracy is 50.00%


In [12]:
#pickle.dump(clf, open("random_forest_model.pkl", "wb"))

#load_model = pickle.load(open("random_forest_model.pkl", "rb"))
#load_model.score(x_test, y_test)

## Iterate regression models

In [13]:
y_variables = list(df.columns)
del y_variables[195:419]
del y_variables[10:13]
del y_variables[0:8]

#y_variables

In [14]:
model_results = {}
    
x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'egdp2012', 'perUrb_land']].fillna(0)

for y_variable in y_variables:
    y = df[y_variable].fillna(0)
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2) 

    model = LinearRegression()
    model.fit(x_train,y_train)

    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_results[y_variable] = r2

res_90 = {key: value for key, value in model_results.items() if value > 0.9}
res_80 = {key: value for key, value in model_results.items() if (value > 0.8 and value < 0.90)}
res_neg = {key: value for key, value in model_results.items() if value < 0}

#    print(f"For the variable {y_variable} Mean Squared Error:", mse)
#    print("R^2 Score:", r2)


In [15]:
res_90

{'sdg2_2_wow_abs': 0.9115734961745297,
 'sdg3_3_vih_abs': 0.929942104698416,
 'sdg4_1_fhs_abs': 0.9604142376309324,
 'sdg8_4_rem_abs': 0.9214722538211707,
 'sdg8_5_ompr_abs': 0.9763477443815362,
 'sdg8_5_ofpr_abs': 0.995178073721122,
 'sdg8_6_wlm_abs': 0.9939013335618302}

In [16]:
res_80

{'sdg3_3_tc_abs': 0.8226029332166525,
 'sdg3_7_bpw_abs': 0.8792643933839768,
 'sdg4_1_mhs_abs': 0.827577398319953,
 'sdg4_4_heu_abs': 0.8540858653686862,
 'sdg7_3_tee_abs': 0.8109979199055548,
 'sdg8_6_mlm_abs': 0.893504998837979,
 'sdg8_10_dbb_abs': 0.8998795697144106,
 'sdg11_1_ho_abs': 0.8326133579008866,
 'sdg16_6_aob_abs': 0.8681232137867354}

In [17]:
res_neg

{'sdg3_2_mrc': -0.049899894990672156,
 'sdg3_3_cdir': -0.15832479061440075,
 'sdg3_3_di': -0.0013864136858574572,
 'sdg3_3_imr': -0.12485265458158978,
 'sdg4_6_lr': -0.10107636525800179,
 'sdg4_c_qti': -0.003907201443036801,
 'sdg5_1_gpyp': -0.005319087135057465,
 'sdg6_2_sc': -0.04391872744397474,
 'sdg8_6_mlm': -0.2959600725857898,
 'sdg9_5_cd': -0.4631374932985628,
 'sdg16_1_rhr': -0.029231193861946503,
 'sdg16_6_pbec': -0.06693808871989737,
 'sdg17_5_pipc': -0.007271829167390109,
 'sdg3_3_cd_abs': -0.19394789641048749,
 'sdg3_3_pd_abs': -1.8505939222962473,
 'sdg3_3_mc_abs': -1.2637737459978302,
 'sdg6_1_wdc_abs': -0.18855150174741753,
 'sdg6_2_bsc_abs': -1.887606049714889,
 'sdg3_3_di_norm': -0.08529559857165503,
 'sdg3_3_imr_norm': -0.0810221963745199,
 'sdg3_7_afr_norm': -0.13456535496493305,
 'sdg4_c_qti_norm': -0.015383748045024737,
 'sdg5_1_gpyd_norm': -0.09877393806961732,
 'sdg8_6_mlm_norm': -0.006631747528199616,
 'sdg9_5_eutf_norm': -0.06830003385119832,
 'sdg10_2_iec_nor

In [18]:
#y_variables.index('perUrb_land')

## Getting data ready

2. handling missing values
3. converting non-num in to num variables (encoding)