In [32]:
import numpy as np
import pandas as pd
import sklearn as ktl
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
bol = pd.read_csv("/Users/hendrixperalta/Desktop/bolivia/data/GeoDS4Bolivia.csv")
bol = bol.rename(columns={"asdf_id":"id"})

#list(bol.columns)
sat = pd.read_csv("data/egdp/satelite_data.csv")
df = pd.merge(bol, sat, on="id", how="outer")

df["total_land2012"] = df["agr_land2012"] + df["urb_land2012"]
df["perUrb_land"] = df["urb_land2012"] / df["total_land2012"]

In [3]:
x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds']

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

model = LinearRegression()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 31.501944144931485
R^2 Score: 0.23074984778316943


**categorical analysis**

In [5]:
quantile_labels = ["low", 'medium-low', 'medium-high', 'high']
df["imds_quantile"] = pd.qcut(df["imds"], q=4, labels=quantile_labels)

In [6]:
df["imds_quantile"]

0      medium-high
1       medium-low
2       medium-low
3             high
4              low
          ...     
334           high
335     medium-low
336    medium-high
337           high
338           high
Name: imds_quantile, Length: 339, dtype: category
Categories (4, object): ['low' < 'medium-low' < 'medium-high' < 'high']

In [13]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds_quantile']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

#clf.get_params()
clf.fit(x_train,y_train);

y_preds = clf.predict(x_test)

In [23]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [31]:
#evaluate the model
print(f" Train data score: {clf.score(x_train, y_train)*100:.2f}%")
print(f" Test data score: {clf.score(x_test, y_test)*100:.2f}%")

 Train data score: 100.00%
 Test data score: 48.04%


In [17]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

        high       0.70      0.66      0.68        29
         low       0.40      0.36      0.38        28
 medium-high       0.42      0.46      0.44        24
  medium-low       0.29      0.33      0.31        21

    accuracy                           0.46       102
   macro avg       0.45      0.45      0.45       102
weighted avg       0.47      0.46      0.46       102



In [33]:
print(confusion_matrix(y_test, y_preds))

[[19  1  6  3]
 [ 2 10  5 11]
 [ 4  6 11  3]
 [ 2  8  4  7]]


In [34]:
accuracy_score(y_test, y_preds)

0.46078431372549017

In [24]:
# improve model

np.random.seed(33)
for i in range(10, 100, 10):
    print(f"Trying model with {i} stimators")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model acurracy is {clf.score(x_test, y_test)*100:.2f}%")

Trying model with 10 stimators
Model acurracy is 47.06%
Trying model with 20 stimators
Model acurracy is 45.10%
Trying model with 30 stimators
Model acurracy is 48.04%
Trying model with 40 stimators
Model acurracy is 49.02%
Trying model with 50 stimators
Model acurracy is 49.02%
Trying model with 60 stimators
Model acurracy is 47.06%
Trying model with 70 stimators
Model acurracy is 48.04%
Trying model with 80 stimators
Model acurracy is 47.06%
Trying model with 90 stimators
Model acurracy is 48.04%


In [29]:
pickle.dump(clf, open("random_forest_model.pkl", "wb"))

load_model = pickle.load(open("random_forest_model.pkl", "rb"))
load_model.score(x_test, y_test)

0.4803921568627451