In [1]:
import numpy as np
import pandas as pd
import sklearn as ktl
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pylab as plt

In [2]:
bol = pd.read_csv("/Users/hendrixperalta/Desktop/bolivia/data/GeoDS4Bolivia.csv")
bol = bol.rename(columns={"asdf_id":"id"})

#list(bol.columns)
sat = pd.read_csv("data/egdp/satelite_data.csv")
df = pd.merge(bol, sat, on="id", how="outer")

df["total_land2012"] = df["agr_land2012"] + df["urb_land2012"]
df["perUrb_land"] = df["urb_land2012"] / df["total_land2012"]

In [3]:
x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds']

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

model = LinearRegression()
model.fit(x_train,y_train)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 46.78424661796731
R^2 Score: -0.1496885300424471


## Iterate regression models

In [5]:
model_results = {}
    
y_variables = list(df.columns)
del y_variables[0:8]
del y_variables[195:410]

x = df[["ln_t400NTLpc2012", "tr400_pop2012", 'egdp2012', 'perUrb_land']].fillna(0)

for y_variable in y_variables:
    y = df[y_variable].fillna(0)
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

    model = LinearRegression()
    model.fit(x_train,y_train)

    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    model_results[y_variable] = r2

#    print(f"For the variable {y_variable} Mean Squared Error:", mse)
#    print("R^2 Score:", r2)

TypeError: '>' not supported between instances of 'dict' and 'float'

In [7]:
#y_variables.index('perUrb_land')

# **Categorical analysis**

In [8]:
quantile_labels = ["low", 'medium-low', 'medium-high', 'high']
df["imds_quantile"] = pd.qcut(df["imds"], q=4, labels=quantile_labels)

In [9]:
df["imds_quantile"]

0      medium-high
1       medium-low
2       medium-low
3             high
4              low
          ...     
334           high
335     medium-low
336    medium-high
337           high
338           high
Name: imds_quantile, Length: 339, dtype: category
Categories (4, object): ['low' < 'medium-low' < 'medium-high' < 'high']

In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

x = df[["ln_t400NTLpc2012", "pop2012", 'egdp2012', 'perUrb_land']].fillna(0)
y = df['imds_quantile']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3) 

#clf.get_params()
clf.fit(x_train,y_train);

y_preds = clf.predict(x_test)

In [11]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [12]:
#evaluate the model
print(f" Train data score: {clf.score(x_train, y_train)*100:.2f}%")
print(f" Test data score: {clf.score(x_test, y_test)*100:.2f}%")

 Train data score: 100.00%
 Test data score: 41.18%


In [13]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

        high       0.40      0.67      0.50        18
         low       0.50      0.38      0.43        32
 medium-high       0.38      0.31      0.34        29
  medium-low       0.38      0.39      0.38        23

    accuracy                           0.41       102
   macro avg       0.41      0.44      0.41       102
weighted avg       0.42      0.41      0.41       102



In [14]:
print(confusion_matrix(y_test, y_preds))

[[12  0  3  3]
 [ 5 12  7  8]
 [11  5  9  4]
 [ 2  7  5  9]]


In [15]:
accuracy_score(y_test, y_preds)

0.4117647058823529

In [16]:
# improve model

np.random.seed(33)
for i in range(10, 100, 10):
    print(f"Trying model with {i} stimators")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model acurracy is {clf.score(x_test, y_test)*100:.2f}%")

Trying model with 10 stimators
Model acurracy is 43.14%
Trying model with 20 stimators
Model acurracy is 40.20%
Trying model with 30 stimators
Model acurracy is 45.10%
Trying model with 40 stimators
Model acurracy is 37.25%
Trying model with 50 stimators
Model acurracy is 43.14%
Trying model with 60 stimators
Model acurracy is 40.20%
Trying model with 70 stimators
Model acurracy is 41.18%
Trying model with 80 stimators
Model acurracy is 43.14%
Trying model with 90 stimators
Model acurracy is 39.22%


In [17]:
#pickle.dump(clf, open("random_forest_model.pkl", "wb"))

load_model = pickle.load(open("random_forest_model.pkl", "rb"))
load_model.score(x_test, y_test)

0.8235294117647058

## Getting data ready

2. handling missing values
3. converting non-num in to num variables (encoding)