In [23]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import altair as alt
alt.renderers.enable('notebook')

from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

from statsmodels.tools import eval_measures

In [14]:
# Load the poverty dataset from the project's data directory.
# NOTE(review): the file name suggests categorical columns are already
# dummy-encoded — confirm against the preprocessing step that wrote it.
poverty_csv_path = "poverty/src/data/poverty_data_with_dummy.csv"
poverty_data = pd.read_csv(poverty_csv_path)

In [15]:
# Drop the "idhogar" and "Id" columns in a single call; everything that is left
# is used as model input or as the "Target" label.
# NOTE(review): the variable name implies these are the only string-typed
# columns (presumably identifiers) — confirm.
poverty_data_no_string = poverty_data.drop(columns=["idhogar", "Id"])

In [16]:
# Separate the predictors from the "Target" label, then hold out 30% of the
# rows for testing. The fixed random_state keeps the split identical across runs.
model_inputs = poverty_data_no_string.drop(columns="Target")
model_target = poverty_data_no_string["Target"]

train_features, test_features, train_outcome, test_outcome = train_test_split(
    model_inputs,
    model_target,
    test_size=0.30,
    random_state=11,
)

In [17]:
# Feature-selection steps shared by every model pipeline in this notebook:
# a variance filter with threshold 0.1 and a percentile-based selector whose
# `percentile` is tuned by the grid searches.
threshold = VarianceThreshold(threshold=.1)
selecter = SelectPercentile()

# 10-fold, shuffled cross-validation splitter; seeded so every grid search
# evaluates candidates on the same folds.
folds = KFold(n_splits=10, random_state=11, shuffle=True)

## Naive Bayes 

In [18]:
# Naive Bayes pipeline: polynomial expansion -> variance filter -> min-max
# scaling -> percentile feature selection -> multinomial Naive Bayes classifier.
nb_poly = PolynomialFeatures()
nb_scaler = MinMaxScaler()
nb_clf = MultinomialNB()

nb_steps = (nb_poly, threshold, nb_scaler, selecter, nb_clf)
nb_pipe = make_pipeline(*nb_steps)

# Hyper-parameters to search: polynomial degree (1, 2) and the percentage of
# features kept by the selector (10, 15, 20, 25).
nb_param_grid = {
    'polynomialfeatures__degree': range(1, 3),
    'selectpercentile__percentile': range(10, 30, 5),
}

# NOTE(review): no `scoring` is passed here, whereas the random forest and
# AdaBoost searches in this notebook pass scoring="neg_mean_absolute_error".
# Candidates are therefore presumably ranked by the classifier's default
# score rather than by MAE — confirm this is intended.
nb_grid_search = GridSearchCV(estimator=nb_pipe, param_grid=nb_param_grid, cv=folds)

# Run the cross-validated search on the training split only.
nb_grid_search_fitted = nb_grid_search.fit(train_features, train_outcome)

In [19]:
# Display the hyper-parameter combination selected by the Naive Bayes grid search.
nb_grid_search_fitted.best_params_

{'polynomialfeatures__degree': 1, 'selectpercentile__percentile': 25}

In [20]:
# Evaluate the tuned Naive Bayes model on the held-out test split:
# predict, cast the predicted labels to int, then compute the mean absolute
# error against the true outcomes.
nb_test_predictions = nb_grid_search_fitted.predict(test_features).astype(int)
nb_score = eval_measures.meanabs(nb_test_predictions, test_outcome)

print("Mean absolute error: " + str(nb_score))

Mean absolute error: 0.6144452198185625


## Random Forest

In [25]:
# Random forest regression pipeline: polynomial expansion -> variance filter ->
# min-max scaling -> percentile feature selection -> random forest regressor.
# The forest is seeded so repeated runs give the same model.
rf_scaler = MinMaxScaler()
rf_reg = RandomForestRegressor(random_state = 42)
rf_poly = PolynomialFeatures()
rf_pipe = make_pipeline(rf_poly, threshold, rf_scaler, selecter, rf_reg)


# Hyper-parameters to search: polynomial degree (1, 2), number of trees
# (95..104) and the percentage of features kept (10, 15, 20, 25).
rf_param_grid = {'polynomialfeatures__degree':range(1, 3),
                 'randomforestregressor__n_estimators':range(95, 105),
                 'selectpercentile__percentile':range(10, 30, 5)}

# The grid has 2 x 10 x 4 = 80 candidates, each cross-validated on 10 folds
# (800 pipeline fits). Run serially this was slow enough that the cell was
# interrupted, so the fits are spread over all available cores with n_jobs=-1.
# The selected parameters are unaffected because the folds and the forest
# are both seeded.
rf_grid = GridSearchCV(
    rf_pipe,
    rf_param_grid,
    scoring="neg_mean_absolute_error",
    cv=folds,
    n_jobs=-1,
)
rf_fit = rf_grid.fit(train_features, train_outcome)

KeyboardInterrupt: 

In [None]:
# Display the hyper-parameter combination selected by the random forest grid search.
rf_fit.best_params_

In [None]:
# Evaluate the tuned random forest on the held-out test split using the
# mean absolute error between predictions and true outcomes.
rf_test_predictions = rf_fit.predict(test_features)
score = eval_measures.meanabs(rf_test_predictions, test_outcome)

print("Mean Absolute Error: " + str(score))

## AdaBoost

In [None]:
# define an adaboost pipeline
ada_scaler = MinMaxScaler()
ada_reg = AdaBoostRegressor()
ada_poly = PolynomialFeatures()

ada_pipe = make_pipeline(ada_poly, threshold, ada_scaler, selecter, ada_reg)

# define an adaboost grid for hyper tuning
ada_param_grid = {'polynomialfeatures__degree':range(1, 3), 
                  'adaboostregressor__learning_rate': [x / 5 for x in range(1, 5)],
                  'selectpercentile__percentile':range(10, 30, 5)}

ada_grid = GridSearchCV(ada_pipe, ada_param_grid, scoring="neg_mean_absolute_error", cv=folds)
ada_fit = ada_grid.fit(train_features, train_outcome)

In [None]:
# Display the hyper-parameter combination selected by the AdaBoost grid search.
ada_fit.best_params_

In [None]:
# Evaluate the tuned AdaBoost model on the held-out test split using the
# mean absolute error between predictions and true outcomes.
ada_test_predictions = ada_fit.predict(test_features)
score = eval_measures.meanabs(ada_test_predictions, test_outcome)

print("Mean absolute error: " + str(score))