In [6]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
# import altair as alt
# alt.renderers.enable('notebook')

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler       
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV     
from sklearn.pipeline import make_pipeline    

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold

from statsmodels.tools import eval_measures
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold

from sklearn.model_selection import KFold

from statsmodels.tools import eval_measures
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold

from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV

In [2]:
poverty_data = pd.read_csv("poverty/src/data/poverty_data_with_dummy.csv")

In [3]:
poverty_data_no_string = poverty_data.drop("idhogar", axis = 1)
poverty_data_no_string = poverty_data_no_string.drop("Id", axis = 1)

In [4]:
train_features, test_features, train_outcome, test_outcome = train_test_split(
    poverty_data_no_string.drop("Target", axis = 1),
    poverty_data_no_string.Target,
    test_size=0.30, 
    random_state=11
)

In [5]:
# selector, threshold for feature selection
selecter = SelectPercentile()
threshold = VarianceThreshold(.1)

# number of folds for cross validation
folds = KFold(n_splits=10, shuffle=True, random_state=11)

## Naive Bayes 

In [9]:
nb_scaler = MinMaxScaler()
nb_clf = MultinomialNB()

nb_poly = PolynomialFeatures()

nb_pipe = make_pipeline(nb_poly, threshold,  nb_scaler, selecter, nb_clf)

##
# specify parameters and distributions to sample from
nb_param_dist = {"multinomialnb__fit_prior": [True, False],
              "multinomialnb__alpha": np.random.random_sample((20,)),
                 'polynomialfeatures__degree':sp_randint(1, 3),
             'selectpercentile__percentile':sp_randint(10, 30)}

n_iter_search = 20
nb_rand = RandomizedSearchCV(nb_pipe, param_distributions=nb_param_dist, 
                             n_iter=n_iter_search, scoring="accuracy", cv=folds)

nb_fit = nb_rand.fit(train_features, train_outcome)

In [10]:
nb_fit.best_params_

{'multinomialnb__alpha': 0.5384809204648233,
 'multinomialnb__fit_prior': True,
 'polynomialfeatures__degree': 1,
 'selectpercentile__percentile': 28}

In [11]:
nb_fit.best_score_

0.6483697277894107

## Random Forest

In [12]:
rf_scaler = MinMaxScaler()
rf_clf = RandomForestClassifier(n_estimators=100)
rf_poly = PolynomialFeatures()
rf_pipe = make_pipeline(rf_poly, threshold, selecter, rf_clf)

# specify parameters and distributions to sample from
param_dist = {"randomforestclassifier__max_depth": [3, None],
              "randomforestclassifier__max_features": sp_randint(1, 11),
              "randomforestclassifier__min_samples_split": sp_randint(2, 11),
              "randomforestclassifier__bootstrap": [True, False],
              "randomforestclassifier__criterion": ["gini", "entropy"],
             'polynomialfeatures__degree':sp_randint(1, 3),
             'selectpercentile__percentile':sp_randint(10, 30)}

n_iter_search = 20
rf_rand = RandomizedSearchCV(rf_pipe, param_distributions=param_dist, 
                             n_iter=n_iter_search, scoring="accuracy", cv=folds)

rf_fit = rf_rand.fit(train_features, train_outcome)

In [13]:
rf_fit.best_params_

{'polynomialfeatures__degree': 2,
 'randomforestclassifier__bootstrap': False,
 'randomforestclassifier__criterion': 'entropy',
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__max_features': 2,
 'randomforestclassifier__min_samples_split': 3,
 'selectpercentile__percentile': 15}

In [14]:
rf_fit.best_score_

0.8945557882141789

## Neural Network Models

In [15]:
from sklearn.neural_network import MLPClassifier

In [16]:
mlp_scaler = MinMaxScaler()
mlp_clf = MLPClassifier(solver='lbfgs',alpha=1e-5, hidden_layer_sizes=(5,2), random_state=1)
mlp_poly = PolynomialFeatures()

mlp_pipe = make_pipeline(mlp_poly, threshold, mlp_scaler, selecter, mlp_clf)

mlp_param_dist = {'polynomialfeatures__degree':sp_randint(1, 3),
             'selectpercentile__percentile':sp_randint(10, 30)}

n_iter_search = 20
mlp_rand = RandomizedSearchCV(mlp_pipe, param_distributions=mlp_param_dist, 
                             n_iter=n_iter_search, scoring="accuracy", cv=folds)

mlp_fit = mlp_rand.fit(train_features, train_outcome)

In [17]:
mlp_fit.best_params_

{'polynomialfeatures__degree': 2, 'selectpercentile__percentile': 23}

In [18]:
mlp_fit.best_score_

0.7008674842955429

## RBF SVM

In [19]:
from sklearn.svm import SVC

In [20]:
rbf_scaler = MinMaxScaler()
rbf_clf = SVC()
rbf_poly = PolynomialFeatures()

rbf_pipe = make_pipeline(rbf_poly, threshold, rbf_scaler, selecter, rbf_clf)

rbf_param_dist = {'svc__gamma': ['auto','scale'],
                 'polynomialfeatures__degree':sp_randint(1, 3),
             'selectpercentile__percentile':sp_randint(10, 30)}

n_iter_search = 20
rbf_rand = RandomizedSearchCV(rbf_pipe, param_distributions=rbf_param_dist, 
                             n_iter=n_iter_search, scoring="accuracy", cv=folds)

rbf_fit = rbf_rand.fit(train_features, train_outcome)

TypeError: must be real number, not str

In [None]:
rbf_fit.best_params_

In [None]:
rbf_fit.best_score_

## Gaussian Process

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [None]:
gpc_scaler = MinMaxScaler()
kernel = 1.0 * RBF(1.0)
gpc_clf = GaussianProcessClassifier(kernel = kernel, random_state=0)
gpc_poly = PolynomialFeatures()

gpc_pipe = make_pipeline(gpc_poly, threshold, gpc_scaler, selecter, gpc_clf)

gpc_param_dist = {'polynomialfeatures__degree':sp_randint(1, 3),
             'selectpercentile__percentile':sp_randint(10, 30)}

n_iter_search = 20
gpc_rand = RandomizedSearchCV(gpc_pipe, param_distributions=gpc_param_dist, 
                             n_iter=n_iter_search, scoring="accuracy", cv=folds)

gpc_fit = gpc_rand.fit(train_features, train_outcome)

In [None]:
gpc_fit.best_params_

In [None]:
gpc_fit.best_score_

## Nearest Neighbors 

In [21]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn_scaler = MinMaxScaler()
knn_clf = KNeighborsClassifier(algorithm = 'auto')
knn_poly = PolynomialFeatures()

knn_pipe = make_pipeline(knn_poly, threshold, knn_scaler, selecter, knn_clf)

knn_param_dist = {'polynomialfeatures__degree':sp_randint(1, 3),
                  'kneighborsclassifier__n_neighbors':sp_randint(1, 5),
                  'kneighborsclassifier__weights':['uniform','distance'],
             'selectpercentile__percentile':sp_randint(10, 30)}

n_iter_search = 20
knn_rand = RandomizedSearchCV(knn_pipe, param_distributions=knn_param_dist, 
                             n_iter=n_iter_search, scoring="accuracy", cv=folds)

knn_fit = knn_rand.fit(train_features, train_outcome)

In [None]:
knn_fit.best_params_

In [None]:
knn_fit.best_score_