In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import altair as alt
alt.renderers.enable('notebook')

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler       
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV     
from sklearn.pipeline import make_pipeline    

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

from sklearn.model_selection import KFold

from statsmodels.tools import eval_measures
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import VarianceThreshold

In [3]:
# Load the poverty dataset from its CSV file (path is relative to the
# directory the notebook kernel is started in).
poverty_data = pd.read_csv(
    filepath_or_buffer="poverty/src/data/poverty_data_with_dummy.csv",
)

In [4]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across kernel restarts.
train_features, test_features, train_outcome, test_outcome = train_test_split(
    poverty_data.drop(columns="Target"),  # predictors: every column except the label
    poverty_data["Target"],               # label column
    random_state=11,
    test_size=0.30,
)

In [5]:
# Cross-validation splitter: 10 shuffled folds with a fixed seed.
folds = KFold(n_splits=10, shuffle=True, random_state=11)

# Feature-selection steps.
# `threshold` drops features whose variance does not exceed 0.1;
# `selecter` is a percentile-based selector left at its library defaults.
threshold = VarianceThreshold(threshold=0.1)
selecter = SelectPercentile()

In [None]:
# Individual stages of the multinomial naive Bayes pipeline.
nb_poly = PolynomialFeatures()
nb_scaler = MinMaxScaler()
nb_clf = MultinomialNB()

# Stage order: polynomial expansion -> variance filter -> min-max scaling
# -> percentile selection -> classifier.
# NOTE(review): the variance filter runs before scaling, so its 0.1 cut-off
# applies to unscaled features — confirm this ordering is intended.
nb_pipe = make_pipeline(nb_poly, threshold, nb_scaler, selecter, nb_clf)

# Hyper-parameter grid for the naive Bayes pipeline. Keys use the lowercase
# step names that make_pipeline derives from each estimator's class name.
nb_param_grid = {
    'polynomialfeatures__degree': [1, 2],
    'selectpercentile__percentile': [10, 15, 20, 25],
}

# Search the grid with the cross-validation folds, then fit on the
# training split.
nb_grid_search = GridSearchCV(estimator=nb_pipe, param_grid=nb_param_grid, cv=folds)
nb_grid_search_fitted = nb_grid_search.fit(train_features, train_outcome)