In [47]:
import numpy as py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


In [16]:
penguins = sns.load_dataset('penguins')
penguins = penguins.dropna()
y = penguins.pop('species')
X_train, X_test, y_train, y_test = train_test_split(
    penguins, y, test_size=0.5, random_state=42)
X_train_nums = X_train.select_dtypes('float64')

ss = StandardScaler()

ss.fit(X_train_nums)
nums_df = pd.DataFrame(ss.transform(X_train_nums),
                      index=X_train_nums.index)
X_train_cat = X_train.select_dtypes('object')

ohe = OneHotEncoder(
    drop='first',
    sparse=False)

dums = ohe.fit_transform(X_train_cat)
dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names(),
                       index=X_train_cat.index)

In [20]:
X_train_clean = pd.concat([nums_df, dums_df], axis=1)
X_train_clean

Unnamed: 0,0,1,2,3,x0_Dream,x0_Torgersen,x1_Male
160,0.362748,0.903276,-0.472344,-0.094599,1.0,0.0,0.0
237,0.973499,-0.977375,1.408317,2.512546,0.0,0.0,1.0
2,-0.725152,0.445820,-0.472344,-1.185963,0.0,1.0,0.0
121,-1.221387,1.360731,-0.255345,-0.882806,0.0,1.0,1.0
179,1.030757,0.954104,-0.110678,-0.519018,1.0,0.0,1.0
...,...,...,...,...,...,...,...
194,1.297960,1.004932,-0.400011,-0.822175,1.0,0.0,1.0
77,-1.316816,1.157417,-1.268008,-0.397756,0.0,1.0,1.0
112,-0.839667,0.293335,-0.617010,-1.246594,0.0,0.0,0.0
277,0.267318,-1.079032,1.335984,0.936132,0.0,0.0,1.0


In [21]:
X_test_nums = X_test.select_dtypes('float64')

ss = StandardScaler()

ss.fit(X_test_nums)
nums_df = pd.DataFrame(ss.transform(X_test_nums),
                      index=X_test_nums.index)
X_test_cat = X_test.select_dtypes('object')

ohe = OneHotEncoder(
    drop='first',
    sparse=False)

dums = ohe.fit_transform(X_test_cat)
dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names(),
                       index=X_test_cat.index)

In [22]:
X_test_clean = pd.concat([nums_df, dums_df], axis=1)
X_test_clean

Unnamed: 0,0,1,2,3,x0_Dream,x0_Torgersen,x1_Male
30,-0.773744,-0.257901,-1.584752,-1.196636,1.0,0.0,0.0
317,0.531460,-1.327003,1.527156,0.880352,0.0,0.0,0.0
79,-0.315159,0.963929,-0.382424,-0.238026,0.0,1.0,1.0
201,1.042959,0.047556,-0.170249,-0.653424,1.0,0.0,0.0
63,-0.491537,0.505743,-0.594600,-0.174119,0.0,0.0,1.0
...,...,...,...,...,...,...,...
330,1.166424,-1.021546,1.102804,1.040120,0.0,0.0,0.0
310,0.637288,-1.123365,1.244255,0.976213,0.0,0.0,0.0
170,0.443271,0.709381,-0.736050,-0.941007,1.0,0.0,0.0
229,0.513822,-0.919726,1.032079,1.231842,0.0,0.0,1.0


In [5]:
#creating Pipelines for each column type: numeric, categorical with under 10 categories, categorical with over 10 categories

num_col_pipe = Pipeline(steps = [
    ('mm', MinMaxScaler())
])

sm_cat_col_pipe = Pipeline(steps = [
    ('ohe', OneHotEncoder(drop = 'first',
                          sparse = False 
                         )
    )
])




### Important Note About IterativeImputer text

This is straight up copied from : https://machinelearningmastery.com/iterative-imputation-for-missing-values-in-machine-learning/
So if we use any of the below we need to rewrite and cite

In [6]:
#By default, imputation is performed in ascending order from the feature
# with the least missing values to the feature with the most.
# here are the strategy options: strategies = ['ascending', 'descending', 'roman', 'arabic', 'random']

#Max_iterations: It is possible that a large number of iterations may begin to bias or skew the
# estimate and that few iterations may be preferred. The number of iterations of the procedure can be 
# specified via the “max_iter” argument. Default value is 10

lrg_cat_col_pipe = Pipeline(steps = [
    ('ii', IterativeImputer())
])

In [7]:
#declaring some placeholder lists 
num_cols = []
sm_cat_cols = []
lrg_cat_cols = []

col_trans = ColumnTransformer(transformers=[
    ('numeric', num_col_pipe, num_cols),
    ('small_cat', sm_cat_col_pipe, sm_cat_cols),
    ('large_cat', lrg_cat_col_pipe, lrg_cat_cols )
])

In [11]:
model_pipe_knn = Pipeline(steps =[
    ('col_trans', col_trans),
    ('knn', KNeighborsClassifier())
])

In [36]:
test_dict = {'knn':KNeighborsClassifier(), 'lr':LogisticRegression()}

def basic_model_tests(X, y, test_dict):
    '''Takes in X and y data as well as dictionary of name:test entries to run'''
    scores = []
    for key, value in test_dict.items():
        test_model_loop = Pipeline(steps=[
            (key, value)
        ])
        test_model_loop.fit(X, y)
        score = test_model_loop.score(X, y)
        scores.append({'name':key, 'type':value, 'score':score})
    return scores

In [34]:
scores = basic_model_tests(X_train_clean, y_train, test_dict)

In [35]:
print(scores)

[{'name': 'knn', 'type': KNeighborsClassifier(), 'score': 0.9939759036144579}, {'name': 'lr', 'type': LogisticRegression(), 'score': 1.0}]


In [None]:
LogisticRegression()

In [37]:
test_list = [{'name':'knn', 'type': KNeighborsClassifier(), 'params':{'knn__n_neighbors': [3, 5, 7], 'knn__p': [1, 2, 3]}},
            {'name': 'lr', 'type': LogisticRegression(), 'params':{'lr__fit_intercept':[False], 'lr__C':[1, 1000, 1e12],
                                                                  'lr__solver':['liblinear','newton-cg', 'lbfgs']}}]
#fit_intercept=False, C= 1e12, solver='liblinear'

In [41]:
for x in test_list:
    print(x['type'])

KNeighborsClassifier()
LogisticRegression()


In [50]:
def model_tests(X, y, test_list):
    '''Takes in X and y data as well as a list of dictionaries'''
    scores = []
    for x in test_list:
        test_model_loop = Pipeline(steps=[
            (x['name'], x['type'])
        ])
        test_model_loop.fit(X, y)
#         score = test_model_loop.score(X, y)
#         scores.append({'name':x['name'], 'type':x['type'], 'score':score})
        
        gs_pipe = GridSearchCV(estimator=test_model_loop, param_grid=x['params'])
        gs_pipe.fit(X,y)
        scores.append(gs_pipe.cv_results_)
    return scores

In [54]:
scores = model_tests(X_train_clean, y_train, test_list)


for x in scores:
    
    print(pd.DataFrame(x))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.002404      0.000494         0.002791        0.000744   
1       0.002199      0.000401         0.003002        0.000011   
2       0.002195      0.000399         0.002801        0.000400   
3       0.001999      0.000003         0.002800        0.000400   
4       0.002002      0.000005         0.002801        0.000401   
5       0.002204      0.000402         0.002598        0.000491   
6       0.002002      0.000009         0.002599        0.000486   
7       0.002200      0.000404         0.002203        0.000392   
8       0.001998      0.000003         0.002999        0.000015   

  param_knn__n_neighbors param_knn__p                                params  \
0                      3            1  {'knn__n_neighbors': 3, 'knn__p': 1}   
1                      3            2  {'knn__n_neighbors': 3, 'knn__p': 2}   
2                      3            3  {'knn__n_neighbors': 3, 'knn__p': 3}   
3            

In [5]:
#creating Pipelines for each column type: numeric, categorical with under 10 categories, categorical with over 10 categories

num_col_pipe = Pipeline(steps = [
    ('mm', MinMaxScaler())
])

sm_cat_col_pipe = Pipeline(steps = [
    ('ohe', OneHotEncoder(drop = 'first',
                          sparse = False 
                         )
    )
])




### Important Note About IterativeImputer text

This is straight up copied from : https://machinelearningmastery.com/iterative-imputation-for-missing-values-in-machine-learning/
So if we use any of the below we need to rewrite and cite

In [6]:
#By default, imputation is performed in ascending order from the feature
# with the least missing values to the feature with the most.
# here are the strategy options: strategies = ['ascending', 'descending', 'roman', 'arabic', 'random']

#Max_iterations: It is possible that a large number of iterations may begin to bias or skew the
# estimate and that few iterations may be preferred. The number of iterations of the procedure can be 
# specified via the “max_iter” argument. Default value is 10

lrg_cat_col_pipe = Pipeline(steps = [
    ('ii', IterativeImputer())
])

In [5]:
#creating Pipelines for each column type: numeric, categorical with under 10 categories, categorical with over 10 categories

num_col_pipe = Pipeline(steps = [
    ('mm', MinMaxScaler())
])

sm_cat_col_pipe = Pipeline(steps = [
    ('ohe', OneHotEncoder(drop = 'first',
                          sparse = False 
                         )
    )
])




### Important Note About IterativeImputer text

This is straight up copied from : https://machinelearningmastery.com/iterative-imputation-for-missing-values-in-machine-learning/
So if we use any of the below we need to rewrite and cite

In [6]:
#By default, imputation is performed in ascending order from the feature
# with the least missing values to the feature with the most.
# here are the strategy options: strategies = ['ascending', 'descending', 'roman', 'arabic', 'random']

#Max_iterations: It is possible that a large number of iterations may begin to bias or skew the
# estimate and that few iterations may be preferred. The number of iterations of the procedure can be 
# specified via the “max_iter” argument. Default value is 10

lrg_cat_col_pipe = Pipeline(steps = [
    ('ii', IterativeImputer())
])

In [7]:
#declaring some placeholder lists 
num_cols = []
sm_cat_cols = []
lrg_cat_cols = []

col_trans = ColumnTransformer(transformers=[
    ('numeric', num_col_pipe, num_cols),
    ('small_cat', sm_cat_col_pipe, sm_cat_cols),
    ('large_cat', lrg_cat_col_pipe, lrg_cat_cols )
])

In [11]:
model_pipe_knn = Pipeline(steps =[
    ('col_trans', col_trans),
    ('knn', KNeighborsClassifier())
])

In [36]:
test_dict = {'knn':KNeighborsClassifier(), 'lr':LogisticRegression()}

def basic_model_tests(X, y, test_dict):
    '''Takes in X and y data as well as dictionary of name:test entries to run'''
    scores = []
    for key, value in test_dict.items():
        test_model_loop = Pipeline(steps=[
            (key, value)
        ])
        test_model_loop.fit(X, y)
        score = test_model_loop.score(X, y)
        scores.append({'name':key, 'type':value, 'score':score})
    return scores

In [34]:
scores = basic_model_tests(X_train_clean, y_train, test_dict)

In [35]:
print(scores)

[{'name': 'knn', 'type': KNeighborsClassifier(), 'score': 0.9939759036144579}, {'name': 'lr', 'type': LogisticRegression(), 'score': 1.0}]


In [None]:
LogisticRegression()

In [37]:
test_list = [{'name':'knn', 'type': KNeighborsClassifier(), 'params':{'knn__n_neighbors': [3, 5, 7], 'knn__p': [1, 2, 3]}},
            {'name': 'lr', 'type': LogisticRegression(), 'params':{'lr__fit_intercept':[False], 'lr__C':[1, 1000, 1e12],
                                                                  'lr__solver':['liblinear','newton-cg', 'lbfgs']}}]
#fit_intercept=False, C= 1e12, solver='liblinear'

In [41]:
for x in test_list:
    print(x['type'])

KNeighborsClassifier()
LogisticRegression()


In [50]:
def model_tests(X, y, test_list):
    '''Takes in X and y data as well as a list of dictionaries'''
    scores = []
    for x in test_list:
        test_model_loop = Pipeline(steps=[
            (x['name'], x['type'])
        ])
        test_model_loop.fit(X, y)
#         score = test_model_loop.score(X, y)
#         scores.append({'name':x['name'], 'type':x['type'], 'score':score})
        
        gs_pipe = GridSearchCV(estimator=test_model_loop, param_grid=x['params'])
        gs_pipe.fit(X,y)
        scores.append(gs_pipe.cv_results_)
    return scores

In [54]:
scores = model_tests(X_train_clean, y_train, test_list)


for x in scores:
    
    print(pd.DataFrame(x))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.002404      0.000494         0.002791        0.000744   
1       0.002199      0.000401         0.003002        0.000011   
2       0.002195      0.000399         0.002801        0.000400   
3       0.001999      0.000003         0.002800        0.000400   
4       0.002002      0.000005         0.002801        0.000401   
5       0.002204      0.000402         0.002598        0.000491   
6       0.002002      0.000009         0.002599        0.000486   
7       0.002200      0.000404         0.002203        0.000392   
8       0.001998      0.000003         0.002999        0.000015   

  param_knn__n_neighbors param_knn__p                                params  \
0                      3            1  {'knn__n_neighbors': 3, 'knn__p': 1}   
1                      3            2  {'knn__n_neighbors': 3, 'knn__p': 2}   
2                      3            3  {'knn__n_neighbors': 3, 'knn__p': 3}   
3            

In [7]:
#declaring some placeholder lists 
num_cols = []
sm_cat_cols = []
lrg_cat_cols = []

col_trans = ColumnTransformer(transformers=[
    ('numeric', num_col_pipe, num_cols),
    ('small_cat', sm_cat_col_pipe, sm_cat_cols),
    ('large_cat', lrg_cat_col_pipe, lrg_cat_cols )
])

In [11]:
model_pipe_knn = Pipeline(steps =[
    ('col_trans', col_trans),
    ('knn', KNeighborsClassifier())
])

In [36]:
test_dict = {'knn':KNeighborsClassifier(), 'lr':LogisticRegression()}

def basic_model_tests(X, y, test_dict):
    '''Takes in X and y data as well as dictionary of name:test entries to run'''
    scores = []
    for key, value in test_dict.items():
        test_model_loop = Pipeline(steps=[
            (key, value)
        ])
        test_model_loop.fit(X, y)
        score = test_model_loop.score(X, y)
        scores.append({'name':key, 'type':value, 'score':score})
    return scores

In [34]:
scores = basic_model_tests(X_train_clean, y_train, test_dict)

In [35]:
print(scores)

[{'name': 'knn', 'type': KNeighborsClassifier(), 'score': 0.9939759036144579}, {'name': 'lr', 'type': LogisticRegression(), 'score': 1.0}]


In [None]:
LogisticRegression()

In [37]:
test_list = [{'name':'knn', 'type': KNeighborsClassifier(), 'params':{'knn__n_neighbors': [3, 5, 7], 'knn__p': [1, 2, 3]}},
            {'name': 'lr', 'type': LogisticRegression(), 'params':{'lr__fit_intercept':[False], 'lr__C':[1, 1000, 1e12],
                                                                  'lr__solver':['liblinear','newton-cg', 'lbfgs']}}]
#fit_intercept=False, C= 1e12, solver='liblinear'

In [41]:
for x in test_list:
    print(x['type'])

KNeighborsClassifier()
LogisticRegression()


In [50]:
def model_tests(X, y, test_list):
    '''Takes in X and y data as well as a list of dictionaries'''
    scores = []
    for x in test_list:
        test_model_loop = Pipeline(steps=[
            (x['name'], x['type'])
        ])
        test_model_loop.fit(X, y)
#         score = test_model_loop.score(X, y)
#         scores.append({'name':x['name'], 'type':x['type'], 'score':score})
        
        gs_pipe = GridSearchCV(estimator=test_model_loop, param_grid=x['params'])
        gs_pipe.fit(X,y)
        scores.append(gs_pipe.cv_results_)
    return scores

In [54]:
scores = model_tests(X_train_clean, y_train, test_list)


for x in scores:
    
    print(pd.DataFrame(x))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.002404      0.000494         0.002791        0.000744   
1       0.002199      0.000401         0.003002        0.000011   
2       0.002195      0.000399         0.002801        0.000400   
3       0.001999      0.000003         0.002800        0.000400   
4       0.002002      0.000005         0.002801        0.000401   
5       0.002204      0.000402         0.002598        0.000491   
6       0.002002      0.000009         0.002599        0.000486   
7       0.002200      0.000404         0.002203        0.000392   
8       0.001998      0.000003         0.002999        0.000015   

  param_knn__n_neighbors param_knn__p                                params  \
0                      3            1  {'knn__n_neighbors': 3, 'knn__p': 1}   
1                      3            2  {'knn__n_neighbors': 3, 'knn__p': 2}   
2                      3            3  {'knn__n_neighbors': 3, 'knn__p': 3}   
3            