## DSI Project 4 - Analysis

In [55]:
# Import the libraries used throughout this notebook
import pandas as pd
import numpy as np # linear algebra
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, r2_score, mean_squared_error, explained_variance_score
from warnings import simplefilter
# Optionally turn off warnings once the models are producing good results
#   non convergence may mean the result is not perfect, but it might be good enough
# Credit to Jamie Shaffer
# `category` has to be a Warning subclass. A non-class value (such as 0) is accepted
# silently here but makes the filter check raise TypeError as soon as any warning
# is issued, so we pass the base Warning class to silence every category.
simplefilter("ignore", category=Warning)

In [56]:
class data_explorer:
    """Load a CSV into a DataFrame and offer small helpers for first-pass exploration.

    Attributes:
        dframe: the DataFrame read from ``path``; the mutating helpers update it.
        dtypes: per-column dtypes of ``dframe``.
        shape: ``(rows, columns)`` of ``dframe``.
        nulls: per-column *fraction* of null values (``isnull().mean()``).
    """

    def __init__(self, path):
        """Read the CSV at ``path`` (anything ``pd.read_csv`` accepts) and cache its summaries."""
        self.dframe = pd.read_csv(path)
        self._refresh_summary()

    def _refresh_summary(self):
        # Recompute the cached summaries so they always describe the current dframe.
        self.dtypes = self.dframe.dtypes
        self.shape = self.dframe.shape
        self.nulls = self.dframe.isnull().mean()

    def explore(self):
        """Print the frame's dimensions, every column's dtype, and any non-zero null fraction."""
        # The frame may have been changed since __init__, so never trust stale summaries.
        self._refresh_summary()
        print(f'DataFrame has {self.shape[1]} columns and {self.shape[0]} rows.')

        # Track nulls across ALL columns; resetting this per column would make the
        # final message depend only on the last column.
        found_nulls = False
        for position, (null_fraction, dtype) in enumerate(zip(self.nulls, self.dtypes), start=1):
            if null_fraction != 0:
                # NOTE: the number printed is the fraction of null rows, not a count.
                print(f'Column {position} has {null_fraction} null items')
                found_nulls = True
            print(f'Column {position} is {dtype}')
        if not found_nulls:
            print(f'DataFrame has zero null values.')

    def col_drop(self, drop_list, inplace = True):
        """Drop the columns named in ``drop_list``.

        With ``inplace=True`` (default) ``self.dframe`` is modified and None is returned.
        With ``inplace=False`` ``self.dframe`` is left alone and the reduced copy is returned.
        """
        if not inplace:
            return self.dframe.drop(columns=drop_list)
        self.dframe.drop(columns=drop_list, inplace=True)
        self._refresh_summary()
        return None

    def num_check(self, column_name):
        """Print the rows whose string value in ``column_name`` is not numeric.

        Null entries are not printed: ``.str.isnumeric()`` yields NaN for them and
        ``NaN == False`` is False.
        """
        non_numeric = self.dframe[column_name].str.isnumeric() == False
        print(self.dframe[non_numeric])

    def view_asc(self, column_name, asc = False):
        """Return ``column_name`` sorted; descending unless ``asc=True``."""
        return self.dframe[column_name].sort_values(ascending = asc)

    def col_nan(self, column_name, s):
        """Replace every value equal to ``s`` in ``column_name`` with NaN, casting the rest to int.

        The converted column is written back to ``self.dframe``. Values that are
        already null stay null, so the method can safely be called more than once.

        Raises:
            ValueError: if a remaining value cannot be converted with ``int()``.
        """
        self.dframe[column_name] = self.dframe[column_name].apply(
            lambda x: np.nan if (pd.isna(x) or x == s) else int(x)
        )
        self._refresh_summary()

    def s_if_drop(self):
        """Return the shape ``self.dframe`` would have if every row containing a null were dropped."""
        return self.dframe.dropna().shape

    def fill_na(self):
        """Replace every null in ``self.dframe`` with 0 and keep the result."""
        self.dframe = self.dframe.fillna(value = 0)
        self._refresh_summary()

    def df_builder(self, column_list):
        """Return a new DataFrame holding only the columns in ``column_list``, in that order."""
        new_df = pd.DataFrame()
        for item in column_list:
            new_df[item] = self.dframe[item]
        return new_df

In [57]:
# Create a data_explorer instance; its constructor reads this CSV into df_o.dframe
df_o = data_explorer('/data/hackathon.csv')

In [58]:
# Create a deep copy of our dataframe so later changes to df never touch df_o.dframe
df = df_o.dframe.copy()

In [59]:
# This class does the following:
# Does a train/test/split on an input dataframe, with the features and target taken as input
# It then scales the data using StandardScaler
class analyzer:
    """Split, standard-scale, and benchmark four classifiers on one DataFrame.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        features: list of column names used as predictors.
        target: name of the column to predict.
        test_size: fraction of rows held out for testing (default 0.33).
        random_state: seed for the split and for the seeded estimators (default 42).

    Attributes:
        X, y: the selected feature frame and target series.
        X_train, X_test, y_train, y_test: the stratified split.
        scaler: the StandardScaler fitted on X_train.
        Z_train, Z_test: scaled versions of X_train / X_test.
        results: dict of fitted GridSearchCV objects, filled in by go().
    """

    def __init__(self, df, features, target, test_size=0.33, random_state=42):
        self.df = df
        self.X = self.df[features]
        self.y = self.df[target]
        self.random_state = random_state
        self.results = {}

        # This splits our data for testing, keeping class proportions equal in both parts
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=random_state, stratify=self.y
        )

        # Scale our data. The scaler is kept so unseen data can be transformed later.
        # NOTE: it is fitted on the whole training split, so the CV folds used in go()
        # share the same scaling statistics.
        self.scaler = StandardScaler()
        self.Z_train = self.scaler.fit_transform(self.X_train)
        self.Z_test = self.scaler.transform(self.X_test)

    @staticmethod
    def _confusion_counts(cm):
        # Unpack a 2x2 confusion matrix (rows = actual, columns = predicted) into
        # (TP, TN, FP, FN); return None for any other shape.
        cm = np.asarray(cm)
        if cm.shape != (2, 2):
            return None
        tn, fp, fn, tp = cm.ravel()
        return int(tp), int(tn), int(fp), int(fn)

    def _fit_and_report(self, estimator, param_grid, train_features, test_features):
        # Grid-search one estimator with 5-fold CV, print its metrics, return the fitted search.
        gs = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=5, n_jobs=-1)
        gs.fit(train_features, self.y_train)
        best_model = gs.best_estimator_
        preds = best_model.predict(test_features)

        # Calculate and print our metrics
        print(f'Best Score: {np.round(gs.best_score_,3)} \n')

        print(f'X_train, y_train Score: {np.round(best_model.score(train_features, self.y_train), 3)}')
        print(f'X_test, y_test Score: {np.round(best_model.score(test_features, self.y_test), 3)} \n')

        counts = self._confusion_counts(confusion_matrix(self.y_test, preds))
        if counts is not None:
            # The TP/TN/FP/FN breakdown only makes sense for a binary target.
            TP, TN, FP, FN = counts
            print(f'True Positives: {TP}, True Negatives: {TN}, False Positives: {FP}, False Negatives: {FN} \n')

        print(f'{classification_report(self.y_test, preds)} \n')

        print('--------------------------------- \n')
        return gs

    def go(self):
        """Grid-search each of the four models in turn and print its metrics.

        Returns None. The fitted GridSearchCV objects are stored in ``self.results``
        under the keys 'logistic_regression', 'multinomial_nb', 'linear_svc'
        and 'random_forest'.
        """
        # Each entry: (results key, header, underline, estimator, grid, train data, test data)
        runs = [
            # Model #1
            # NOTE(review): logspace takes exponents, so this grid spans C = 10**0.001
            # (about 1.002) up to 10. If C was meant to start at 0.001, the first
            # argument should be -3 — confirm before changing.
            ('logistic_regression',
             'Logistic Regression:',
             '-------------- \n',
             LogisticRegression(),
             {'C': np.logspace(.001, 1, 100)},
             self.Z_train, self.Z_test),
            # Model #2
            # Fitted on the unscaled features — presumably because MultinomialNB
            # does not accept the negative values that standard scaling produces.
            ('multinomial_nb',
             f'MultinomialNB',
             '-------------- \n',
             MultinomialNB(),
             {'alpha': np.linspace(.001, 1, 100)},
             self.X_train, self.X_test),
            # Model #3
            ('linear_svc',
             'LinearSVC (This is gonna take a while...):',
             '--------- \n',
             LinearSVC(max_iter = 5000, random_state = self.random_state),
             {"C": np.linspace(0.0001, 1, 20)},
             self.Z_train, self.Z_test),
            # Model #4
            # Seeded so repeated runs give the same forest.
            ('random_forest',
             'RandomForestClassifier:',
             '----------------------- \n',
             RandomForestClassifier(random_state = self.random_state),
             {'n_estimators': [116], 'max_features': [None], 'max_depth': [11]},
             self.Z_train, self.Z_test),
        ]

        self.results = {}
        total = len(runs)
        for step, (key, header, underline, estimator, grid, train_data, test_data) in enumerate(runs, start=1):
            print(header)
            print(underline)
            self.results[key] = self._fit_and_report(estimator, grid, train_data, test_data)
            print(f'{step} of {total}  Models Finished \n')

        print('Analysis Finished')

In [61]:
# Define our features (predictor column names) and our target (the column to predict)
# NOTE(review): these names are assumed to match columns in the loaded CSV; 'workclass-govt'
# uses a hyphen where the other workclass_* names use an underscore — confirm it is intended.
features = ['age', 'fnlwgt', 'education-num', 'hours-per-week', 'workclass_?', 'workclass_Never-worked', 'workclass_Private', 'workclass_Self-emp-inc',
            'workclass_Self-emp-not-inc', 'workclass_Without-pay', 'workclass-govt', 'married', 'single', 'separated', 'US', 'male', 'fnlwgt^2', 'hours-per-week^2']
target = 'wage'

In [62]:
# Create an analyzer object; its constructor does the train/test split and scales the features
a_obj = analyzer(df, features, target)

In [63]:
# Run analysis: grid-search every model defined in analyzer.go() and print its metrics
a_obj.go()

Logistic Regression:
-------------- 

Best Score: 0.826 

X_train, y_train Score: 0.827
X_test, y_test Score: 0.827 

True Positives: 1358, True Negatives: 7532, False Positives: 626, False Negatives: 1230 

              precision    recall  f1-score   support

           0       0.86      0.92      0.89      8158
           1       0.68      0.52      0.59      2588

    accuracy                           0.83     10746
   macro avg       0.77      0.72      0.74     10746
weighted avg       0.82      0.83      0.82     10746
 

--------------------------------- 

1 of 3  Models Finished 

MultinomialNB
-------------- 

Best Score: 0.379 

X_train, y_train Score: 0.379
X_test, y_test Score: 0.376 

True Positives: 2009, True Negatives: 2033, False Positives: 6125, False Negatives: 579 

              precision    recall  f1-score   support

           0       0.78      0.25      0.38      8158
           1       0.25      0.78      0.37      2588

    accuracy                        