<a href="https://colab.research.google.com/github/ImTeddyGraham/Data-ML-Projects/blob/main/NBC_and_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from google.colab import files, drive

from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, log_loss, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Attaches Google Drive to the Colab filesystem under /content/drive
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Loads the CSV stored on Google Drive into a dataframe;
# cells containing '?' are read as missing values (NaN)
dataset = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/adult.csv',
    na_values='?',
)

In [None]:
# DATA PREPROCESSING

# Replaces the column labels with the names used throughout this notebook
column_names = [
    'Age', 'Workclass', 'fnlwgt', 'Education', 'Education-num',
    'Marital-Status', 'Occupation', 'Relationship', 'Race', 'Sex',
    'Capital-gain', 'Capital-loss', 'Hours-per-week',
    'Native_country', 'Salary',
]
dataset.columns = column_names

# Discards every row that contains at least one missing value
dataset = dataset.dropna()

# Encodes the salary bracket as a numeric '>50K' column ('>50K' -> 1,
# '<=50K' -> 0) appended after the existing columns, then removes the
# original text column. Any other salary string becomes NaN, which makes
# the integer cast below fail loudly instead of being silently mislabeled.
salary_to_class = {'>50K': 1.0, '<=50K': 0.0}
dataset = (
    dataset
    .assign(**{'>50K': dataset['Salary'].map(salary_to_class)})
    .drop(columns='Salary')
)

# Integer label vector used as the classification target
labels = dataset['>50K'].astype(int)

In [None]:
# Keeps only three features for simplification. Selecting them explicitly is
# equivalent to dropping every other feature column and the '>50K' label.
data_simple = dataset[['Age', 'Education-num', 'Hours-per-week']]

# Recasts data to type float. astype() returns a new dataframe rather than
# modifying the existing one, so the result has to be assigned back;
# calling it without the assignment leaves the columns as integers.
data_simple = data_simple.astype(np.float64)

# Used later for interpreting statistics
target_names = ['class 0', 'class 1']

In [None]:
# Previews the first five rows of the reduced feature table
data_simple.head(n=5)

Unnamed: 0,Age,Education-num,Hours-per-week
0,25,7,40
1,38,9,50
2,28,12,40
3,44,10,40
5,34,6,30


In [None]:
# Holds out 20% of the rows as test data and trains on the remaining 80%;
# the fixed random_state makes the shuffle, and therefore the split, repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    data_simple,
    labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# NAIVE BAYES

# Creates and trains NBC model. fit() returns the estimator itself, not
# predictions, so its return value is deliberately not stored in a
# prediction variable.
model_NBC = GaussianNB()
model_NBC.fit(X_train, Y_train)

# Predicts test and training data
Y_pred_NBC = model_NBC.predict(X_test)
Y_pred_NBC_train = model_NBC.predict(X_train)

In [None]:
# LOGISTIC REGRESSION WITH SGD

# loss='log_loss' selects logistic regression. The older spelling 'log' was
# deprecated in scikit-learn 1.1 and removed in 1.3, so 'log_loss' is used
# here (requires scikit-learn >= 1.1).
# SGD shuffles the training data between epochs; a fixed random_state keeps
# the fitted weights repeatable from run to run, matching the seeded
# train/test split.

# Creates a pipeline with standard scaler. Trains model. Predicts labels
pipe = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss', random_state=0))
pipe.fit(X_train, Y_train)
Y_pred_pipe_train = pipe.predict(X_train)
Y_pred_pipe = pipe.predict(X_test)

# Creates model without scaling.  Trains model. Predicts labels
model_LR = SGDClassifier(loss='log_loss', random_state=0)
model_LR.fit(X_train, Y_train)
Y_pred_LR_train = model_LR.predict(X_train)
Y_pred_LR = model_LR.predict(X_test)

  intercept_init,


In [None]:
# Log loss is defined on predicted class probabilities. Scoring hard 0/1
# predictions only yields a rescaled error rate, so every log_loss call below
# is given the model's predict_proba output. The classification reports still
# use the hard label predictions, which is what they expect.

# NBC stats
print('NBC training stats\n', classification_report(Y_train, Y_pred_NBC_train, target_names=target_names), 
      '\n\n\nNBC test stats\n', classification_report(Y_test, Y_pred_NBC, target_names=target_names))
print('NBC training log loss', log_loss(Y_train, model_NBC.predict_proba(X_train)),
      '\nNBC test log loss', log_loss(Y_test, model_NBC.predict_proba(X_test)))
# Scaled model stats 
print('\n\nScaled-model training stats\n', classification_report(Y_train, Y_pred_pipe_train, target_names=target_names), 
      '\n\n\nScaled-model test stats\n', classification_report(Y_test, Y_pred_pipe, target_names=target_names))
print('Scaled-model training log loss', log_loss(Y_train, pipe.predict_proba(X_train)),
      '\nScaled-mode test log loss', log_loss(Y_test, pipe.predict_proba(X_test)))
# Prints the intercept followed by the feature weights as a single row
print('\nScaled model bias and weight vector', np.hstack((pipe['sgdclassifier'].intercept_[:,None], pipe['sgdclassifier'].coef_)))

# Non-scaled model stats 
print('\n\nNon-scaled model training data stats\n', classification_report(Y_train, Y_pred_LR_train, target_names=target_names), 
      '\n\n\nNon-scaled  test data stats\n', classification_report(Y_test, Y_pred_LR, target_names=target_names))
print('Non-scaled model training log loss', log_loss(Y_train, model_LR.predict_proba(X_train)),
      '\nNon-scaled model test log loss', log_loss(Y_test, model_LR.predict_proba(X_test)))
# Prints the intercept followed by the feature weights as a single row
print('\nNon-scaled model bias and weight vector', np.hstack((model_LR.intercept_[:,None], model_LR.coef_)))

NBC training stats
               precision    recall  f1-score   support

     class 0       0.82      0.93      0.87     27241
     class 1       0.65      0.38      0.47      8936

    accuracy                           0.79     36177
   macro avg       0.73      0.65      0.67     36177
weighted avg       0.78      0.79      0.77     36177
 


NBC test stats
               precision    recall  f1-score   support

     class 0       0.81      0.93      0.87      6773
     class 1       0.64      0.37      0.46      2272

    accuracy                           0.79      9045
   macro avg       0.73      0.65      0.67      9045
weighted avg       0.77      0.79      0.77      9045

NBC training log loss 7.089764943667887 
NBC test log loss 7.3049261087287105


Scaled-model training stats
               precision    recall  f1-score   support

     class 0       0.80      0.95      0.87     27241
     class 1       0.64      0.26      0.37      8936

    accuracy                      