# Load in the data

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Colab only: mount Google Drive at /gdrive so the notebook can reach the
# competition folder used in the cells below.
from google.colab import drive
drive.mount('/gdrive')



Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [3]:
# Work from the competition folder so relative paths such as
# 'LogReg-data.csv' resolve, then list its contents.
%cd  /gdrive/MyDrive/364/'Logistic Regression Comp'/
%ls

/gdrive/MyDrive/364/Logistic Regression Comp
 Exploration.ipynb            'Model Testing and Selection.ipynb'
 LogReg-data.csv               processing.py
 LogReg-sample-features.csv    [0m[01;34m__pycache__[0m/
 LogReg-sample-target.csv      test.ipynb
 LogReg-TestingSubmission.py


In [4]:
# Load the competition dataset from the current working directory and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved row index. It is not removed here, so a later `df.drop(columns='y')`
# would keep it as a feature -- confirm whether that is intended (or read the
# file with index_col=0).
df = pd.read_csv('LogReg-data.csv')
df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,y
0,8.8,0.45,7.0,3.00,45.0,0.36,0.27,0.045,1.00100,20.7,170.0,0
1,9.5,0.49,6.3,3.30,14.0,0.34,0.30,0.049,0.99400,1.6,132.0,0
2,10.1,0.44,8.1,3.26,30.0,0.40,0.28,0.050,0.99510,6.9,97.0,0
3,9.9,0.40,7.2,3.19,47.0,0.32,0.23,0.058,0.99560,8.5,186.0,0
4,9.9,0.40,7.2,3.19,47.0,0.32,0.23,0.058,0.99560,8.5,186.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3150,8.9,0.46,5.7,3.22,41.0,0.20,0.22,0.044,0.99862,16.0,113.0,0
3151,10.1,0.50,5.7,3.30,23.0,0.24,0.26,0.059,0.99773,17.8,124.0,1
3152,10.1,0.50,5.7,3.30,23.0,0.24,0.26,0.059,0.99773,17.8,124.0,1
3153,11.0,0.42,6.0,3.15,22.0,0.26,0.20,0.049,0.99280,6.8,93.0,0


# Base Model

In [5]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from scipy.stats import loguniform

In [6]:
def evaluateRegression(x, scale='standard', y=None):
  """Fit a logistic regression on an 80/20 split and print its hold-out accuracy.

  Args:
    x: Feature matrix with one row per sample.
    scale: 'standard' prepends a StandardScaler, 'normalize' prepends a
      MinMaxScaler; any other value (e.g. None) fits on the features as given.
    y: Optional target labels aligned with the rows of ``x``. When omitted,
      the ``y`` column of the notebook-level ``df`` is used, which is what
      the function always did before this parameter existed.

  Returns:
    None. The accuracy on the held-out 20% is printed.
  """
  if y is None:
    y = df['y'].to_numpy()
  X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)

  if scale == 'standard':
    pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000))
  elif scale == 'normalize':
    pipe = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=2000))
  else:
    pipe = make_pipeline(LogisticRegression(max_iter=2000))

  # The scaler is a pipeline step, so it is fitted on the training rows only.
  pipe.fit(X_train, y_train)
  print(pipe.score(X_test, y_test))
  


In [7]:
def evaluateGridRegression(x, scale='standard', y=None):
  """Grid-search the regularisation strength C and print the best CV result.

  Args:
    x: Feature matrix with one row per sample.
    scale: 'standard' prepends a StandardScaler, 'normalize' prepends a
      MinMaxScaler (both with the 'saga' solver); any other value fits an
      unscaled LogisticRegression.
    y: Optional target labels aligned with the rows of ``x``. When omitted,
      the ``y`` column of the notebook-level ``df`` is used, which is what
      the function always did before this parameter existed.

  Returns:
    None. The best cross-validated accuracy and the matching C are printed.
  """
  if y is None:
    y = df['y'].to_numpy()
  # Only the 80% training portion is searched; the held-out 20% is not used
  # by this function.
  X_train, _, y_train, _ = train_test_split(x, y, test_size=0.2, random_state=101)

  if scale == 'standard':
    pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000, solver='saga'))
  elif scale == 'normalize':
    pipe = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=2000, solver='saga'))
  else:
    # NOTE(review): this branch leaves the solver at its default instead of
    # 'saga' -- presumably deliberate for unscaled data; confirm.
    pipe = make_pipeline(LogisticRegression(max_iter=2000))

  # Candidate values for the classifier step's C, one per decade.
  space = {'logisticregression__C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]}

  # define search
  search = GridSearchCV(pipe, space, scoring='accuracy')

  # execute search
  result = search.fit(X_train, y_train)

  # summarize result
  print('Best Score: %s' % result.best_score_)
  print('Best Hyperparameters: %s' % result.best_params_)

In [8]:
def evaluateRandomRegression(x, scale='standard', y=None, random_state=101):
  """Randomised search over C and print the best cross-validated result.

  Args:
    x: Feature matrix with one row per sample.
    scale: 'standard' prepends a StandardScaler, 'normalize' prepends a
      MinMaxScaler (both with the 'saga' solver); any other value fits an
      unscaled LogisticRegression.
    y: Optional target labels aligned with the rows of ``x``. When omitted,
      the ``y`` column of the notebook-level ``df`` is used, which is what
      the function always did before this parameter existed.
    random_state: Seed for the random draws of C. Without a seed each run
      samples different candidates, so the reported "best" C cannot be
      reproduced; the default reuses the seed of the train/test split.

  Returns:
    None. The best cross-validated accuracy and the matching C are printed.
  """
  if y is None:
    y = df['y'].to_numpy()
  # Only the 80% training portion is searched; the held-out 20% is not used
  # by this function.
  X_train, _, y_train, _ = train_test_split(x, y, test_size=0.2, random_state=101)

  if scale == 'standard':
    pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=2000, solver='saga'))
  elif scale == 'normalize':
    pipe = make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=2000, solver='saga'))
  else:
    # NOTE(review): this branch leaves the solver at its default instead of
    # 'saga' -- presumably deliberate for unscaled data; confirm.
    pipe = make_pipeline(LogisticRegression(max_iter=2000))

  # Draw C log-uniformly between 1e-5 and 100.
  space = {'logisticregression__C': loguniform(1e-5, 100)}

  # define search
  search = RandomizedSearchCV(pipe, space, scoring='accuracy', random_state=random_state)

  # execute search
  result = search.fit(X_train, y_train)

  # summarize result
  print('Best Score: %s' % result.best_score_)
  print('Best Hyperparameters: %s' % result.best_params_)


In [9]:
# Every column except 'y' is treated as a feature. Keep the labelled
# DataFrame/Series views and derive the plain NumPy arrays from them.
X_df = df.drop(columns='y')
y_df = df['y']
X = X_df.to_numpy()
y = y_df.to_numpy()


In [10]:

# Compare the three scaling options on the same hold-out split. Separate
# calls (instead of one tuple expression) avoid a meaningless
# `(None, None, None)` cell result, because evaluateRegression only prints.
for scale_option in ('standard', 'normalize', None):
  evaluateRegression(X, scale=scale_option)
# NOTE(review): in the recorded run the unscaled pipeline scored highest
# (0.702 vs 0.691 standard / 0.688 normalize), so the earlier "standard wins"
# remark does not match this output -- confirm which run it referred to.


0.6909667194928685
0.687797147385103
0.7020602218700476


(None, None, None)

In [11]:
# Grid-search C for the standard-scaled pipeline on the unprocessed features.
evaluateGridRegression(X, scale='standard')

Best Score: 0.6949237780920949
Best Hyperparameters: {'logisticregression__C': 0.1}


In [12]:
# Randomised search over C for the standard-scaled pipeline on the unprocessed features.
evaluateRandomRegression(X, scale='standard')

Best Score: 0.696904761904762
Best Hyperparameters: {'logisticregression__C': 0.03725908620728233}


# Models with processing

In [13]:
# NOTE(review): nothing visible in this notebook imports `ipynb`, and the
# install is unpinned -- confirm it is still needed.
%pip install ipynb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
import sys
# NOTE(review): the directory listing earlier shows processing.py inside the
# current working directory ('/gdrive/MyDrive/364/Logistic Regression Comp'),
# not in '/gdrive/MyDrive/364' -- confirm this entry is needed and points at
# the intended copy of the module.
sys.path.insert(0,'/gdrive/MyDrive/364')

In [15]:
# import processing
from processing import *

In [16]:
# Run process_zero_col (from processing.py, not visible here) over the raw
# features with method='mean' -- presumably it fills zero entries using the
# column mean; TODO confirm -- then score the default standard-scaled model.
# The recorded accuracy is identical to the raw-feature baseline.
X1 = process_zero_col(X, method='mean')
evaluateRegression(X1)

0.6909667194928685


In [17]:
# Grid-search C on the process_zero_col features (default standard scaling).
evaluateGridRegression(X1)

Best Score: 0.6953198176960553
Best Hyperparameters: {'logisticregression__C': 0.1}


In [18]:
# Randomised search over C on the process_zero_col features (default standard scaling).
evaluateRandomRegression(X1)

Best Score: 0.6973023730944524
Best Hyperparameters: {'logisticregression__C': 0.05131530666858142}


In [19]:
# Apply log_transform (from processing.py, not visible here) on top of X1 and re-score.
X2 = log_transform(X1)  
evaluateRegression(X2)

0.6957210776545166


In [20]:
# Apply col_mult (from processing.py; presumably adds column-product
# features -- TODO confirm) to X1 and re-score.
X3 = col_mult(X1)
evaluateRegression(X3)

0.6973058637083994


In [21]:
# Same col_mult expansion, this time on the log-transformed features X2.
X4= col_mult(X2)
evaluateRegression(X4)

0.6988906497622821


In [22]:
# col_mult on the raw features, skipping process_zero_col. The recorded
# accuracy is identical to the X3 run (col_mult on X1).
X5 = col_mult(X)
evaluateRegression(X5)

0.6973058637083994


In [23]:
# Apply col_div (from processing.py; presumably adds column-ratio
# features -- TODO confirm) to X1 and re-score.
X6 = col_div(X1)
evaluateRegression(X6)

0.705229793977813


In [31]:
# Chain both expansions: col_div first, then col_mult on its result, and
# score the standard-scaled model on the combined feature set.
X7 = col_mult(col_div(X1))
evaluateRegression(X7, scale='standard')

0.7179080824088748


In [25]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X1 into polynomial terms up to degree 4 (not restricted to
# interaction terms); degree 4 was the best value the author tried.
poly = PolynomialFeatures(degree=4, interaction_only=False)
X8 = poly.fit_transform(X1)
evaluateRegression(X8, scale='standard')
# Best model: highest hold-out accuracy among the feature sets tried above.

0.722662440570523


In [None]:
# Tune C on the col_div + col_mult feature set with both search strategies.
# Separate statements avoid a meaningless `(None, None)` cell result, since
# both helpers only print.
evaluateGridRegression(X7)
evaluateRandomRegression(X7)

In [26]:
# log_transform on top of the degree-4 polynomial features.
# NOTE(review): the recorded accuracy is identical to the X8 run, so the
# transform may have had no effect on this input -- confirm.
X10 = log_transform(X8)

evaluateRegression(X10, scale='standard')

0.722662440570523


In [27]:
# Re-import the helpers. Repeating `from processing import *` on its own
# reuses the module object cached in sys.modules, so edits made to
# processing.py after the first import would be ignored; reload it first.
import importlib
import processing
importlib.reload(processing)
from processing import *

In [53]:
# Final candidate: run `process` (from processing.py) over the raw features,
# split with the same test_size/random_state as the evaluation helpers, and
# fit a C=0.1 logistic regression on the training portion.
X_clean = process(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=101
)

model = LogisticRegression(C=0.1, max_iter=3000)
model.fit(X_train, y_train)

# Hold-out accuracy of the exported model.
model.score(X_test, y_test)

0.7210776545166403

# Export Best Model

In [54]:
import pickle

# Serialise the fitted model next to the notebook. The context manager
# closes the file handle (and flushes it) even if pickling fails; the
# previous bare open() call never closed it explicitly.
with open("model.pickle", "wb") as model_file:
  pickle.dump(model, model_file)