In [2]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from palmerpenguins import load_penguins
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#Silence!
import warnings
warnings.filterwarnings("ignore")

In [3]:
test = pd.read_csv("C:\\Users\\diamo\\OneDrive - Cal Poly\\MSBA\\Machine Learning\\Final\\CAH-201803-test.csv")

In [4]:
test

Unnamed: 0,id_num,Q1,Q2,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,2,Female,78,Conservative,College degree,White,Yes,Yes,No,"Yes, very religious",Pro-Choice,Yes,Yes,Behave no differently,4,5,1,Yes
1,3,Male,59,Moderate,High school or less,Black,Yes,Yes,Yes,"Yes, very religious",Pro-Choice,No,No,More Willing,5,4,5,No
2,4,Male,59,Moderate,High school or less,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,No,Behave no differently,4,5,1,Yes
3,6,Male,52,Moderate,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,No,Yes,Less Willing,5,4,4,No
4,11,Female,33,Moderate,High school or less,White,No,No,Yes,"Yes, somewhat religious",Pro-Choice,No,No,More Willing,5,5,4,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,327,Female,68,Moderate,Graduate degree,White,Yes,No,No,"Yes, very religious",Pro-life,Yes,No,Behave no differently,5,5,2,No
162,330,Male,20,Moderate,High school or less,White,Yes,Yes,Yes,No,Pro-Choice,No,No,Less Willing,5,2,5,No
163,331,Male,65,Conservative,College degree,Latino,Yes,No,No,No,Pro-Choice,Yes,No,Behave no differently,5,2,1,No
164,333,Female,54,Moderate,Graduate degree,White,Yes,No,No,No,Pro-Choice,No,No,Behave no differently,5,1,5,Yes


In [5]:
train = pd.read_csv("C:\\Users\\diamo\\OneDrive - Cal Poly\\MSBA\\Machine Learning\\Final\\CAH-201803-train.csv")

In [6]:
train

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164,325,Male,21,Republican,Conservative,Some college,White,Yes,No,Yes,No,Pro-Choice,Yes,No,Less Willing,5,2,5,No
165,328,Female,41,Republican,Liberal,Graduate degree,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-Choice,Yes,No,Less Willing,5,2,2,No
166,329,Male,60,Republican,Conservative,Some college,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,No,Yes,Behave no differently,5,5,4,Yes
167,332,Female,51,Republican,Conservative,Graduate degree,White,Yes,Yes,Yes,"Yes, very religious",Pro-life,Yes,No,Less Willing,2,5,1,No


# Classification Testing

In [7]:
ct = ColumnTransformer(
  [
    ("dummify", #Dummifying variables that need it
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'), #If we haven't trained on a column, don't use for predicting on test validation
    make_column_selector(dtype_include=object))#,
#    ("standardize", 
#    StandardScaler(), 
#    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

In [8]:
y_train = train['political_affiliation']
X_train = train.drop(['political_affiliation','id_num'],axis=1)

## LDA

In [9]:
pipe = Pipeline([
    ("preprocessing", ct),
    ("model", LinearDiscriminantAnalysis())
])

grid = {'model__solver': ['svd','eigen'],
        'model__n_components': [1,2],
        'model__tol': [.0001,.001,.01,.00001]}
#Grid search with cross validation
grid_search = GridSearchCV(pipe, grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

#Get best n and corresponding model score
best_p = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model Score:", best_score)
print("That model's parameters:", best_p)

Best model Score: 0.5860294117647058
That model's parameters: {'model__n_components': 1, 'model__solver': 'svd', 'model__tol': 0.0001}


## QDA

In [10]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
pipe = Pipeline([
    ("preprocessing", ct),
    ("model", QuadraticDiscriminantAnalysis())
])

grid = {'model__reg_param': [0.0,.5,1,-.5,-1],
        'model__tol': [.0001,.001,.01,.00001]}
#Grid search with cross validation
grid_search = GridSearchCV(pipe, grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

#Get best n and corresponding model score
best_p = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model Score:", best_score)
print("That model's parameters:", best_p)

Best model Score: 0.5800356506238858
That model's parameters: {'model__reg_param': 0.5, 'model__tol': 0.0001}


## Decision Trees

In [11]:
pipe = Pipeline([
    ("preprocessing", ct),
    ("model", DecisionTreeClassifier())
])
#Grid search grid
grid = {'model__criterion': ['gini','entropy','log_loss'],
        'model__splitter': ['best','random'],
        'model__max_depth': [None,2,10,20],
        'model__min_samples_split': list(range(1,5,1)),
        'model__min_samples_leaf': list(range(1,51,10))
       }
#Grid search with cross validation
grid_search = GridSearchCV(pipe, grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

#Get best n and corresponding model score
best_p = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model Score:", best_score)
print("That model's parameters:", best_p)

Best model Score: 0.5981617647058823
That model's parameters: {'model__criterion': 'gini', 'model__max_depth': None, 'model__min_samples_leaf': 41, 'model__min_samples_split': 4, 'model__splitter': 'random'}


## KNN

In [12]:
pipe = Pipeline([
    ("preprocessing", ct),
    ("model", KNeighborsClassifier())
])
#Grid search grid
grid = {'model__n_neighbors': list(range(2,100,1))}  # Values for n_neighbors to be tested

#Grid search with cross validation
grid_search = GridSearchCV(pipe, grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

#Get best n and corresponding model score
best_p = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model Score:", best_score)
print("That model's parameters:", best_p)
#KNN sad score :(

Best model Score: 0.4973262032085561
That model's parameters: {'model__n_neighbors': 21}


## SVC

In [14]:
from sklearn import svm

pipe = Pipeline([
    ("preprocessing", ct),
    ("model", SVC())
])
#Grid search grid
grid = {'model__kernel': ['linear','poly'],
       'model__class_weight': [{'Democrat':.35,'Republican':.33,'Independent':.32}], #Based on test data distr
       'model__C': [1,2]}  # Values for n_neighbors to be tested

#Grid search with cross validation
grid_search = GridSearchCV(pipe, grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

#Get best n and corresponding model score
best_p = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model Score:", best_score)
print("That model's parameters:", best_p)

Best model Score: 0.5802139037433155
That model's parameters: {'model__C': 2, 'model__class_weight': {'Democrat': 0.35, 'Republican': 0.33, 'Independent': 0.32}, 'model__kernel': 'linear'}


## Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB
pipe = Pipeline([
    ("preprocessing", ct),
    ("model", GaussianNB())
])
#Grid search grid
grid = {'model__var_smoothing': [1e-10,1e-8,1e-6],
       'model__priors':[[.35,.33,.32],[.5,.5,.0],[.4,.4,.2]]}
#Grid search with cross validation
grid_search = GridSearchCV(pipe, grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

#Get best n and corresponding model score
best_p = grid_search.best_params_
best_score = grid_search.best_score_

print("Best model Score:", best_score)
print("That model's parameters:", best_p)

Best model Score: 0.5689839572192513
That model's parameters: {'model__priors': [0.4, 0.4, 0.2], 'model__var_smoothing': 1e-06}


## Predictions

In [19]:
X_test = test.drop(['id_num'],axis=1)

In [24]:
pipe = Pipeline([
    ("preprocessing", ct),
    ("model", LinearDiscriminantAnalysis(solver="svd",tol=0.0001,n_components=1))
])
pipe.fit(X_train,y_train)
# Make predictions on the test set
y_pred = pipe.predict(X_test)

#make into dataframe with id_num
res = pd.DataFrame({'id_num':test['id_num'],'political_affiliation_predicted':y_pred})
#turn into csv
res.to_csv('CAH_pred_lda.csv',index=False)
res

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Democrat
2,4,Democrat
3,6,Republican
4,11,Independent
...,...,...
161,327,Democrat
162,330,Independent
163,331,Democrat
164,333,Democrat


In [23]:
pipe = Pipeline([
    ("preprocessing", ct),
    ("model", GaussianNB(priors=[.4,.4,.2],var_smoothing=1e-6))
])
pipe.fit(X_train,y_train)
# Make predictions on the test set
y_pred = pipe.predict(X_test)

#make into dataframe with id_num
res = pd.DataFrame({'id_num':test['id_num'],'political_affiliation_predicted':y_pred})
#turn into csv
res.to_csv('CAH_pred_NB.csv',index=False)
res

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Independent
2,4,Republican
3,6,Republican
4,11,Independent
...,...,...
161,327,Democrat
162,330,Independent
163,331,Democrat
164,333,Democrat
