---
self-contained: true
title: "GSB 544-Final-Classification"
author: "Ruojia Kuang"
format:
  html: 
    theme: cosmo
---

In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [56]:
# Load the training split of the survey data into a DataFrame.
# NOTE(review): the path below is an absolute path under one user's home
# directory, so this cell will only run on a machine with that exact layout.
train_data = pd.read_csv('/Users/ruojiakuang/Desktop/GSB S544 Computing and Machine Learning for Business Analytics/FInal Code/gsb-544-fall-2023-political-affiliations/CAH-201803-train.csv')
# Preview the first five rows.
train_data.head()

Unnamed: 0,id_num,Q1,Q2,political_affiliation,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18
0,1,Male,53,Independent,Liberal,College degree,Black,No,No,No,"Yes, somewhat religious",Pro-Choice,No,No,Behave no differently,5,2,5,No
1,5,Female,66,Independent,Conservative,Some college,White,Yes,No,Yes,"Yes, very religious",Pro-life,Yes,Yes,Less Willing,4,5,4,No
2,7,Female,58,Democrat,Liberal,College degree,White,No,No,No,"Yes, very religious",Pro-Choice,No,No,Behave no differently,5,1,4,Yes
3,8,Male,55,Independent,Moderate,High school or less,White,Yes,Yes,Yes,"Yes, somewhat religious",Pro-life,Yes,Yes,Less Willing,4,5,4,Yes
4,9,Male,64,Republican,Conservative,High school or less,White,Yes,Yes,Yes,No,Pro-life,No,No,Behave no differently,5,1,1,Yes


In [57]:
# Show the dtype of every column in the raw training frame.
train_data.dtypes

id_num                    int64
Q1                       object
Q2                        int64
political_affiliation    object
Q4                       object
Q5                       object
Q6                       object
Q7                       object
Q8                       object
Q9                       object
Q10                      object
Q11                      object
Q12                      object
Q13                      object
Q14                      object
Q15                       int64
Q16                       int64
Q17                       int64
Q18                      object
dtype: object

In [58]:
# Clean the train dataset: discard the identifier column, then remove
# every row that still contains a missing value.
train_data_cleaned = train_data.drop(columns=['id_num']).dropna()

# Print a column-by-column summary of the cleaned frame.
train_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Q1                     169 non-null    object
 1   Q2                     169 non-null    int64 
 2   political_affiliation  169 non-null    object
 3   Q4                     169 non-null    object
 4   Q5                     169 non-null    object
 5   Q6                     169 non-null    object
 6   Q7                     169 non-null    object
 7   Q8                     169 non-null    object
 8   Q9                     169 non-null    object
 9   Q10                    169 non-null    object
 10  Q11                    169 non-null    object
 11  Q12                    169 non-null    object
 12  Q13                    169 non-null    object
 13  Q14                    169 non-null    object
 14  Q15                    169 non-null    int64 
 15  Q16                    

# Decision Tree Model

In [59]:
# Target (y) is the political affiliation; every other column is a feature (X)
y = train_data_cleaned['political_affiliation']
X = train_data_cleaned.drop(columns='political_affiliation')

# Group the feature columns by dtype so each group gets its own preprocessing
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen during fitting ignored at prediction time
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Preprocessing followed by a Decision Tree classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=0)),
])

# Fit on the training portion only
dt_pipeline_fitted = pipeline.fit(X_train, y_train)

In [60]:
# Score the fitted decision-tree pipeline on the hold-out rows
y_pred = dt_pipeline_fitted.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Per-class precision/recall/F1 as text (stored, not displayed by this cell)
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy

0.5294117647058824

# KNN Model

In [61]:
# K-nearest-neighbours classifier with the library's default settings
knn_model = KNeighborsClassifier()

# Same preprocessing as before, followed by the KNN classifier
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', knn_model),
])

# Fit on the training rows, then predict the hold-out rows
y_pred = knn_pipeline.fit(X_train, y_train).predict(X_test)

# Hold-out accuracy, shown as the cell output
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

0.6176470588235294

# Prediction

In [62]:
# Load the unlabeled test split that predictions will be generated for.
# NOTE(review): absolute path under one user's home directory; this cell will
# only run on a machine with that exact layout.
test_data = pd.read_csv("/Users/ruojiakuang/Desktop/GSB S544 Computing and Machine Learning for Business Analytics/FInal Code/gsb-544-fall-2023-political-affiliations/CAH-201803-test.csv")
# Print column names, non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id_num  166 non-null    int64 
 1   Q1      166 non-null    object
 2   Q2      166 non-null    int64 
 3   Q4      166 non-null    object
 4   Q5      166 non-null    object
 5   Q6      166 non-null    object
 6   Q7      166 non-null    object
 7   Q8      166 non-null    object
 8   Q9      166 non-null    object
 9   Q10     166 non-null    object
 10  Q11     166 non-null    object
 11  Q12     166 non-null    object
 12  Q13     166 non-null    object
 13  Q14     166 non-null    object
 14  Q15     166 non-null    int64 
 15  Q16     166 non-null    int64 
 16  Q17     166 non-null    int64 
 17  Q18     166 non-null    object
dtypes: int64(5), object(13)
memory usage: 23.5+ KB


In [63]:
# Clean the dataset
# Mirror the training-data cleaning: drop the identifier column (it is not one
# of the features the pipeline was fitted on) and any rows with missing values.
test_data_cleaned = test_data.drop(columns=['id_num']).dropna()

# Predict an affiliation for every cleaned test row with the fitted KNN pipeline.
# The hold-out X_test from the train/test split is intentionally left untouched:
# overwriting it with the full training features served no purpose for this
# prediction and would break any later re-run of the evaluation cells, because
# its length would no longer match y_test.
# Note that y_pred now holds the test-set predictions, not the hold-out ones.
y_pred = knn_pipeline.predict(test_data_cleaned)

In [64]:
# Creating a new DataFrame for submission with test data
# y_pred was produced from test_data_cleaned, which may contain fewer rows than
# test_data if dropna() removed any. Take the ids from exactly the rows that
# were predicted (matched by index label) so ids and predictions always line up.
# When no rows were dropped this yields the same frame as using test_data directly.
submission = pd.DataFrame({
    'id_num': test_data.loc[test_data_cleaned.index, 'id_num'].to_numpy(),
    'political_affiliation_predicted': y_pred,
})

# Displaying the submission DataFrame with test data
submission.head()

Unnamed: 0,id_num,political_affiliation_predicted
0,2,Republican
1,3,Independent
2,4,Republican
3,6,Independent
4,11,Independent


In [65]:
# Print row count, column dtypes and non-null counts of the submission frame.
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 2 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   id_num                           166 non-null    int64 
 1   political_affiliation_predicted  166 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.7+ KB


In [66]:
# Write the submission to a CSV file at a relative path; index=False leaves the
# DataFrame index out of the file.
submission.to_csv('submission2_2.csv', index=False)