Luke Crump

## Import Libraries

In [35]:
# Preprocessing/Data manipulation
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


## Read CSVs

In [36]:
# Load the competition files: the training data, the test data, and the
# sample submission (its `id` column is reused when writing predictions).
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
answer = pd.read_csv("sample_submission.csv")

In [37]:
# Class balance of the training labels.
train_data.loc[:, 'label'].value_counts()

B    285
M    170
Name: label, dtype: int64

## Fixing Labeling

Encoding the sample-submission labels as 0/1 instead of M/B. Note that only the sample submission is encoded here; the training labels are left as B/M strings, which scikit-learn classifiers accept directly.

In [38]:
# Replace the class labels in the sample submission with integer codes.
# Only `answer` is encoded here; `train_data['label']` is left untouched.
encoder = LabelEncoder()
encoder.fit(answer['label'])
answer['label'] = encoder.transform(answer['label'])

# Display the encoded column.
answer['label']

0      1
1      1
2      0
3      1
4      1
      ..
109    0
110    0
111    0
112    0
113    0
Name: label, Length: 114, dtype: int32

## Splitting Dataset & Feature Scaling

Separating the features from the labels in the training data, assigning the test data, and standardizing the features (the scaler is fit on the training data only) so they are ready for the classifier algorithms.

In [39]:
# Training data: the target column, and every remaining column as a feature.
y_train = train_data['label']
X_train = train_data.drop('label', axis=1)

# Test data is used as-is.
X_test = test_data

# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)

## Perceptron
From here on, each section creates a classifier, trains it on the scaled training data, predicts labels for the test data, and writes the predictions to a CSV file named with a shortened form of the classifier's name.

In [40]:
# Train a perceptron on the scaled training set (fit returns the estimator).
perceptron = Perceptron().fit(X_train_scaled, y_train)

# Predict the scaled test set and save the result, keyed by the ids taken
# from the sample submission.
prediction = perceptron.predict(X_test_scaled)
output_df = pd.DataFrame({'id': answer['id'], 'label': prediction})
output_df.to_csv('percept.csv', index=False)

## Logistic Regression

In [41]:
# Train a logistic regression model on the scaled training set.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)

# Predict the scaled test set and write the id/label pairs to disk.
prediction = log_reg.predict(X_test_scaled)
output_df = pd.DataFrame({'id': answer['id'], 'label': prediction})
output_df.to_csv('log_reg.csv', index=False)


## SVM
I used this for my Kaggle submission.

In [42]:
# Train a support vector classifier with default settings on the scaled
# training set.
svc = SVC().fit(X_train_scaled, y_train)

# Predict the scaled test set and write the id/label pairs to disk.
prediction = svc.predict(X_test_scaled)
output_df = pd.DataFrame({'id': answer['id'], 'label': prediction})
output_df.to_csv('svm.csv', index=False)


## Decision Tree Classifier

In [43]:
# A decision tree permutes candidate features at every split, so an unseeded
# tree can differ between runs. Pinning random_state makes the fitted tree,
# and therefore dec_tree.csv, reproducible.
dtc = DecisionTreeClassifier(random_state=42)

dtc.fit(X_train_scaled, y_train)

# Predict the scaled test set and write the id/label pairs to disk.
prediction = dtc.predict(X_test_scaled)
output_df = pd.DataFrame({'id': answer.id, 'label': prediction})
output_df.to_csv('dec_tree.csv', index = False)

## KNN

In [44]:
# Train a k-nearest-neighbours classifier with default settings on the
# scaled training set.
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Predict the scaled test set and write the id/label pairs to disk.
prediction = knn.predict(X_test_scaled)
output_df = pd.DataFrame({'id': answer['id'], 'label': prediction})
output_df.to_csv('knn.csv', index=False)

## Random Forest Classifier

In [45]:
# A random forest draws bootstrap samples and feature subsets at random, so
# an unseeded forest can differ between runs. Pinning random_state makes the
# fitted forest, and therefore rfc.csv, reproducible.
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train_scaled, y_train)

# Predict the scaled test set and write the id/label pairs to disk.
prediction = rfc.predict(X_test_scaled)
output_df = pd.DataFrame({'id': answer.id, 'label': prediction})
output_df.to_csv('rfc.csv', index = False)

## Finding Best Classifier
Below, I split the training data into training and validation sets, fit each classifier on the training portion, and use its accuracy on the validation portion to get an idea of which classifier performs best.

In [46]:
# Feature matrix and target vector as plain arrays.
y = train_data['label'].values
X = train_data.drop(columns=['label']).values

# Hold out 20% of the labelled rows for validation (fixed seed).
# NOTE: this rebinds y_train, and the lines below rebind sc, X_train and
# X_test, all of which the submission cells above also assign. Re-run the
# notebook from the top before regenerating any submission file.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then scale both splits.
sc = StandardScaler().fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)



In [47]:
# Hold-out check for the perceptron: fit on the training split, then score
# on the validation split.
perceptron = Perceptron()

perceptron.fit(X_train, y_train)

prediction = perceptron.predict(X_test)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy : {accuracy_score(y_test, prediction)}')

Accuracy : 0.9560439560439561


In [48]:
# Hold-out check for logistic regression: fit on the training split, then
# score on the validation split.
log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

prediction = log_reg.predict(X_test)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy : {accuracy_score(y_test, prediction)}')

Accuracy : 0.978021978021978


I chose SVC for the Kaggle submission because of its high validation accuracy (0.989, tied with KNN below).

In [49]:
# Hold-out check for the support vector classifier: fit on the training
# split, then score on the validation split.
svc = SVC()

svc.fit(X_train, y_train)

prediction = svc.predict(X_test)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy : {accuracy_score(y_test, prediction)}')

Accuracy : 0.989010989010989


In [50]:
# Hold-out check for the decision tree. random_state is pinned because an
# unseeded tree can differ between runs, which would make the printed
# accuracy non-reproducible.
dtc = DecisionTreeClassifier(random_state=42)

dtc.fit(X_train, y_train)

prediction = dtc.predict(X_test)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy : {accuracy_score(y_test, prediction)}')

Accuracy : 0.9120879120879121


In [51]:
# Hold-out check for k-nearest neighbours: fit on the training split, then
# score on the validation split.
knn = KNeighborsClassifier()

knn.fit(X_train, y_train)

prediction = knn.predict(X_test)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy : {accuracy_score(y_test, prediction)}')

Accuracy : 0.989010989010989


In [52]:
# Hold-out check for the random forest. random_state is pinned because an
# unseeded forest can differ between runs, which would make the printed
# accuracy non-reproducible.
rfc = RandomForestClassifier(random_state=42)

rfc.fit(X_train, y_train)

prediction = rfc.predict(X_test)
# accuracy_score expects (y_true, y_pred), in that order.
print(f'Accuracy : {accuracy_score(y_test, prediction)}')

Accuracy : 0.967032967032967
