In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

# Data

In [2]:
X_test = np.loadtxt("X_test.csv", delimiter=",", dtype=float)
X_train = np.loadtxt("X_train.csv", delimiter=",", dtype=float)
y_test = np.loadtxt("y_test.csv", delimiter=",", dtype=float)
y_train = np.loadtxt("y_train.csv", delimiter=",", dtype=float)

In [3]:
for elm in [X_test, X_train, y_test, y_train]:
    print(elm.shape)

(13901, 16)
(33724, 16)
(13901,)
(33724,)


In [4]:
unique, frequency = np.unique(y_train, return_counts = True)
length = len(y_train)
freqLst = []
for elm in frequency:
    freqLst.append(elm/length)

print(list(unique))
print(freqLst)

[0.0, 1.0, 2.0, 3.0, 4.0]
[0.5208753410034397, 0.09551061558533981, 0.25272802751749496, 0.04693986478472305, 0.08394615110900248]


### Datasplit

In [5]:
print(np.shape(X_train))
x_train, x_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)
print(np.shape(x_train), np.shape(x_val), np.shape(y_train), np.shape(y_val))

(33724, 16)
(26979, 16) (6745, 16) (26979,) (6745,)


# Models

### Multi-nominal logistic regression.

In [6]:
warnings.filterwarnings('ignore') 
log_model_train = LogisticRegression(multi_class = 'multinomial') 
log_model_train.fit(x_train, y_train) 
y_pred_val = log_model_train.predict(x_val) 
y_pred_test = log_model_train.predict(X_test) 
warnings.filterwarnings('default')

In [7]:
print("Logistic Regression 0-1 loss:",metrics.zero_one_loss(y_val, y_pred_val))
print("Logistic Regression 0-1 loss:",metrics.zero_one_loss(y_test, y_pred_test))

Logistic Regression 0-1 loss: 0.1543365455893254
Logistic Regression 0-1 loss: 0.09884180994173086


### Random forest

In [8]:
forest_model_train_50 = RandomForestClassifier(max_depth = 50)
forest_model_train_100 = RandomForestClassifier(max_depth = 100)
forest_model_train_200 = RandomForestClassifier(max_depth = 200)
forest_model_train_50.fit(x_train, y_train) 
forest_model_train_100.fit(x_train, y_train) 
forest_model_train_200.fit(x_train, y_train) 
y_pred_val_50 = forest_model_train_50.predict(x_val) 
y_pred_val_100 = forest_model_train_100.predict(x_val) 
y_pred_val_200 = forest_model_train_200.predict(x_val) 
y_pred_test_50 = forest_model_train_50.predict(X_test) 
y_pred_test_100 = forest_model_train_100.predict(X_test) 
y_pred_test_200 = forest_model_train_200.predict(X_test) 

In [9]:
print("Forest: 50 trees on val   :",metrics.zero_one_loss(y_val, y_pred_val_50))
print("Forest: 100 trees on val  :",metrics.zero_one_loss(y_val, y_pred_val_100))
print("Forest: 200 trees on val  :",metrics.zero_one_loss(y_val, y_pred_val_200))
print("Forest: 50 trees on train :",metrics.zero_one_loss(y_test, y_pred_test_50))
print("Forest: 100 trees on train:",metrics.zero_one_loss(y_test, y_pred_test_100))
print("Forest: 200 trees on train:",metrics.zero_one_loss(y_test, y_pred_test_200))

Forest: 50 trees on val   : 0.1519644180874722
Forest: 100 trees on val  : 0.1512231282431431
Forest: 200 trees on val  : 0.14988880652335068
Forest: 50 trees on train : 0.11286957772822104
Forest: 100 trees on train: 0.11157470685562187
Forest: 200 trees on train: 0.11078339687792249


### k-nearest-neighbor classification

In [10]:
k_range = range(1, 31, 3)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())
print('Score by k :', k_scores[:5])

for i in range(len(k_scores)):
    if k_scores[i] == max(k_scores):
        print('Index :', i,'\nScore :', max(k_scores))

Score by k : [0.8060712600072412, 0.8354275126393697, 0.8437302337021004, 0.8478815942334658, 0.8485858058452662]
Index : 9 
Score : 0.8514028790076198


In [11]:
knn_model_val = KNeighborsClassifier(n_neighbors=9).fit(x_train, y_train) 
y_pred_val = log_model_train.predict(x_val)
y_pred_test = log_model_train.predict(X_test)

In [12]:
print("Logistic Regression 0-1 loss on validation:",metrics.zero_one_loss(y_val, y_pred_val))
print("Logistic Regression 0-1 loss on validation:",metrics.zero_one_loss(y_test, y_pred_test))

Logistic Regression 0-1 loss on validation: 0.1543365455893254
Logistic Regression 0-1 loss on validation: 0.09884180994173086
