In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix

def accuracy_score(y_true, y_pred):
    """Return the fraction of positions where ``y_true`` equals ``y_pred``.

    Both arguments may be any array-like (list, tuple, NumPy array, pandas
    Series). They are compared element by element, by position.

    Args:
        y_true: Reference labels.
        y_pred: Predicted labels, with the same shape as ``y_true``.

    Returns:
        The mean of the element-wise equality mask, a value in [0, 1].

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce to arrays first: comparing two plain lists with == yields a
    # single bool (no .mean()), and two Series would be aligned by index
    # label rather than by position.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        # Refuse to broadcast; a broadcasted comparison would silently
        # produce a meaningless score.
        raise ValueError(
            f"shape mismatch: y_true {true_arr.shape} vs y_pred {pred_arr.shape}"
        )
    return np.mean(true_arr == pred_arr)

# Data reading
def _read_tsv(path):
    """Load a headerless, tab-separated file into a DataFrame.

    Because ``header=None`` is passed, columns are labelled by position
    (0, 1, ...).
    """
    # The tab is written as the "\t" escape so the separator stays visible
    # and survives editors/formatters that convert tabs to spaces.
    return pd.read_csv(path, sep="\t", header=None)


data_train_samples = _read_tsv('data/train_samples.txt')
data_train_labels = _read_tsv('data/train_labels.txt')

data_validation_samples = _read_tsv('data/validation_samples.txt')
data_validation_labels = _read_tsv('data/validation_labels.txt')

# No label file is read for the test split.
data_test_samples = _read_tsv('data/test_samples.txt')

# In every file, column 0 is taken as the id and column 1 as the payload
# (text for sample files, label for label files).
train_ids, train_data = data_train_samples[0], data_train_samples[1]
train_labels = data_train_labels[1]

validation_ids, validation_data = (
    data_validation_samples[0],
    data_validation_samples[1],
)
validation_labels = data_validation_labels[1]

test_ids, test_data = data_test_samples[0], data_test_samples[1]

# Integer targets for the classifier.
# NOTE(review): samples and labels are paired purely by row order; this
# presumes both files list the ids in the same order -- confirm.
ytrain = train_labels.astype('int')
yvalidation = validation_labels.astype('int')


# Text preprocessing (bag-of-words vectorization)
# `encoding` is only consulted when a document arrives as bytes. 'str' is not
# a registered codec, so it would raise LookupError on the first bytes input;
# 'utf-8' is the library default and a valid codec.
cv = CountVectorizer(encoding='utf-8', strip_accents='unicode')

# The vocabulary is fitted on the training text only; validation and test
# text are mapped onto that same vocabulary.
xtrain = cv.fit_transform(train_data)
xvalidation = cv.transform(validation_data)
xtest = cv.transform(test_data)

# Convert the sparse count matrices to dense NumPy arrays.
# NOTE(review): dense storage costs rows x vocabulary-size cells per split;
# the estimator fitted later in this file may accept the sparse matrices
# directly -- confirm before dropping these conversions.
xtrain = xtrain.toarray()
xvalidation = xvalidation.toarray()
xtest = xtest.toarray()

# Show the training matrix followed by its row count and column count.
# Reading .shape (instead of len(xtrain[0])) also works when there are no rows.
print(xtrain)
n_train_rows, n_features = xtrain.shape
print(n_train_rows)
print(n_features)


# Model
# Sweep the L2 regularisation strength and keep the candidate that scores
# best on the validation split, instead of hard-coding the winning alpha and
# fitting that model a second time.
alphaList = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001]
best_alpha = None
best_accuracy = float('-inf')

print('--------------')
for alphaValue in alphaList:
    candidate = Perceptron(alpha=alphaValue, penalty='l2')
    candidate.fit(xtrain, ytrain)
    candidate_predicted = candidate.predict(xvalidation)
    # Arguments follow the (y_true, y_pred) order of accuracy_score.
    candidate_accuracy = accuracy_score(yvalidation, candidate_predicted)
    print(alphaValue, candidate_accuracy, sep=" ")
    # Strict '>' keeps the earliest alpha when several candidates tie.
    if candidate_accuracy > best_accuracy:
        best_alpha = alphaValue
        best_accuracy = candidate_accuracy
        # `pr` and `predicted` stay bound to the best model and its
        # validation predictions for the code that follows.
        pr = candidate
        predicted = candidate_predicted
print('--------------')
print(best_alpha, best_accuracy, sep=" ")

# Confusion matrix for the selected model (true labels first, predictions second).
cm = confusion_matrix(yvalidation, predicted)
print(cm)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
10000
22029
--------------
0.1 0.4014
0.01 0.4098
0.001 0.4642
0.0001 0.574
1e-05 0.536
1e-06 0.6398
--------------
1e-06 0.6398
[[1172  488  340]
 [ 333  945  222]
 [ 218  200 1082]]
