In [61]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
import re
import itertools

In [62]:
def readDocument(fileName):
    """Read a tab-separated data file line by line.

    For every line, the text before the first tab goes into the second
    returned list. Only when ``fileName`` is exactly ``'train.dat'`` is the
    field after the first tab also collected, into the first returned list.
    For any other file name the first list stays empty, and a line without
    tabs lands whole (trailing newline included) in the second list.

    Parameters
    ----------
    fileName : str
        Path of the file to read.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(X, y)`` as described above. Fields are not stripped, so the last
        field of a line keeps its newline character.
    """
    is_training_file = fileName == 'train.dat'
    second_fields, first_fields = [], []
    with open(fileName, 'r') as handle:
        for line in handle:
            fields = line.split('\t')
            first_fields.append(fields[0])
            if is_training_file:
                second_fields.append(fields[1])
    return second_fields, first_fields
# Training file: feature strings and their labels.
trainX, trainY = readDocument('train.dat')
# For any file other than 'train.dat', readDocument puts the line contents in
# the second element of its result, so the test rows are taken from index 1.
testX = readDocument('test.dat')[1]
# Sanity check: one count per line, in the order trainX, trainY, testX.
print(len(trainX), len(trainY), len(testX), sep='\n')

800
800
350


In [63]:
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
# Column budget for the sparse feature matrix: create_coo builds matrices with
# NUMBER_OF_PARAMETERS + 1 columns so that the integer ids parsed from the data
# can be used directly as column indices.
# NOTE(review): presumably the largest feature id in the data set — confirm.
NUMBER_OF_PARAMETERS = 1000000

def build_csc(lists):
    """Build a boolean CSC matrix from rows of whitespace-separated integer ids.

    Each input string becomes one matrix row; every integer found in the
    string marks a ``True`` entry in the column with that index.

    Parameters
    ----------
    lists : iterable of str
        One string per row, holding integer ids separated by whitespace.
        Leading/trailing whitespace, repeated separators and empty strings
        are tolerated (an empty string yields a row without entries).

    Returns
    -------
    scipy.sparse.csc_matrix
        CSC conversion of the COO matrix produced by ``create_coo``, with one
        row per input string.

    Raises
    ------
    ValueError
        If a token cannot be parsed as an integer.
    """
    param_lists = []
    row_lists = []
    value_lists = []
    num_rows = 0
    for row_index, line in enumerate(lists):
        # split() without an argument drops surrounding whitespace and
        # collapses runs of separators, so blank rows or double spaces do not
        # produce empty tokens that int() would reject.
        params = [int(token) for token in line.split()]
        param_lists.append(params)
        row_lists.append([row_index] * len(params))
        value_lists.append([True] * len(params))
        num_rows = row_index + 1
    coo = create_coo(param_lists, row_lists, value_lists, num_rows)
    return csc_matrix(coo)
def create_coo(param_lists, row_lists, value_lists, num_rows, num_cols=None):
    """Assemble a boolean COO sparse matrix from per-row coordinate lists.

    Parameters
    ----------
    param_lists : iterable of iterables of int
        Column indices, grouped per row.
    row_lists : iterable of iterables of int
        Row indices, grouped the same way as ``param_lists``.
    value_lists : iterable of iterables
        Entry values, grouped the same way; they are stored as booleans.
    num_rows : int
        Number of rows of the resulting matrix.
    num_cols : int, optional
        Number of columns of the resulting matrix. Defaults to
        ``NUMBER_OF_PARAMETERS + 1`` so ids up to ``NUMBER_OF_PARAMETERS``
        can be used directly as column indices.

    Returns
    -------
    scipy.sparse.coo_matrix
        Boolean matrix of shape ``(num_rows, num_cols)`` in coordinate format.
    """
    if num_cols is None:
        num_cols = NUMBER_OF_PARAMETERS + 1  # +1 so the largest id is a valid column

    # Flatten the per-row groups into the three parallel 1-D arrays COO needs.
    # Explicit dtypes keep the index arrays integral even when there are no
    # entries at all.
    flattened_params = np.fromiter(itertools.chain.from_iterable(param_lists), dtype=np.int64)
    flattened_rows = np.fromiter(itertools.chain.from_iterable(row_lists), dtype=np.int64)
    flattened_values = np.fromiter(itertools.chain.from_iterable(value_lists), dtype=bool)

    # The builtin `bool` is used because the `np.bool` alias is not available
    # in every NumPy release.
    sparse_coo = coo_matrix((flattened_values, (flattened_rows, flattened_params)),
                            shape=(num_rows, num_cols),
                            dtype=bool)

    return sparse_coo

In [64]:
# Build the sparse training matrix (CSC first), then convert it to CSR.
train_csc = build_csc(trainX)
train_csr = train_csc.tocsr()

In [65]:
# Scale every training row to unit L2 norm.
from sklearn.preprocessing import normalize
normTrain = normalize(train_csr, norm='l2', axis=1)

In [66]:
# Applying dimensionality reduction on train
from sklearn.decomposition import TruncatedSVD
# NOTE: despite its name, `pca` is a TruncatedSVD model; the name is kept
# because the test-set cell refers to it.
pca = TruncatedSVD(n_components=8, random_state=42)
# Fit on the sparse matrix directly: TruncatedSVD accepts scipy sparse input,
# so there is no need to materialise a dense rows x (NUMBER_OF_PARAMETERS + 1)
# array in memory. Values may differ from a dense fit at rounding level only.
XTrain = pca.fit_transform(normTrain)
print(XTrain)

[[ 0.16413083 -0.02458502 -0.02861892 ...,  0.06589946 -0.0123224
   0.02621486]
 [ 0.15794806 -0.04374366 -0.01324644 ..., -0.03844571 -0.01790843
   0.01515657]
 [ 0.15530871 -0.04291803 -0.02193003 ..., -0.00138607 -0.04488002
   0.05201919]
 ..., 
 [ 0.19181948  0.22907682 -0.05491128 ..., -0.04891582  0.17120991
   0.01849363]
 [ 0.15542383 -0.04145921 -0.01634521 ..., -0.02390336 -0.01516989
  -0.05902544]
 [ 0.1606224  -0.04787892 -0.0230894  ...,  0.00903995 -0.01697503
   0.03652064]]


In [67]:
# Build the sparse test matrix (CSC first), then convert it to CSR.
test_csc = build_csc(testX)
test_csr = test_csc.tocsr()

In [68]:
# Scale every test row to unit L2 norm.
from sklearn.preprocessing import normalize
normTest = normalize(test_csr, norm='l2', axis=1)

In [69]:
# Applying dimensionality reduction on test
# Project the test rows with the already-fitted model. The sparse matrix is
# passed as-is: transform() accepts scipy sparse input, so densifying a
# rows x (NUMBER_OF_PARAMETERS + 1) array is unnecessary.
XTest = pca.transform(normTest)

In [70]:
# classify test data
from sklearn.linear_model import Perceptron

# Train on the reduced training features. The labels are the strings read
# from the training file, so the predicted classes are strings as well.
clf = Perceptron()
clf = clf.fit(XTrain, trainY)

prediction = clf.predict(XTest)
print(len(prediction))
print(prediction)

350
['0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '1' '1' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '1' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0'
 '0' '1' '0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '1'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '1' '1'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0

In [71]:
# write the predictions in file
row_count = 0  # number of lines written
one_count = 0  # number of predictions equal to '1'

# The context manager guarantees the file is flushed and closed even if
# writing fails part-way through.
with open('data/out_trucatedSVD_Perceptron_8_42_re.dat', 'w') as output_file:
    for n in prediction:
        # Predictions are compared as strings; anything other than '1' is
        # written as '0'.
        if n == '1':
            one_count += 1
            output_file.write('1\n')
        else:
            output_file.write('0\n')
        row_count += 1

print(row_count)
print(one_count)

350
33


In [72]:
# Verify the file written: read it back and report how many lines it holds.
with open("data/out_trucatedSVD_Perceptron_8_42_re.dat", "r") as output:
    output_data_lines = list(output)
print(len(output_data_lines))


350
