In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from kernels import Kernel
import svm
import time
import copy


def accuracy(x, y):
    count = 0
    for i in range(len(x)):
        if x[i] == y[i]:
            count += 1
    return count/len(x)


train = pd.read_csv('/Users/grigoriipogorelov/Desktop/train.csv')
train['Date'] = pd.to_datetime(train['Dates'], errors='coerce')
train['Year'] = train['Date'].dt.year
train['Month'] = train['Date'].dt.month
train['WeekOfYear'] = train['Date'].dt.weekofyear
train['Day'] = train['Date'].dt.day
train['Time'] = train['Date'].dt.time
train['Hour'] = train['Date'].dt.hour
train = train.drop(['Dates','Date','Time','Resolution','Descript','Address','WeekOfYear'], axis=1)

cat_cols = ['DayOfWeek','PdDistrict', 'Year', 'Month', 'Day', 'Hour']
for cat in cat_cols:
    train = pd.get_dummies(train, columns=[cat])

train_len = len(train)
train_proportion = 0.02
train = train.sample(frac=1)
train = train.iloc[:int(len(train)*train_proportion)]
copy_ = copy.copy(train)
test = copy_.iloc[int(len(train)*0.8):]
train = train.iloc[:int(len(train)*0.8)]

categories = train.Category.unique()
category_len = len(train['Category'].unique())
samples = []
for i in range(category_len - 1):
    for j in range(i + 1, category_len):
        samples.append(categories[i] + '+' + categories[j])

train_sets = []
for sample in samples:
    cats = sample.split('+')
    train_0 = train.loc[train['Category'] == cats[0]]
    train_1 = train.loc[train['Category'] == cats[1]]
    frames = [train_0, train_1]
    train_sets.append(pd.concat(frames))

print(train.shape)

(14048, 100)


In [4]:
start = time.time()

models = []
cat_to_number_each_model = []
C = 0.1
for i, data in enumerate(train_sets):
    _d = {}
    options = data['Category'].unique()
    data = data.as_matrix()
    data[data == options[0]] = 1
    _d['1'] = options[0]
    data[data == options[1]] = -1
    _d['-1'] = options[1]
    data = np.array(data, dtype='float')
    trainer = svm.SVMTrainer(kernel=Kernel.linear(), c=C)
    model = trainer.train(data[:,1:], data[:,0])
    models.append(model)
    cat_to_number_each_model.append(_d)
    print()
    print('MODEL '+str(i)+' finished')
    print()

end = time.time()
print(int((end - start)/60), ' minutes for training')

     pcost       dcost       gap    pres   dres
 0: -8.5317e+02 -5.2060e+02  2e+04  3e+01  6e-09
 1: -1.2815e+02 -4.9656e+02  8e+02  8e-01  5e-09
 2: -1.0628e+02 -2.1126e+02  1e+02  2e-14  9e-10
 3: -1.1251e+02 -1.4037e+02  3e+01  2e-14  8e-10
 4: -1.1427e+02 -1.1849e+02  4e+00  6e-15  8e-10
 5: -1.1456e+02 -1.1488e+02  3e-01  3e-15  9e-10
 6: -1.1460e+02 -1.1463e+02  4e-02  7e-14  9e-10
 7: -1.1460e+02 -1.1460e+02  1e-03  1e-13  9e-10
 8: -1.1460e+02 -1.1460e+02  1e-05  6e-14  9e-10
Optimal solution found.

MODEL 0 finished

     pcost       dcost       gap    pres   dres
 0: -2.9203e+02 -3.7938e+02  1e+04  3e+01  1e-09
 1: -4.1881e+01 -3.5300e+02  7e+02  8e-01  1e-09
 2: -3.0382e+01 -1.0136e+02  7e+01  2e-15  2e-10
 3: -3.1533e+01 -4.5783e+01  1e+01  1e-14  2e-10
 4: -3.2070e+01 -3.4239e+01  2e+00  1e-14  2e-10
 5: -3.2191e+01 -3.2359e+01  2e-01  1e-14  2e-10
 6: -3.2198e+01 -3.2225e+01  3e-02  3e-15  2e-10
 7: -3.2200e+01 -3.2203e+01  3e-03  3e-14  2e-10
 8: -3.2200e+01 -3.2200e+01 

KeyboardInterrupt: 

In [33]:
labels = test['Category'].as_matrix()
test = test.as_matrix()
test = np.array(test[:, 1:], dtype='float')

start = time.time()

# should stick to 'samples' order
predictions_for_each_model = []
for model in models:
    pred = []
    for t in test:
        pred.append(model.predict(t))
    predictions_for_each_model.append(pred)

end = time.time()
print(int((end - start) / 60), ' minutes for predicting')


count_for_categories = {}
for cat in categories:
    count_for_categories[cat] = 0


combo_predictions = []
pred_len = len(predictions_for_each_model[0])
# for every prediction of a model
for i in range(pred_len):
    count_cat = copy.deepcopy(count_for_categories)
    # take a particular model
    for j, preds in enumerate(predictions_for_each_model):
        count_cat[cat_to_number_each_model[j][str(int(preds[i]))]] += 1
    winner = max(count_cat, key=count_cat.get)
    combo_predictions.append(winner)

acc = accuracy(labels, combo_predictions)
print('accuracy is ', acc)

2  minutes for predicting
accuracy is  0.04065040650406504
