In [27]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 1. Summarize the data

In [28]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# summary of the train data
print(train_df[['feature_1', 'feature_2', 'feature_3', 'feature_4']].describe())
# summary of the test data
print(test_df[['feature_1', 'feature_2', 'feature_3', 'feature_4']].describe())

         feature_1    feature_2    feature_3    feature_4
count  6000.000000  6000.000000  6000.000000  6000.000000
mean      7.990044     5.175071     9.982420    31.074333
std       8.002022     3.770585    10.200213    14.588738
min       0.000556     0.034444     0.502000     1.000000
25%       1.508611     1.531111     3.352750    16.000000
50%       5.027917     4.742500     6.563500    37.000000
75%      10.893819     8.357222    13.073750    43.000000
max      23.999444    23.838333    77.700000    52.000000
        feature_1   feature_2   feature_3   feature_4
count  800.000000  800.000000  800.000000  800.000000
mean    10.133354    4.858312    9.544297   29.768750
std      8.999654    3.644326   10.092338    8.211246
min      0.004722    0.111944    0.526000   12.000000
25%      2.022569    1.603819    2.379750   23.000000
50%      7.044444    4.086944    6.087500   28.000000
75%     22.153333    7.907361   12.688000   36.000000
max     23.999722   23.621944   57.748000   44

## 2. kick out the outliers that is not in the test set

In [29]:
train_df = train_df[train_df['feature_4'].between(15, 40)]
# train_df = train_df[train_df['feature_3'].between(0.5, 58)]
train_df = train_df[train_df['feature_2'].between(0.1, 23)]

train_df.describe()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,label,example_id
count,2410.0,2410.0,2410.0,2410.0,2410.0,2410.0
mean,8.166595,5.521854,10.091701,30.687552,0.723651,2996.648548
std,8.361535,3.678301,10.136648,8.592227,0.447284,1741.02546
min,0.000556,0.102778,0.521,15.0,0.0,0.0
25%,1.323125,1.959722,3.66,21.0,0.0,1503.5
50%,4.875833,5.449028,6.5755,34.0,1.0,2987.5
75%,11.376458,8.59375,13.33,38.0,1.0,4523.75
max,23.997778,22.632778,77.7,40.0,1.0,5996.0


## 3. Normalize both train and test data

In [30]:
# train_df['feature_1'] = (train_df['feature_1']-train_df['feature_1'].min())/train_df['feature_1'].max()
# train_df['feature_2'] = (train_df['feature_2']-train_df['feature_2'].min())/train_df['feature_2'].max()
# train_df['feature_3'] = (train_df['feature_3']-train_df['feature_3'].min())/train_df['feature_3'].max()
# train_df['feature_4'] = (train_df['feature_4']-train_df['feature_4'].min())/train_df['feature_4'].max()
train_df['feature_1'] = train_df['feature_1']/train_df['feature_1'].max()
train_df['feature_2'] = train_df['feature_2']/train_df['feature_2'].max()
train_df['feature_3'] = train_df['feature_3']/train_df['feature_3'].max()
train_df['feature_4'] = train_df['feature_4']/train_df['feature_4'].max()

# test_df['feature_1'] = (test_df['feature_1']-test_df['feature_1'].min())/test_df['feature_1'].max()
# test_df['feature_2'] = (test_df['feature_2']-test_df['feature_2'].min())/test_df['feature_2'].max()
# test_df['feature_3'] = (test_df['feature_3']-test_df['feature_3'].min())/test_df['feature_3'].max()
# test_df['feature_4'] = (test_df['feature_4']-test_df['feature_4'].min())/test_df['feature_4'].max()
test_df['feature_1'] = test_df['feature_1']/test_df['feature_1'].max()
test_df['feature_2'] = test_df['feature_2']/test_df['feature_2'].max()
test_df['feature_3'] = test_df['feature_3']/test_df['feature_3'].max()
test_df['feature_4'] = test_df['feature_4']/test_df['feature_4'].max()

# to numpy array
train_data = train_df.values
test_data = test_df.values

## 4. train validation split

In [31]:
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=1)
X_train = train_data[:, :4]
y_train = train_data[:, 4]
X_val = val_data[:, :4]
y_val = val_data[:, 4]

## Support Vector Machine

In [32]:
model_1 = svm.SVC(kernel='linear', probability=True)
model_1.fit(X_train, y_train)
yhat = model_1.predict(X_val)
acc = accuracy_score(y_val, yhat)
print('Accuracy: %.3f' % acc)

Accuracy: 0.975


In [33]:
# now use the trained model to predict the test data
X_test = test_data[:, :4]
yhat = model_1.predict(X_test)

# save the result
result = pd.DataFrame({'example_id': test_data[:, 4].astype(int), 'prediction': yhat.astype(int)})
result.to_csv('SVM.csv', index=False)

## Now train multiple models and ensemble them

In [34]:
def train_svm(train_data, tol=1e-3, C=1.0, max_iter=1000, kernel='linear', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, random_state=None):
    train_data, val_data = train_test_split(train_data, test_size=0.2)
    X_train = train_data[:, :4];    y_train = train_data[:, 4]
    X_val = val_data[:, :4];        y_val = val_data[:, 4]
    model = svm.SVC(kernel=kernel, tol=tol, C=C, max_iter=max_iter, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, probability=probability, random_state=random_state)
    model.fit(X_train, y_train)
    yhat = model.predict(X_val)
    acc = accuracy_score(y_val, yhat)
    print('Accuracy: %.3f' % acc)
    return model

# train KNN

def train_knn(train_data, n_neighbors, weights):
    train_data, val_data = train_test_split(train_data, test_size=0.2)
    X_train = train_data[:, :4];    y_train = train_data[:, 4]
    X_val = val_data[:, :4];        y_val = val_data[:, 4]
    model = KNeighborsClassifier(n_neighbors, weights=weights)
    model.fit(X_train,y_train)
    yhat = model.predict(X_val)
    acc = accuracy_score(y_val, yhat)
    print('Accuracy: %.3f' % acc)
    return model

In [35]:
model_list = []

for tol in [1e-2, 1e-3, 1e-4]:
    for c in [0.5, 0.6, 0.8, 1]:
        model_list.append(train_svm(train_data, tol=tol, C=c, kernel='linear'))
        model_list.append(train_svm(train_data, tol=tol, C=c, kernel='poly'))
        model_list.append(train_svm(train_data, tol=tol, C=c, kernel='poly', degree=2))
        model_list.append(train_svm(train_data, tol=tol, C=c, kernel='rbf'))
        model_list.append(train_svm(train_data, tol=tol, C=c, kernel='sigmoid'))

# for n in [4,5,6]:
#     for weights in ['uniform', 'distance']:
#         model_list.append(train_knn(train_data, n, weights))


# model_2 = train_svm(train_data, tol=1e-3, C=1.0, max_iter=1000, kernel='linear', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, random_state=None)
# model_3 = train_svm(train_data, tol=1e-3, C=1.0, max_iter=1000, kernel='poly', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, random_state=None)
# model_4 = train_svm(train_data, tol=1e-3, C=1.0, max_iter=1000, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, random_state=None)
# model_5 = train_svm(train_data, tol=1e-3, C=1.0, max_iter=1000, kernel='sigmoid', degree=3, gamma='auto', coef0=0.0, shrinking=True, probability=False, random_state=None)

# model_list = [model_1, model_2, model_3, model_4, model_5]


Accuracy: 0.990
Accuracy: 0.984
Accuracy: 0.964
Accuracy: 0.979
Accuracy: 0.974
Accuracy: 0.982
Accuracy: 0.961
Accuracy: 0.966
Accuracy: 0.990
Accuracy: 0.979
Accuracy: 0.979
Accuracy: 0.959
Accuracy: 0.977
Accuracy: 0.977
Accuracy: 0.979
Accuracy: 0.990
Accuracy: 0.982
Accuracy: 0.979
Accuracy: 0.977
Accuracy: 0.984
Accuracy: 0.984
Accuracy: 0.987
Accuracy: 0.966
Accuracy: 0.990
Accuracy: 0.990
Accuracy: 0.979
Accuracy: 0.966
Accuracy: 0.959
Accuracy: 0.990
Accuracy: 0.982
Accuracy: 0.982
Accuracy: 0.961
Accuracy: 0.977
Accuracy: 0.974
Accuracy: 0.982
Accuracy: 0.979
Accuracy: 0.961
Accuracy: 0.964
Accuracy: 0.977
Accuracy: 0.987
Accuracy: 0.977
Accuracy: 0.992
Accuracy: 0.979
Accuracy: 0.990
Accuracy: 0.977
Accuracy: 0.982
Accuracy: 0.969
Accuracy: 0.966
Accuracy: 0.982
Accuracy: 0.984
Accuracy: 0.979
Accuracy: 0.969
Accuracy: 0.966
Accuracy: 0.969
Accuracy: 0.966
Accuracy: 0.990
Accuracy: 0.977
Accuracy: 0.972
Accuracy: 0.984
Accuracy: 0.977


## Ensemble!

In [36]:
X_test = test_data[:, :4]
y_list = []
for model in model_list:
    yhat = model.predict(X_test)
    y_list.append(yhat)

y_list = np.array(y_list)
y_list = y_list.T
y_list = y_list.tolist()

yhat = []
for y in y_list:
    # vote
    yhat.append( int(max(set(y), key=y.count)) )

# save the result
result = pd.DataFrame({'example_id': test_data[:, 4].astype(int), 'prediction': yhat})
result.to_csv('SVM_ensemble2.csv', index=False)
print('Done!')

Done!
