In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time
import random
import os

from sklearn.metrics import accuracy_score

In [2]:
def seed_everything(seed=42):
    """"Seed everything.
    """   
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [3]:
train_set = pd.read_csv("data/train_set.txt",sep = ' ')
test_set = pd.read_csv("data/test_set.txt",sep = ' ')

In [4]:
X_train = pd.DataFrame(train_set).copy()
del X_train['rain']
del X_train['rain_class']
del X_train['rain_log']

X_test = pd.DataFrame(test_set).copy()
del X_test['rain']
del X_test['rain_class']
del X_test['rain_log']

In [5]:
train_set['rain_class'] = train_set['rain_class'].astype("category")
test_set['rain_class'] = test_set['rain_class'].astype("category")

In [6]:
Y_train = train_set['rain']
Y_test = test_set['rain']

In [7]:
Y_train_class = train_set['rain_class']
Y_test_class = test_set['rain_class']


## 4. Support Vector Machine

### 4.1. Linear SVM 

Optimisation of C - Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty

In [15]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
param=[{"C":[0.4,0.5,0.6,0.8,1,1.4]}]
svm= GridSearchCV(LinearSVC(),param,cv=10,n_jobs=-1,scoring = "accuracy")
svmLinOpt=svm.fit(X_train, Y_train_class)
print("Best Mean cross-validated accuracy = %f, Best parameter = %s" % (svmLinOpt.best_score_,svmLinOpt.best_params_))

Best Mean cross-validated accuracy = 0.530909, Best parameter = {'C': 0.4}


In [16]:
# Prediction of the test sample
y_hat_class = svmLinOpt.predict(X_test)
print("Accuracy score =", accuracy_score(y_true = Y_test_class, y_pred = y_hat_class))
# confusion matrix
table = pd.crosstab(y_hat_class, Y_test_class)
print(table)

Accuracy score = 0.572463768115942
rain_class  high_rain  low_rain  no_rain
row_0                                   
high_rain          24        11        9
low_rain            8        36       19
no_rain             1        11       19


### 4.2. SVM with polynomial kernels

By default, we take polynomial of degree 3

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param=[{"C":[0.1,0.4,0.5,0.6,0.8,1,1.2,1.4,1.6,2],"gamma":np.array(range(1,11))/100, "coef0":np.array(range(1,11))/10}]
svm= GridSearchCV(SVC(kernel="poly"),param,cv=10,n_jobs=-1,scoring = "accuracy")
svmPolyOpt=svm.fit(X_train, Y_train_class)
print("Best Mean cross-validated accuracy = %f, Best parameter = %s" % (svmPolyOpt.best_score_,svmPolyOpt.best_params_))

Best Mean cross-validated accuracy = 0.543636, Best parameter = {'C': 1, 'coef0': 0.9, 'gamma': 0.04}


In [25]:
# Prediction of the test sample
y_hat_class = svmPolyOpt.predict(X_test)
print("Accuracy score =", accuracy_score(y_true = Y_test_class, y_pred = y_hat_class))
# confusion matrix
table = pd.crosstab(y_hat_class, Y_test_class)
print(table)

Accuracy score = 0.5217391304347826
rain_class  high_rain  low_rain  no_rain
row_0                                   
high_rain          20        10        7
low_rain           13        38       26
no_rain             0        10       14


Let's test for degree 2

In [26]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param=[{"C":[0.1,0.4,0.5,0.6,0.8,1,1.2,1.4,1.6,2],"gamma":np.array(range(1,11))/100, "coef0":np.array(range(1,11))/10}]
svm= GridSearchCV(SVC(kernel="poly",degree =2),param,cv=10,n_jobs=-1,scoring = "accuracy")
svmPoly2Opt=svm.fit(X_train, Y_train_class)
print("Best Mean cross-validated accuracy = %f, Best parameter = %s" % (svmPoly2Opt.best_score_,svmPoly2Opt.best_params_))

Best Mean cross-validated accuracy = 0.545455, Best parameter = {'C': 1, 'coef0': 0.9, 'gamma': 0.08}


In [27]:
# Prediction of the test sample
y_hat_class = svmPoly2Opt.predict(X_test)
print("Accuracy score =", accuracy_score(y_true = Y_test_class, y_pred = y_hat_class))
# confusion matrix
table = pd.crosstab(y_hat_class, Y_test_class)
print(table)

Accuracy score = 0.5072463768115942
rain_class  high_rain  low_rain  no_rain
row_0                                   
high_rain          20        12        8
low_rain           12        34       23
no_rain             1        12       16


### 4.3. SVM with radial kernel

In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param=[{"C":[0.1,0.4,0.5,0.6,0.8,1,1.2,1.4,1.6,2],"gamma":np.array(range(1,11))/100, "coef0":np.array(range(1,11))/10}]
svm= GridSearchCV(SVC(kernel="rbf"),param,cv=10,n_jobs=-1,scoring = "accuracy")
svmRadOpt=svm.fit(X_train, Y_train_class)
print("Best Mean cross-validated accuracy = %f, Best parameter = %s" % (svmRadOpt.best_score_,svmRadOpt.best_params_))

Best Mean cross-validated accuracy = 0.567273, Best parameter = {'C': 1.2, 'coef0': 0.1, 'gamma': 0.1}


In [32]:
# Prediction of the test sample
y_hat_class = svmRadOpt.predict(X_test)
print("Accuracy score =", accuracy_score(y_true = Y_test_class, y_pred = y_hat_class))
# confusion matrix
table = pd.crosstab(y_hat_class, Y_test_class)
print(table)

Accuracy score = 0.5217391304347826
rain_class  high_rain  low_rain  no_rain
row_0                                   
high_rain          21         9       10
low_rain           11        40       26
no_rain             1         9       11


### 4.3. SVM with sigmoid kernel

In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param=[{"C":[0.1,0.4,0.5,0.6,0.8,1,1.2,1.4,1.6,2],"gamma":np.array(range(1,11))/100, "coef0":np.array(range(1,11))/10}]
svm= GridSearchCV(SVC(kernel="sigmoid"),param,cv=10,n_jobs=-1,scoring = "accuracy")
svmSigOpt=svm.fit(X_train, Y_train_class)
print("Best Mean cross-validated accuracy = %f, Best parameter = %s" % (svmSigOpt.best_score_,svmSigOpt.best_params_))

Best Mean cross-validated accuracy = 0.516364, Best parameter = {'C': 0.4, 'coef0': 0.1, 'gamma': 0.05}


In [35]:
# Prediction of the test sample
y_hat_class = svmSigOpt.predict(X_test)
print("Accuracy score =", accuracy_score(y_true = Y_test_class, y_pred = y_hat_class))
# confusion matrix
table = pd.crosstab(y_hat_class, Y_test_class)
print(table)

Accuracy score = 0.5217391304347826
rain_class  high_rain  low_rain  no_rain
row_0                                   
high_rain          26        12        8
low_rain            7        42       35
no_rain             0         4        4
