# CLASSIFICATION

In [2]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import time


In [3]:
# Load the training and test samples; both files are space-delimited text and
# the first row is used as column names (read_csv default).
train_set = pd.read_csv(
    "data/train_set.txt",
    sep=" ",
)
test_set = pd.read_csv(
    "data/test_set.txt",
    sep=" ",
)

In [4]:
# Build the feature matrices by dropping every response-related column.
# `rain` and `rain_class` are used as targets in later cells; `rain_log` is
# presumably a log-transform of `rain` (TODO confirm), so it must not leak
# into the predictors either.
#
# DataFrame.drop returns a new frame and leaves train_set / test_set intact.
# Wrapping with pd.DataFrame(...) and deleting columns in place does not copy
# the data, so whether the source frames keep their target columns depends on
# the installed pandas version.
TARGET_COLUMNS = ['rain', 'rain_class', 'rain_log']

X_train = train_set.drop(columns=TARGET_COLUMNS)
X_test = test_set.drop(columns=TARGET_COLUMNS)

In [5]:
# Quantitative response: the `rain` column of each sample
# (presumably the regression target, given the `Yr_` prefix).
Yr_train, Yr_test = train_set["rain"], test_set["rain"]

In [6]:
# Categorical response used by every classifier below: the `rain_class` column.
Yb_train, Yb_test = train_set["rain_class"], test_set["rain_class"]


## 1. K nearest neighbors

The complexity parameter `k` (the number of neighbours) is optimised over a predefined grid by minimising the error estimated by cross-validation; scikit-learn offers many cross-validation options.

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Tune k (number of neighbours) over 1..14 by 10-fold cross-validation,
# running the candidate fits in parallel on all available cores.
param_grid = [{"n_neighbors": list(range(1, 15))}]
knn = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, n_jobs=-1)
# fit() returns the fitted search object itself; later cells use `knnOpt`.
knnOpt = knn.fit(X_train, Yb_train)

# best_score_ is the mean cross-validated score (accuracy is the default for a
# classifier), so what is printed here is the CV *error*, labelled as such.
print("Best CV error = %f, Best parameter = %s" % (1. - knnOpt.best_score_, knnOpt.best_params_))

Best score = 0.469091, Best parameter = {'n_neighbors': 12}


In [8]:
# Misclassification rate of the tuned k-NN model on the test sample:
# the complement of the value returned by score().
1 - knnOpt.score(X_test, Yb_test)

0.536231884057971

In [13]:
from sklearn.metrics import accuracy_score

# Predicted class of every test observation from the tuned k-NN model.
y_chap = knnOpt.predict(X_test)
print("Accuracy score =", accuracy_score(y_true=Yb_test, y_pred=y_chap))

# Confusion matrix: rows are the predicted classes, columns the observed ones.
table = pd.crosstab(index=y_chap, columns=Yb_test)
print("Confusion matrix")
print(table)

Accuracy score = 0.463768115942029
Confusion matrix
rain_class  high_rain  low_rain  no_rain
row_0                                   
high_rain          21        14       14
low_rain           12        33       23
no_rain             0        11       10


## 2. Decision tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Tune the maximum tree depth over 2..9 by 10-fold cross-validation.
# random_state is fixed because scikit-learn permutes the features at each
# split, so ties between equally good splits would otherwise make the fitted
# trees (and the selected depth) vary from run to run.
param = [{"max_depth": list(range(2, 10))}]
tree = GridSearchCV(DecisionTreeClassifier(random_state=0), param, cv=10, n_jobs=-1)
# fit() returns the fitted search object itself; later cells use `treeOpt`.
treeOpt = tree.fit(X_train, Yb_train)

# best_score_ is the mean cross-validated score (accuracy is the default for a
# classifier), so what is printed here is the CV *error*, labelled as such.
print("Best CV error = %f, Best parameter = %s" % (1. - treeOpt.best_score_, treeOpt.best_params_))

Best score = 0.500000, Best parameter = {'max_depth': 3}


In [11]:
# Misclassification rate of the tuned decision tree on the test sample:
# the complement of the value returned by score().
1 - treeOpt.score(X_test, Yb_test)

0.48550724637681164

In [12]:
# Predicted class of every test observation from the tuned decision tree.
y_chap = treeOpt.predict(X_test)

# Confusion matrix: rows are the predicted classes, columns the observed ones.
table = pd.crosstab(index=y_chap, columns=Yb_test)
print(table)

rain_class  high_rain  low_rain  no_rain
row_0                                   
high_rain          11         5        4
low_rain           19        46       29
no_rain             3         7       14


## 3. Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees with no depth limit, built on bootstrap samples
# so that an out-of-bag (OOB) accuracy estimate is available after fitting.
# - max_features='sqrt' replaces 'auto', which meant the same thing for
#   classifiers but was deprecated in scikit-learn 1.1 and removed in 1.3.
# - random_state is fixed so the forest and the reported errors are reproducible.
forest = RandomForestClassifier(n_estimators=500,
   criterion='gini', max_depth=None,
   min_samples_split=2, min_samples_leaf=1,
   max_features='sqrt', max_leaf_nodes=None,
   bootstrap=True, oob_score=True,
   random_state=0)
# Training; fit() returns the fitted estimator itself.
rfFit = forest.fit(X_train, Yb_train)
# Out-of-bag error estimated on the training sample
print(1 - rfFit.oob_score_)
# Prediction error on the test sample (plain test error, not an OOB estimate)
print(1 - rfFit.score(X_test, Yb_test))

0.46545454545454545
0.5072463768115942


Optimisation by cross validation of `max_features`

In [21]:
# Tune the number of features tried at each split (2..9) by 5-fold CV, with
# 100 trees per forest. random_state is fixed so that differences between
# candidates reflect max_features rather than run-to-run forest randomness.
# NOTE(review): integer max_features values must not exceed the number of
# columns in X_train - confirm it has at least 9 features.
param = [{"max_features": list(range(2, 10))}]
rf = GridSearchCV(RandomForestClassifier(n_estimators=100, random_state=0),
        param, cv=5, n_jobs=-1)
# fit() returns the fitted search object itself.
rfOpt = rf.fit(X_train, Yb_train)

# best_score_ is the mean cross-validated score (accuracy is the default for a
# classifier), so what is printed here is the CV *error*, labelled as such.
print("Best CV error = %f, Best parameter = %s" % (1. - rfOpt.best_score_, rfOpt.best_params_))

Best score = 0.463636, Best parameter = {'max_features': 2}
