In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# load the training table from the CSV file
DATA_PATH = "data/npf_train.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# preview the first five rows of the freshly loaded table
df.head(n=5)

Unnamed: 0,id,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
0,1,2000-02-23,nonevent,False,380.52812,0.802001,380.371466,0.88955,381.816207,1.292593,...,-10.730843,1.381815,-10.282754,1.870056,8.356761,4.534937,0.178084,0.123402,0.002546,0.000686
1,2,2000-03-25,Ib,False,373.128684,1.096617,372.98,1.04775,373.70183,1.259198,...,-2.095641,1.695622,-1.095864,2.090111,12.906779,7.0223,0.333523,0.239981,0.000662,0.00021
2,3,2000-04-06,Ib,False,372.363293,0.626329,372.245689,0.615803,372.847246,0.647279,...,0.991521,1.914186,1.846503,1.954748,14.286261,9.572444,0.418313,0.344386,0.000541,7.2e-05
3,4,2000-04-11,nonevent,False,381.437442,7.281159,381.380405,7.236002,381.926532,7.294374,...,1.753414,0.340565,2.524931,0.414255,4.945162,3.405652,0.224159,0.192014,0.00371,0.001209
4,5,2000-04-23,II,False,375.42631,3.264246,375.436524,3.110886,375.740215,3.274924,...,10.940107,2.179821,11.441893,3.048699,13.087014,9.771415,0.525591,0.476821,0.00368,0.00216


In [6]:
# Drop the columns that are not used as features.
# DataFrame.drop(..., errors="ignore") returns a new frame and tolerates columns
# that are already gone, so this cell can be re-run safely; the previous
# `del df[...]` statements raised KeyError on a second execution.
df = df.drop(columns=["date", "id", "partlybad"], errors="ignore")
df.head()

Unnamed: 0,class4,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
0,nonevent,380.52812,0.802001,380.371466,0.88955,381.816207,1.292593,380.296466,0.968884,236.605353,...,-10.730843,1.381815,-10.282754,1.870056,8.356761,4.534937,0.178084,0.123402,0.002546,0.000686
1,Ib,373.128684,1.096617,372.98,1.04775,373.70183,1.259198,372.91,1.004164,252.480327,...,-2.095641,1.695622,-1.095864,2.090111,12.906779,7.0223,0.333523,0.239981,0.000662,0.00021
2,Ib,372.363293,0.626329,372.245689,0.615803,372.847246,0.647279,372.193952,0.596289,269.981547,...,0.991521,1.914186,1.846503,1.954748,14.286261,9.572444,0.418313,0.344386,0.000541,7.2e-05
3,nonevent,381.437442,7.281159,381.380405,7.236002,381.926532,7.294374,381.381156,7.208287,68.364653,...,1.753414,0.340565,2.524931,0.414255,4.945162,3.405652,0.224159,0.192014,0.00371,0.001209
4,II,375.42631,3.264246,375.436524,3.110886,375.740215,3.274924,375.337059,2.90378,242.192619,...,10.940107,2.179821,11.441893,3.048699,13.087014,9.771415,0.525591,0.476821,0.00368,0.00216


In [4]:
# summary of the target column: row count, number of distinct labels, most frequent label
df["class4"].describe()

count          430
unique           4
top       nonevent
freq           215
Name: class4, dtype: object

In [5]:
# list the distinct labels found in the target column
df["class4"].unique()

array(['nonevent', 'Ib', 'II', 'Ia'], dtype=object)

In [13]:
# select X and y variables

# features: every column except the target
X = df.drop(columns="class4")

# target: encode the class labels as integers through an explicit mapping.
# Series.map is used instead of Series.replace because replace relies on an
# implicit object -> int downcast that pandas 2.2 deprecates.
CLASS_CODES = {"nonevent": 0, "Ia": 1, "Ib": 2, "II": 3}
y = df["class4"].map(CLASS_CODES)

# map() yields NaN for any label missing from CLASS_CODES; fail loudly instead
# of passing an incompletely encoded target on to the classifiers
if y.isna().any():
    raise ValueError("class4 contains labels missing from CLASS_CODES")

y = y.astype("int64")
y.head()

0    0
1    2
2    2
3    0
4    3
Name: class4, dtype: int64

In [58]:
# Hold out half of the rows as the test set; the seed is fixed so the split is repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

In [59]:
# Standardize the features with StandardScaler - PCA is affected by scale.
# The scaler is fitted on the training split only and then applied to both splits.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [16]:
# apply PCA (exploratory fit, used only to inspect the components)

from sklearn.decomposition import PCA

# With n_components left unset, PCA keeps min(n_samples, n_features) components.
# The model is only fitted here: X_train / X_test are deliberately NOT overwritten,
# so they remain the scaled features. Overwriting them made the reduced PCA fitted
# further down operate on already-rotated data when the notebook is run top to
# bottom, and made this cell transform the data again on every re-run.
pca = PCA(random_state=None)
pca.fit(X_train)

# number of components
pca.n_components_

100

In [18]:
# share of the total variance explained by each principal component
pca.explained_variance_ratio_
# the first three components account for the largest shares of the variance

array([4.17961338e-01, 1.51169485e-01, 1.25822445e-01, 3.72130396e-02,
       3.22738816e-02, 2.62750379e-02, 2.51700328e-02, 2.18378722e-02,
       1.71222212e-02, 1.63132139e-02, 1.46531855e-02, 1.29622440e-02,
       1.23894777e-02, 1.06033160e-02, 9.13147867e-03, 8.27570967e-03,
       7.41455897e-03, 6.34612427e-03, 5.77803793e-03, 5.34477101e-03,
       5.09482021e-03, 4.24034155e-03, 3.61110627e-03, 3.19112761e-03,
       2.90917752e-03, 2.41078538e-03, 2.02238013e-03, 1.53586073e-03,
       1.45487617e-03, 1.34008403e-03, 1.25421861e-03, 1.05914705e-03,
       1.02496892e-03, 6.84969547e-04, 6.43907395e-04, 4.89038814e-04,
       4.38409833e-04, 3.86048160e-04, 2.91104575e-04, 2.34788507e-04,
       2.24292915e-04, 1.82089649e-04, 1.40694360e-04, 1.30544797e-04,
       1.12468042e-04, 9.53117385e-05, 8.91595792e-05, 8.45906752e-05,
       6.88045401e-05, 5.65891049e-05, 5.38991934e-05, 5.00808873e-05,
       4.12272037e-05, 3.58382258e-05, 3.12644534e-05, 2.53554625e-05,
      

In [60]:
from sklearn.decomposition import PCA

# keep only the leading principal components
N_PCA_COMPONENTS = 14
pca = PCA(
    n_components=N_PCA_COMPONENTS,
    svd_solver='auto',
    random_state=None,
)
# NOTE(review): X_train / X_test are overwritten with their projections, so
# re-running this cell applies PCA to already-projected data
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [61]:
# SVC classification

from sklearn.svm import SVC
from sklearn import metrics

# RBF-kernel support vector classifier, fitted on the training split
clf_svc = SVC(kernel='rbf', probability=True, random_state=0).fit(X_train, y_train)
y_pred_svc = clf_svc.predict(X_test)

# the number of components chosen for PCA does affect accuracy a little, for example:
# ac = 0.6651162790697674 for n_components = 3
# ac = 0.6604651162790698 for n_components = 50
ac = metrics.accuracy_score(y_test, y_pred_svc)
ac

0.6651162790697674

In [76]:
# Random forest classifier

from sklearn.ensemble import RandomForestClassifier

# depth-limited forest with a fixed seed, fitted on the training split
clf_rfc = RandomForestClassifier(random_state=0, max_depth=8).fit(X_train, y_train)
y_pred_rfc = clf_rfc.predict(X_test)

# accuracy on the held-out split
ac = metrics.accuracy_score(y_test, y_pred_rfc)
ac

0.6232558139534884

In [77]:
# Logistic regression

from sklearn.linear_model import LogisticRegression

# logistic regression with a fixed seed, fitted on the training split
clf_lr = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred_lr = clf_lr.predict(X_test)

# accuracy on the held-out split
ac = metrics.accuracy_score(y_test, y_pred_lr)
ac

0.6325581395348837