In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# read data
df = pd.read_csv("data/npf_train.csv")

In [3]:
df.head()

Unnamed: 0,id,date,class4,partlybad,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
0,1,2000-02-23,nonevent,False,380.52812,0.802001,380.371466,0.88955,381.816207,1.292593,...,-10.730843,1.381815,-10.282754,1.870056,8.356761,4.534937,0.178084,0.123402,0.002546,0.000686
1,2,2000-03-25,Ib,False,373.128684,1.096617,372.98,1.04775,373.70183,1.259198,...,-2.095641,1.695622,-1.095864,2.090111,12.906779,7.0223,0.333523,0.239981,0.000662,0.00021
2,3,2000-04-06,Ib,False,372.363293,0.626329,372.245689,0.615803,372.847246,0.647279,...,0.991521,1.914186,1.846503,1.954748,14.286261,9.572444,0.418313,0.344386,0.000541,7.2e-05
3,4,2000-04-11,nonevent,False,381.437442,7.281159,381.380405,7.236002,381.926532,7.294374,...,1.753414,0.340565,2.524931,0.414255,4.945162,3.405652,0.224159,0.192014,0.00371,0.001209
4,5,2000-04-23,II,False,375.42631,3.264246,375.436524,3.110886,375.740215,3.274924,...,10.940107,2.179821,11.441893,3.048699,13.087014,9.771415,0.525591,0.476821,0.00368,0.00216


In [6]:
# delete some usuless columns
del df["date"]
del df["id"]
del df["partlybad"]
df.head()

Unnamed: 0,class4,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
0,nonevent,380.52812,0.802001,380.371466,0.88955,381.816207,1.292593,380.296466,0.968884,236.605353,...,-10.730843,1.381815,-10.282754,1.870056,8.356761,4.534937,0.178084,0.123402,0.002546,0.000686
1,Ib,373.128684,1.096617,372.98,1.04775,373.70183,1.259198,372.91,1.004164,252.480327,...,-2.095641,1.695622,-1.095864,2.090111,12.906779,7.0223,0.333523,0.239981,0.000662,0.00021
2,Ib,372.363293,0.626329,372.245689,0.615803,372.847246,0.647279,372.193952,0.596289,269.981547,...,0.991521,1.914186,1.846503,1.954748,14.286261,9.572444,0.418313,0.344386,0.000541,7.2e-05
3,nonevent,381.437442,7.281159,381.380405,7.236002,381.926532,7.294374,381.381156,7.208287,68.364653,...,1.753414,0.340565,2.524931,0.414255,4.945162,3.405652,0.224159,0.192014,0.00371,0.001209
4,II,375.42631,3.264246,375.436524,3.110886,375.740215,3.274924,375.337059,2.90378,242.192619,...,10.940107,2.179821,11.441893,3.048699,13.087014,9.771415,0.525591,0.476821,0.00368,0.00216


In [4]:
df['class4'].describe()

count          430
unique           4
top       nonevent
freq           215
Name: class4, dtype: object

In [5]:
df.class4.unique()

array(['nonevent', 'Ib', 'II', 'Ia'], dtype=object)

In [169]:
# select X and y variables

X = df.loc[:, df.columns != 'class4']
y = df['class4']
y = y.replace(['nonevent', 'Ia', 'Ib', 'II'],[0, 1, 2, 3])

# Feature selection

## By hand

In [170]:
def hand(X):
    # indexes to be deleted
    delidx=[]
    # loop column names
    for i,col in enumerate(X.columns):
        # Drop all .std - columns 
        if col[-4:]==".std":
            delidx.append(i)

        # Drop different altitude columns
        if col[:3]=="CO2" and col!="CO2168.mean":
            delidx.append(i)
        if col[:3]=="H20" and col != "H20168.mean":
            delidx.append(i)        
        if col[:2] == "NO" and col != "NO168.mean" and col[:3] != "NOx":
            delidx.append(i)        
        if col[:3] == "NOx" and col != "NOx168.mean":
            delidx.append(i)
        if col[:2] == "O3" and col != "O3168.mean":
            delidx.append(i)
        if col[:6] == "RHIRGA" and col != "RHIRHGA168.mean":
            delidx.append(i)
        if col[:1] == "T" and col != "T168.mean":
            delidx.append(i)
    X = X.drop(X.columns[delidx], axis=1)

    assert(len(X.columns) == 23)
    
    return X

## PCA

In [171]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def pca(X):
    # standarsize the data - PCA is affected by scale
    scaler = StandardScaler()
    # fit on training only
    scaler.fit(X)
    X = scaler.transform(X)

    # Feature selection using PCA
    pca = PCA(n_components=23, svd_solver='auto', tol=0.0, iterated_power='auto', random_state=None)
    # fit only on training data
    pca.fit(X)
    # Apply dimensionality reduction to X
    X = pca.transform(X)
    #print("Number of features: {}".format(len(pca.components_)))
    #print('Variance ratio: \n', pca.explained_variance_ratio_)
    return X

# Model selection

In [172]:
# KFold cross-validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

def modeling(model, X):
    cv = KFold(n_splits=10, shuffle=True)
    model = model

    # evaluate model
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [184]:
print('feature selection: by hand')
x = hand(X)
modeling(LogisticRegression(),x)
modeling(SVC(),x)
modeling(RandomForestClassifier(),x)
modeling(DecisionTreeClassifier(),x)

print('\n')

print('feature selection: pca')
x = pca(X)
modeling(LogisticRegression(),x)
modeling(SVC(),x)
modeling(RandomForestClassifier(),x)
modeling(DecisionTreeClassifier(),x)

print('\n')

print('feature selection: both')
x = pca(hand(X))
modeling(LogisticRegression(),x)
modeling(SVC(),x)
modeling(RandomForestClassifier(),x)
modeling(DecisionTreeClassifier(),x)
print('\n')

#modeling(RandomForestClassifier(n_estimators=50, min_samples_leaf=5, criterion='gini'),x)

feature selection: by hand
Accuracy: 0.626 (0.077)
Accuracy: 0.558 (0.061)
Accuracy: 0.695 (0.061)
Accuracy: 0.637 (0.065)


feature selection: pca
Accuracy: 0.644 (0.055)
Accuracy: 0.667 (0.065)
Accuracy: 0.651 (0.076)
Accuracy: 0.519 (0.071)


feature selection: both
Accuracy: 0.679 (0.061)
Accuracy: 0.670 (0.067)
Accuracy: 0.688 (0.062)
Accuracy: 0.567 (0.057)




In [182]:
# Split to training and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, test_size=0.25, random_state=0)