## Load data

In [1]:
import pandas as pd
import glob, os
pathdir = "./files_batch_1,2/"
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# sample order (and everything derived from it) identical on every run.
files = sorted(glob.glob(os.path.join(pathdir, "*/*")))

# Paths containing the marker below form the training set; every other path is
# used for testing.
train = [path for path in files if 'files_bfusmalljar_' in path]
test = [path for path in files if 'files_bfusmalljar_' not in path]
print('There are {} and {} files for training and testing, respectively.'.format(len(train), len(test)))

There are 30 and 30 files for training and testing, respectively.


## Extract the exposure part of each sample
#### P.S., In batch2, the 'water' folder is considered as 'humidity' to be consistent with batch1.

In [2]:
import re
# Use the first training file to discover the column names.
read_df_sample = pd.read_csv(train[0])
# Keep columns whose name starts with "s" followed by a digit. The pattern is a
# raw string so that \A and \d reach the regex engine without triggering
# Python's invalid-escape-sequence warning.
features = [column for column in read_df_sample if re.search(r"\As\d", column)]
features.append('humidity')
features.append('label')

def load_data(data, columns=None):
    """Read CSV files and keep the exposure rows and selected columns of each.

    Parameters
    ----------
    data : iterable of str
        Paths of the CSV files to read.
    columns : list of str, optional
        Columns to keep, in this order. Defaults to the notebook-level
        ``features`` list.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file, in input order, restricted to rows whose
        ``trial_state`` is ``'exposure'``.

    Raises
    ------
    KeyError
        If a file lacks ``trial_state`` or one of the requested columns.
    """
    if columns is None:
        columns = features

    all_batches = []
    for path in data:
        batch = pd.read_csv(path)

        batch = batch.loc[batch['trial_state'] == 'exposure']

        # Files without a label column are labelled 'humidity'. An explicit
        # column check replaces the former bare ``except:``, which would also
        # have hidden unrelated errors.
        if 'label' not in batch.columns:
            batch = batch.assign(label='humidity')

        batch = batch[columns] # Thanks Pegah :)
        all_batches.append(batch)
    return all_batches

# NOTE: `train` and `test` are rebound here from lists of file paths to combined
# DataFrames, so this cell cannot be re-run without first re-running the cell
# that builds the path lists.
# ignore_index=True gives the combined frames a fresh 0..n-1 index instead of
# repeating each file's own row numbers.
train = pd.concat(load_data(train), ignore_index=True)
test = pd.concat(load_data(test), ignore_index=True)
print('Train and test data are loaded and combined with length {} and {}, respectively, i.e., (30 * 90 rows as the exposure timestamp for each sample).'.format(len(train), len(test)))

Train and test data are loaded and combined with length 2700 and 2700, respectively, i.e., (30 * 90 rows as the exposure timestamp for each sample).


## Encoding labels

In [3]:
# Per-class row counts (missing labels included) for the train and test frames.
label_counts = [frame['label'].value_counts(dropna=False) for frame in (train, test)]
print(label_counts[0], '\n\n', label_counts[1])

fresh       900
humidity    900
moldy       900
Name: label, dtype: int64 

 fresh       990
moldy       900
humidity    810
Name: label, dtype: int64


In [4]:
# Map every label name to an integer code. The names are sorted first so the
# codes do not depend on the order in which the files happened to be read.
labels = sorted(train['label'].unique().tolist())
labs = {name: code for code, name in enumerate(labels)}
labs

{'fresh': 0, 'humidity': 1, 'moldy': 2}

In [5]:
def label_encoder(data):
    """Swap each label name in ``data['label']`` for its code from ``labs``.

    The frame is modified in place and the same object is returned. Values
    that are not keys of ``labs`` are left as they are.
    """
    for name, code in labs.items():
        matches = data['label'] == name
        data.loc[matches, 'label'] = code
    return data

# Encode the label column of each split with the same name-to-code mapping.
train = label_encoder(train)
test = label_encoder(test)

## Feature Normalization

In [6]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Targets: the encoded label column of each split.
label_train = np.asarray(train['label'])
label_test = np.asarray(test['label'])

# Inputs: every remaining column of each split.
features_train = np.asarray(train.drop(columns='label'))
features_test = np.asarray(test.drop(columns='label'))

# Min-max scaling fitted on the training rows only and then applied to both
# splits (StandardScaler was the alternative considered here).
norm = MinMaxScaler()
norm.fit(features_train)

features_train = norm.transform(features_train)
features_test = norm.transform(features_test)

In [7]:
# Number of complete 90-row groups in the training feature rows; the loading
# step reports 90 exposure rows per sample.
num_samples = len(features_train)//90
def feat_ext(data, label, rows_per_sample=90):
    """Reduce each block of consecutive rows to one max-pooled feature vector.

    The number of samples is derived from ``data`` itself, so the function
    gives correct results for arrays of any length (e.g. a test set whose size
    differs from the training set).

    Parameters
    ----------
    data : numpy.ndarray, shape (n_rows, n_features)
        Row-wise measurements; rows of one sample must be contiguous.
    label : sequence of length n_rows
        Per-row labels aligned with ``data``.
    rows_per_sample : int, default 90
        Number of consecutive rows that make up one sample.

    Returns
    -------
    all_data : list of numpy.ndarray
        Column-wise maximum of every complete sample.
    labels : list
        Label of the first row of every complete sample. Rows of one sample
        are expected to share a single label.

    Raises
    ------
    ValueError
        If ``rows_per_sample`` is not positive.
    """
    if rows_per_sample <= 0:
        raise ValueError('rows_per_sample must be a positive integer')

    labels = []
    all_data = []
    # Trailing rows that do not fill a whole sample are ignored.
    for sn in range(len(data) // rows_per_sample):
        start = sn * rows_per_sample
        tmp_sample = data[start:start + rows_per_sample, :]
        all_data.append(tmp_sample.max(axis=0))
        # Read the label inside the current window; an offset of +1 would step
        # into the next sample when rows_per_sample is 1.
        labels.append(label[start])
    return all_data, labels

# Collapse each split to one feature vector and one label per sample. The names
# are rebound from per-row arrays to per-sample lists, so this cell is not
# meant to be run twice in a row.
features_train, label_train = feat_ext(features_train, label_train)
features_test, label_test = feat_ext(features_test, label_test)

In [8]:
from sklearn.utils import shuffle
def shufflee(data, labels, random_state=0):
    """Shuffle samples and their labels together using one shared permutation.

    Parameters
    ----------
    data : sequence of array-like, shape (n_samples, n_features)
        Feature vectors, one per sample.
    labels : sequence of length n_samples
        Label of each sample, aligned with ``data``.
    random_state : int or None, default 0
        Seed of the permutation. A fixed seed makes the notebook reproducible
        across runs; pass ``None`` for a different order on every call.

    Returns
    -------
    featuress : numpy.ndarray
        The rows of ``data`` in shuffled order.
    labelss : numpy.ndarray
        The labels in the same shuffled order.

    Raises
    ------
    ValueError
        If ``data`` and ``labels`` have different lengths.
    """
    featuress = np.asarray(data)
    labelss = np.asarray(labels)
    if len(featuress) != len(labelss):
        raise ValueError('data and labels must have the same length')

    # A single index permutation applied to both arrays keeps every feature
    # row paired with its own label.
    order = np.random.RandomState(random_state).permutation(len(featuress))
    return featuress[order], labelss[order]

# Shuffle each split on its own; within a split, features and labels are
# reordered together so they stay paired.
features_train, label_train = shufflee(features_train, label_train)
features_test, label_test   = shufflee(features_test, label_test)

## Classification based on normalized features

In [9]:
from sklearn.svm import LinearSVC
from sklearn import metrics

# Linear SVM with an L2 penalty and a fixed seed, trained on the training split.
LSVM = LinearSVC(penalty='l2', random_state=0)
LSVM.fit(features_train, label_train)

# Score the held-out split.
predicted_LSVM = LSVM.predict(features_test)
accuracy_LSVM = metrics.accuracy_score(label_test, predicted_LSVM)

print('The accuracy score: {:.3f}'.format(accuracy_LSVM))

The accuracy score: 0.333


In [10]:
from sklearn.neighbors import KNeighborsClassifier 

# Two-nearest-neighbour classifier trained on the training split.
KNN = KNeighborsClassifier(n_neighbors=2)
KNN.fit(features_train, label_train)

# Score the held-out split.
predicted_KNN = KNN.predict(features_test)
accuracy_KNN = metrics.accuracy_score(label_test, predicted_KNN)

print('The accuracy score: {:.3f}'.format(accuracy_KNN))

The accuracy score: 0.367


## Picking the model for deployment

In [11]:
import pickle
# Write the fitted KNN model to disk. The context manager closes (and flushes)
# the file even if pickling fails, instead of leaving the handle open.
with open('knnModel.pkl', 'wb') as model_file:
    pickle.dump(KNN, model_file)