In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from feature_engine.selection import DropCorrelatedFeatures
from sklearn.model_selection import GridSearchCV

In [2]:
!pip install feature_engine



In [3]:
relative_path = os.path.join('gas_sensor_data.csv')
df = pd.read_csv(relative_path)

Split data into x and y for easier preprocessing

In [4]:
#Split dataframe into X, Y
y = pd.DataFrame(df['gas_label'])
y_columns = y.columns
x_temp = df.drop(['gas_label'], axis = 1)
x = x_temp.drop(['Batch ID'], axis = 1)
x_columns = x.columns

# Preprocessing
First, I converted x and y into numpy arrays since most of the preprocessing functions do not work with dataframes

In [5]:
#Smote class resampling
y_np = np.array(y)
y_np = y_np.ravel()
x_np = np.array(x)

Then handle missing values with Scikit's Simple Imputer. Missing values are replaced with mean.

In [6]:
#Handle Missing Values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
x_imputed = imp.fit_transform(x_np)

Then encode the Y values with a label encoder. Every class will take on an integer value. However, their order and ratio is meaningless.

In [7]:
#Label Encode Y Values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_np)

# Address Class Imbalance
Then use scikit's SMOTE to address class imbalance by adding synthetic datapoints. These synthetic datapoints should have the same mean so the number of class datapoints will change but the distribution's mean should not.

In [8]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x_np, y_encoded)

Train test split. 2/3 in train and 1/3 in test.

In [9]:
#Split
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=.33, random_state=42)

Standardize data via Z score. Leave y_labels intact since standardizing non-ordinal data is meaningless.

In [10]:
#Standardize
scalar = StandardScaler()
x_train_std = scalar.fit_transform(x_train)
x_test_std = scalar.fit_transform(x_test)

# Address Correlated Features
Address feature complexity and collinearity by dropping features that are highly correlated.
I empircally found that .7 threshold for correlation led to the greatest overall testing accuracy.

In [11]:
#Create correlation matrix
df_corr = pd.DataFrame(x_train_std, columns = x_columns)
df_test = pd.DataFrame(x_test_std, columns = x_columns)
drop_correlated = DropCorrelatedFeatures(threshold=0.7)

In [12]:
drop_correlated.fit(df_corr)
final_df = drop_correlated.transform(df_corr)
final_test = drop_correlated.transform(df_test)

In [13]:
x_train_model = np.array(final_df)
x_test_model = np.array(final_test)

# Train/Test SVM Models
Grid Search Parameter Exploration

In [14]:
param_grid = {
        'C': [0.01, 0.1, 1, 10, 100],
        'max_iter': [-1, 100, 1000, 5000],
        'class_weight': ['balanced', None],
}

SVM trained with results for grid search. Use ovr ("One Versus Rest") for multi-class classification. Gridsearch commented because I already know the results and it takes too much time.

In [15]:
"""
grid_search = GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=5)
grid_search.fit(x_train_std, y_train)
print("Best parameters:", grid_search.best_params_)

"""
#Linear SVM
linear_svm = svm.SVC(C=100, class_weight='balanced', kernel='linear', decision_function_shape='ovr')
linear_svm.fit(x_train_model, y_train)
y_pred = linear_svm.predict(x_test_model)

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.41      0.51       968
           1       0.67      0.93      0.78       995
           2       0.88      0.97      0.92       999
           3       0.78      0.80      0.79       976
           4       0.99      0.93      0.96      1021
           5       0.88      0.79      0.83       999

    accuracy                           0.81      5958
   macro avg       0.81      0.81      0.80      5958
weighted avg       0.81      0.81      0.80      5958



In [17]:
"""
param_grid_rbf = {
        'C': [0.1, 1, 10, 100],
        'gamma': [.001, .01, .1, 1, 10]
}

grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train_std, y_train)
print("Best parameters: ", grid_search.best_params_)
"""
#RBF SVM
nonlinear_svm = svm.SVC(kernel='rbf', C=100, class_weight='balanced')
nonlinear_svm.fit(x_train_model, y_train)

In [18]:
y_pred_rbf = nonlinear_svm.predict(x_test_model)

In [19]:
print(classification_report(y_test, y_pred_rbf))

              precision    recall  f1-score   support

           0       0.87      0.73      0.79       968
           1       0.74      0.99      0.85       995
           2       0.98      0.94      0.96       999
           3       0.97      0.88      0.92       976
           4       0.94      0.99      0.97      1021
           5       0.91      0.82      0.86       999

    accuracy                           0.89      5958
   macro avg       0.90      0.89      0.89      5958
weighted avg       0.90      0.89      0.89      5958

