In [1]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Read the CSV file named by file_path into a DataFrame
file_path = 'gas_sensor_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Data Preprocessing

The following code snippet is responsible for handling missing values in the dataset. First, it checks for any missing values in each column using the isnull().sum() method, which returns the count of missing values for each column. The results are printed to provide an overview of the missing data. To address the missing values, the code fills them with the mean of their respective columns using the fillna() method. The numeric_only=True parameter ensures that only numeric columns are considered when calculating the mean, which prevents errors on non-numeric columns; as a consequence, missing values in non-numeric columns (for example a text label column) are left unfilled. After this step the numeric columns of the dataset are free of missing values and ready for further analysis.

In [None]:
# Count the missing entries per column and report them
missing_values = data.isnull().sum()
print("Missing values in each column:", missing_values, sep="\n")

# Impute gaps with the per-column mean. Only numeric columns have a mean,
# so gaps in non-numeric columns are left as they are.
column_means = data.mean(numeric_only=True)
data_cleaned = data.fillna(value=column_means)

The following code snippet is responsible for analyzing and handling class imbalance in the dataset. Initially, it calculates the class distribution by counting the occurrences of each class label using the value_counts() method. The results are printed to provide an overview of the class distribution before handling the imbalance. To address the imbalance, the code identifies the majority class and the minority classes. It then resamples each minority class using the resample() function from sklearn.utils, with replacement, so that it has as many instances as the majority class. The resampled minority class data is appended to a list, which is then concatenated to form a balanced dataset. The balanced dataset is then shuffled so that the rows of each class are no longer grouped together, and its index is reset. Finally, the code prints the class distribution after handling the imbalance to verify the changes. This process ensures that the dataset is balanced and ready for further analysis or model training.

In [None]:
# Analyze class distribution
class_column = 'gas_label'
class_distribution = data_cleaned[class_column].value_counts()
print("Class distribution before handling imbalance:")
print(class_distribution)

# Handle class imbalance
# Separate majority and minority classes
majority_class = class_distribution.idxmax()
majority_size = int(class_distribution.max())
# Only classes strictly smaller than the largest one need upsampling.
minority_classes = class_distribution[class_distribution < majority_size].index

# Resample minority classes.
# Walk over *every* class rather than "the majority class + the minority
# classes": when several classes tie for the largest count, idxmax() names only
# one of them and the others are not minorities either, so they would otherwise
# be dropped from the balanced dataset altogether.
resampled_data = []
for class_label, class_size in class_distribution.items():
    class_rows = data_cleaned[data_cleaned[class_column] == class_label]
    if class_size < majority_size:
        class_rows = resample(class_rows,
                              replace=True,             # sample with replacement
                              n_samples=majority_size,  # to match majority class
                              random_state=42)          # reproducible results
    # Classes that already have majority_size rows are kept unchanged.
    resampled_data.append(class_rows)

# Combine resampled data
balanced_data = pd.concat(resampled_data)

# Shuffle the data so the rows are no longer grouped by class, then renumber them
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Analyze class distribution after handling imbalance
balanced_class_distribution = balanced_data[class_column].value_counts()
print("Class distribution after handling imbalance:")
print(balanced_class_distribution)

The following code snippet is responsible for splitting the dataset into features and the target variable, followed by splitting the data into training and testing sets, and standardizing the features. Initially, the dataset is divided into features (X) and the target variable (y) by dropping the target column from the dataset. The data is then split into training and testing sets using an 80/20 split with the train_test_split function from sklearn.model_selection, ensuring that the class distribution is maintained in both sets by using the stratify parameter. Finally, the features are standardized using the StandardScaler from sklearn.preprocessing. The scaler is fitted on the training data only and then applied to both the training and testing data. As a result, the training features have a mean of 0 and a standard deviation of 1, while the test features are transformed with the same training statistics (so they are only approximately standardized, and no information from the test set influences the preprocessing). Standardization is essential for many machine learning algorithms, including SVMs, to perform optimally.

In [5]:
# Split the dataset into features and target variable.
# The split starts from data_cleaned rather than balanced_data on purpose:
# balanced_data contains rows duplicated by oversampling, and splitting it
# would place copies of the same row in both the training and the test set,
# leaking test samples into training and inflating every score reported later.
# Rows without a label can be neither stratified nor scored, so they are dropped.
labelled_data = data_cleaned.dropna(subset=[class_column])
X = labelled_data.drop(columns=[class_column])
y = labelled_data[class_column]

# Split the data into training and testing sets (80/20 split)
# NOTE: stratification needs at least two rows per class; train_test_split
# raises ValueError otherwise.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Balance the training portion only: upsample every class that is smaller than
# the largest training class. The test set keeps its natural class distribution.
train_class_counts = y_train.value_counts()
largest_class_size = int(train_class_counts.max())
X_parts, y_parts = [], []
for class_label, class_size in train_class_counts.items():
    in_class = (y_train == class_label).to_numpy()  # positional mask, independent of index labels
    X_part, y_part = X_train[in_class], y_train[in_class]
    if class_size < largest_class_size:
        # Resampling features and labels in one call keeps them row-aligned.
        X_part, y_part = resample(X_part, y_part,
                                  replace=True,
                                  n_samples=largest_class_size,
                                  random_state=42)
    X_parts.append(X_part)
    y_parts.append(y_part)
X_train = pd.concat(X_parts, ignore_index=True)
y_train = pd.concat(y_parts, ignore_index=True)

# Shuffle features and labels with the same permutation so the rows are not grouped by class
shuffled_positions = X_train.sample(frac=1, random_state=42).index
X_train = X_train.loc[shuffled_positions].reset_index(drop=True)
y_train = y_train.loc[shuffled_positions].reset_index(drop=True)

# Standardize the features (statistics are learned from the training data only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

The following code snippet is responsible for generating a correlation matrix to identify relationships between features and selecting the most relevant features based on their correlation. Initially, it creates a correlation matrix for the standardized training features using the corr() method from pandas. The correlation matrix is printed to provide an overview of the relationships between the features. To select the most relevant features, the code identifies features with a high correlation (absolute value greater than 0.5) to the target variable. These relevant features are then extracted from both the training and testing datasets, resulting in new datasets (X_train_relevant and X_test_relevant) that contain only the selected features. This process helps in reducing the dimensionality of the data and retaining only the most informative features for further analysis or model training.

In [None]:
# Generate correlation matrix (feature-to-feature) for the standardized training data
X_train_frame = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_frame = pd.DataFrame(X_test_scaled, columns=X.columns)
correlation_matrix = X_train_frame.corr()
print("Correlation matrix:")
print(correlation_matrix)

# Select features with high correlation to the target variable.
# The feature-to-feature matrix above cannot be used for this: its diagonal is
# always 1, so a column-wise max() > 0.5 keeps every feature and never looks at
# the target. Instead, one-hot encode the class labels (they may be non-numeric)
# and score each feature by its strongest absolute correlation with any single
# class indicator.
# reset_index aligns the labels positionally with the rows of X_train_frame.
class_indicators = pd.get_dummies(pd.Series(y_train).reset_index(drop=True), dtype=float)
target_correlation = X_train_frame.apply(
    lambda feature: class_indicators.corrwith(feature).abs().max()
)
print("Strongest absolute correlation of each feature with a class indicator:")
print(target_correlation)

# Features whose score is NaN (e.g. constant columns) fail the comparison and are dropped.
relevant_features = target_correlation[target_correlation > 0.5].index
if relevant_features.empty:
    # An empty selection would leave the classifiers with no input columns.
    print("No feature exceeds the 0.5 correlation threshold; keeping all features.")
    relevant_features = X_train_frame.columns
X_train_relevant = X_train_frame[relevant_features]
X_test_relevant = X_test_frame[relevant_features]

# Training

The following code snippet is responsible for training SVM classifiers with both linear and RBF kernels, followed by hyperparameter tuning using GridSearchCV. Initially, it trains an SVM classifier with a linear kernel using the SVC class from sklearn.svm and fits it to the relevant training features (X_train_relevant) and target variable (y_train). Similarly, it trains an SVM classifier with an RBF kernel. To optimize the hyperparameters, the code uses GridSearchCV from sklearn.model_selection. For the linear kernel, it searches over a grid of C values ([0.1, 1, 10, 100]) to find the best regularization parameter. For the RBF kernel, it searches over a grid of C values ([0.1, 1, 10, 100]) and gamma values ([1, 0.1, 0.01, 0.001]) to find the best combination of regularization and kernel parameters. The GridSearchCV performs cross-validation (cv=5) to evaluate the performance of each parameter combination and selects the best model based on the cross-validation results. This process ensures that the SVM classifiers are well-tuned and optimized for the given dataset.

In [None]:
# Fit one SVM per kernel with scikit-learn's default hyperparameters
# (fit() returns the fitted estimator itself, so the calls can be chained)
svm_linear = SVC(kernel='linear').fit(X_train_relevant, y_train)
svm_rbf = SVC(kernel='rbf').fit(X_train_relevant, y_train)

# Linear kernel: 5-fold cross-validated search over the regularization strength C
param_grid_linear = dict(C=[0.1, 1, 10, 100])
grid_search_linear = GridSearchCV(
    estimator=SVC(kernel='linear'),
    param_grid=param_grid_linear,
    cv=5,
)
grid_search_linear.fit(X_train_relevant, y_train)

# RBF kernel: 5-fold cross-validated search over every (C, gamma) combination
param_grid_rbf = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
)
grid_search_rbf = GridSearchCV(
    estimator=SVC(kernel='rbf'),
    param_grid=param_grid_rbf,
    cv=5,
)
grid_search_rbf.fit(X_train_relevant, y_train)

# Evaluation

The following code snippet is responsible for evaluating the performance of SVM classifiers with both linear and RBF kernels. It first uses the best estimators from the hyperparameter tuning process to make predictions on the test set. The accuracy of each model is then calculated using the accuracy_score function from sklearn.metrics. To provide a detailed evaluation, the code generates confusion matrices for both models using the confusion_matrix function. Additionally, it generates classification reports using the classification_report function, which includes precision, recall, and F1-score for each class. The results are printed to provide a comprehensive evaluation of the models' performance, including accuracy, confusion matrices, and classification reports for both the linear and RBF kernel SVM classifiers. This detailed evaluation helps in understanding the strengths and weaknesses of each model and their effectiveness in classifying the gas types.

In [None]:
# Predict on the test features with the best estimator found by each grid search
best_linear_model = grid_search_linear.best_estimator_
best_rbf_model = grid_search_rbf.best_estimator_
y_pred_linear = best_linear_model.predict(X_test_relevant)
y_pred_rbf = best_rbf_model.predict(X_test_relevant)

# Metrics for the linear-kernel model
accuracy_linear = accuracy_score(y_test, y_pred_linear)
conf_matrix_linear = confusion_matrix(y_test, y_pred_linear)
class_report_linear = classification_report(y_test, y_pred_linear)

# Metrics for the RBF-kernel model
accuracy_rbf = accuracy_score(y_test, y_pred_rbf)
conf_matrix_rbf = confusion_matrix(y_test, y_pred_rbf)
class_report_rbf = classification_report(y_test, y_pred_rbf)

# Report: linear kernel first, then RBF
print("Accuracy for SVM with linear kernel:", accuracy_linear)
print("Confusion matrix for SVM with linear kernel:\n", conf_matrix_linear)
print("Classification report for SVM with linear kernel:\n", class_report_linear)

print("Accuracy for SVM with RBF kernel:", accuracy_rbf)
print("Confusion matrix for SVM with RBF kernel:\n", conf_matrix_rbf)
print("Classification report for SVM with RBF kernel:\n", class_report_rbf)

print("Training and evaluation of SVM classifiers completed.")