In [1]:
import pandas as pd

In [2]:
# Load the dataset from 'eeg-headset.csv' into a DataFrame and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='eeg-headset.csv')
df.head(n=5)

Unnamed: 0,AF3,F7,F3,FC5,T7,P,O1,O2,P8,T8,FC6,F4,F8,AF4,eye_state
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,1
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,1
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,1
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,1
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,1


In [3]:
# Summarize the frame: column dtypes and non-null counts, then descriptive
# statistics for the columns.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly -- wrapping it in print() adds a stray "None" line.
df.info()
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14980 entries, 0 to 14979
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AF3        14980 non-null  float64
 1   F7         14980 non-null  float64
 2   F3         14980 non-null  float64
 3   FC5        14980 non-null  float64
 4   T7         14980 non-null  float64
 5   P          14980 non-null  float64
 6   O1         14980 non-null  float64
 7   O2         14980 non-null  float64
 8   P8         14980 non-null  float64
 9   T8         14980 non-null  float64
 10  FC6        14980 non-null  float64
 11  F4         14980 non-null  float64
 12  F8         14980 non-null  float64
 13  AF4        14980 non-null  float64
 14  eye_state  14980 non-null  int64  
dtypes: float64(14), int64(1)
memory usage: 1.7 MB
None
                 AF3            F7            F3            FC5            T7  \
count   14980.000000  14980.000000  14980.000000   14980.000000  14980.000

In [4]:
# Data preprocessing: duplicate check followed by the train/test split.
from sklearn.model_selection import train_test_split

# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_rows = df.loc[df.duplicated()]
print(f"Number of duplicate rows: {len(duplicate_rows)}")

# Dataset dimensions as (rows, columns).
print(df.shape)

# The label is the 'eye_state' column; every other column is a feature.
y = df['eye_state']
X = df.drop(columns='eye_state')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


Number of duplicate rows: 0
(14980, 15)


In [5]:
# Standardize the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Uncomment to inspect the first rows of the scaled training data:
# print(X_train_scaled[:10])

In [6]:
# Model training: random forest classifier fitted on the scaled training features.
from sklearn.ensemble import RandomForestClassifier

# 100 trees; the fixed seed makes the fitted forest reproducible.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
rf_classifier.fit(X_train_scaled, y_train)


In [7]:
# Predict the test-set labels with the random forest and evaluate them.
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = rf_classifier.predict(X_test_scaled)

# Each metric is printed under its own heading, in the order listed here.
for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
    ("\nAccuracy Score:", accuracy_score),
):
    print(heading)
    print(metric(y_test, y_pred))

Confusion Matrix:
[[2285  101]
 [ 244 1864]]

Classification Report:
              precision    recall  f1-score   support

           1       0.90      0.96      0.93      2386
           2       0.95      0.88      0.92      2108

    accuracy                           0.92      4494
   macro avg       0.93      0.92      0.92      4494
weighted avg       0.92      0.92      0.92      4494


Accuracy Score:
0.9232309746328438


In [8]:
# Overfitting check for the random forest: compare accuracy on the data the
# model was fitted on against accuracy on the held-out test split.
y_train_pred_rf = rf_classifier.predict(X_train_scaled)
test_acc_rf = accuracy_score(y_test, y_pred)
train_acc_rf = accuracy_score(y_train, y_train_pred_rf)

print(f"Random Forest Training Accuracy: {train_acc_rf:.4f}")
print(f"Random Forest Test Accuracy: {test_acc_rf:.4f}")

# A train/test accuracy gap above 0.05 is reported as overfitting.
print(
    "Random Forest shows signs of overfitting."
    if train_acc_rf - test_acc_rf > 0.05
    else "Random Forest does not show significant overfitting."
)

Random Forest Training Accuracy: 1.0000
Random Forest Test Accuracy: 0.9232
Random Forest shows signs of overfitting.


In [9]:
# Model training: XGBoost classifier on the scaled training features.
from xgboost import XGBClassifier

# The target uses labels 1 and 2; they are remapped to 0 and 1 because XGBoost
# expects binary classification labels to be 0 and 1.
label_remap = {1: 0, 2: 1}
y_train_xgb = y_train.replace(label_remap)
y_test_xgb = y_test.replace(label_remap)

xgb_classifier = XGBClassifier(
    n_estimators=100,
    random_state=42,
)
xgb_classifier.fit(X_train_scaled, y_train_xgb)


In [10]:
# Overfitting check for XGBoost: compare accuracy on the training split against
# accuracy on the held-out test split (both scored against the 0/1 labels).
y_test_pred_xgb = xgb_classifier.predict(X_test_scaled)
y_train_pred_xgb = xgb_classifier.predict(X_train_scaled)
test_acc_xgb = accuracy_score(y_test_xgb, y_test_pred_xgb)
train_acc_xgb = accuracy_score(y_train_xgb, y_train_pred_xgb)

print(f"XGBoost Training Accuracy: {train_acc_xgb:.4f}")
print(f"XGBoost Test Accuracy: {test_acc_xgb:.4f}")

# A train/test accuracy gap above 0.05 is reported as overfitting.
print(
    "XGBoost shows signs of overfitting."
    if train_acc_xgb - test_acc_xgb > 0.05
    else "XGBoost does not show significant overfitting."
)

XGBoost Training Accuracy: 0.9949
XGBoost Test Accuracy: 0.9286
XGBoost shows signs of overfitting.


In [11]:
# Predict the test-set labels with the XGBoost model and evaluate them against
# the remapped 0/1 test labels.
y_pred_xgb = xgb_classifier.predict(X_test_scaled)

# Each metric is printed under its own heading, in the order listed here.
for heading, metric in (
    ("Confusion Matrix for XGBoost:", confusion_matrix),
    ("\nClassification Report for XGBoost:", classification_report),
    ("\nAccuracy Score for XGBoost:", accuracy_score),
):
    print(heading)
    print(metric(y_test_xgb, y_pred_xgb))


Confusion Matrix for XGBoost:
[[2259  127]
 [ 194 1914]]

Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93      2386
           1       0.94      0.91      0.92      2108

    accuracy                           0.93      4494
   macro avg       0.93      0.93      0.93      4494
weighted avg       0.93      0.93      0.93      4494


Accuracy Score for XGBoost:
0.9285714285714286


## Hyperparameter Tuning for XGBoost
We will use GridSearchCV to find the best hyperparameters for the XGBoost classifier.

In [12]:
# Exhaustive grid search over XGBoost hyperparameters.
from sklearn.model_selection import GridSearchCV

# 3 * 3 * 3 * 2 * 2 = 108 parameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

# Each combination is scored by accuracy with 3-fold cross-validation on the
# training split; n_jobs=-1 runs the fits in parallel.
xgb_grid = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
xgb_grid.fit(X_train_scaled, y_train_xgb)

print(f"Best parameters: {xgb_grid.best_params_}")
print(f"Best cross-validation accuracy: {xgb_grid.best_score_:.4f}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best cross-validation accuracy: 0.9308


In [13]:
# Evaluate XGBoost with the best hyperparameters found by the grid search.
#
# xgb_grid was created with GridSearchCV's default refit=True, so
# best_estimator_ has already been refitted on the full training split with the
# best parameters. It is therefore used as-is; fitting it again would only
# repeat that training and mutate the estimator owned by the search object.
best_xgb = xgb_grid.best_estimator_

# Predictions on the held-out test split, scored against the 0/1 test labels.
y_pred_best_xgb = best_xgb.predict(X_test_scaled)
print("Confusion Matrix (Tuned XGBoost):")
print(confusion_matrix(y_test_xgb, y_pred_best_xgb))
print("\nClassification Report (Tuned XGBoost):")
print(classification_report(y_test_xgb, y_pred_best_xgb))
print("\nAccuracy Score (Tuned XGBoost):")
print(accuracy_score(y_test_xgb, y_pred_best_xgb))

Confusion Matrix (Tuned XGBoost):
[[2295   91]
 [ 151 1957]]

Classification Report (Tuned XGBoost):
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2386
           1       0.96      0.93      0.94      2108

    accuracy                           0.95      4494
   macro avg       0.95      0.95      0.95      4494
weighted avg       0.95      0.95      0.95      4494


Accuracy Score (Tuned XGBoost):
0.9461504227859368


In [14]:
# Train vs. test accuracy of the tuned XGBoost model, for judging overfitting.
# The test accuracy reuses the test-set predictions computed earlier.
test_acc_best_xgb = accuracy_score(y_test_xgb, y_pred_best_xgb)

# Score the model on the same data it was fitted on.
y_train_pred_best_xgb = best_xgb.predict(X_train_scaled)
train_acc_best_xgb = accuracy_score(y_train_xgb, y_train_pred_best_xgb)

print(f"Tuned XGBoost Training Accuracy: {train_acc_best_xgb:.4f}")
print(f"Tuned XGBoost Test Accuracy: {test_acc_best_xgb:.4f}")

Tuned XGBoost Training Accuracy: 1.0000
Tuned XGBoost Test Accuracy: 0.9462


#### Overfitting is still a concern even though test accuracy increased: training accuracy reached 1.0, so the model has effectively memorized the training set.

## Hyperparameter Tuning with Regularization
We will tune XGBoost again, focusing on regularization parameters to reduce overfitting.

In [15]:
# Second grid search, restricted to smaller models and extended with the L1/L2
# regularization terms (reg_alpha / reg_lambda) to reduce overfitting.
# 2 * 2 * 2 * 2 * 2 * 3 * 3 = 288 parameter combinations.
param_grid_reg = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1],
    subsample=[0.7, 0.8],
    colsample_bytree=[0.7, 0.8],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[1, 2, 5],
)

# Each combination is scored by accuracy with 3-fold cross-validation on the
# training split; n_jobs=-1 runs the fits in parallel.
xgb_grid_reg = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid_reg,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
xgb_grid_reg.fit(X_train_scaled, y_train_xgb)

print(f"Best parameters (with regularization): {xgb_grid_reg.best_params_}")
print(f"Best cross-validation accuracy: {xgb_grid_reg.best_score_:.4f}")

Fitting 3 folds for each of 288 candidates, totalling 864 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters (with regularization): {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.8}
Best cross-validation accuracy: 0.8752


In [16]:
# Evaluate XGBoost with the best regularized hyperparameters.
#
# xgb_grid_reg was created with GridSearchCV's default refit=True, so
# best_estimator_ has already been refitted on the full training split with the
# best parameters. It is therefore used as-is; fitting it again would only
# repeat that training and mutate the estimator owned by the search object.
best_xgb_reg = xgb_grid_reg.best_estimator_

# Predictions on the held-out test split, scored against the 0/1 test labels.
y_pred_best_xgb_reg = best_xgb_reg.predict(X_test_scaled)
# Test accuracy is computed once and reused for both printouts below.
test_acc_best_xgb_reg = accuracy_score(y_test_xgb, y_pred_best_xgb_reg)

print("Confusion Matrix (Regularized XGBoost):")
print(confusion_matrix(y_test_xgb, y_pred_best_xgb_reg))
print("\nClassification Report (Regularized XGBoost):")
print(classification_report(y_test_xgb, y_pred_best_xgb_reg))
print("\nAccuracy Score (Regularized XGBoost):")
print(test_acc_best_xgb_reg)

# Training accuracy, for comparison against the test accuracy (overfitting gap).
y_train_pred_best_xgb_reg = best_xgb_reg.predict(X_train_scaled)
train_acc_best_xgb_reg = accuracy_score(y_train_xgb, y_train_pred_best_xgb_reg)
print(f"Regularized XGBoost Training Accuracy: {train_acc_best_xgb_reg:.4f}")
print(f"Regularized XGBoost Test Accuracy: {test_acc_best_xgb_reg:.4f}")

Confusion Matrix (Regularized XGBoost):
[[2188  198]
 [ 371 1737]]

Classification Report (Regularized XGBoost):
              precision    recall  f1-score   support

           0       0.86      0.92      0.88      2386
           1       0.90      0.82      0.86      2108

    accuracy                           0.87      4494
   macro avg       0.88      0.87      0.87      4494
weighted avg       0.88      0.87      0.87      4494


Accuracy Score (Regularized XGBoost):
0.8733867378727191
Regularized XGBoost Training Accuracy: 0.9173
Regularized XGBoost Test Accuracy: 0.8734
