# Import Library

In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load dataset

In [2]:
df_new = pd.read_csv("correct_data.csv")
df_new = df_new.sample(frac=1).reset_index(drop=True).drop_duplicates().dropna()
df_new

Unnamed: 0,F1,R1,R2,RL,F1_2,R1_2,R2_2,RL_2,labels
0,0.666667,0.813559,0.655172,0.661017,0.400000,0.613636,0.302326,0.272727,1
1,0.219512,0.373984,0.115702,0.243902,0.400000,0.461538,0.378378,0.410256,1
2,0.196721,0.475248,0.161616,0.277228,0.243902,0.515152,0.218750,0.303030,1
3,0.349206,0.593407,0.292135,0.307692,0.350000,0.385965,0.327273,0.350877,1
4,0.370370,0.674157,0.321839,0.471910,0.121212,0.370370,0.115385,0.296296,1
...,...,...,...,...,...,...,...,...,...
7558,0.350000,0.687500,0.333333,0.453125,0.478261,0.567568,0.472222,0.513514,1
7559,0.637681,0.733945,0.560748,0.642202,0.367347,0.493506,0.320000,0.389610,1
7560,0.323529,0.519231,0.254902,0.365385,0.368421,0.483871,0.133333,0.258065,1
7561,0.297872,0.542857,0.176471,0.371429,0.500000,0.530612,0.425532,0.408163,1


# Train, test split

In [3]:
X = df_new.drop(columns=['labels'])
y = df_new['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Find best model

In [4]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [5]:
param_grid = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['liblinear']
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }
}

In [6]:
best_models = {}
for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_models[model_name] = grid_search.best_estimator_

for model_name, best_model in best_models.items():
    print(f"\nEvaluating {model_name}...")
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))

Running GridSearchCV for Logistic Regression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Running GridSearchCV for Random Forest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Running GridSearchCV for SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Running GridSearchCV for K-Nearest Neighbors...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Running GridSearchCV for Gradient Boosting...
Fitting 5 folds for each of 54 candidates, totalling 270 fits

Evaluating Logistic Regression...
              precision    recall  f1-score   support

           0       0.67      0.01      0.02       164
           1       0.79      1.00      0.88       593

    accuracy                           0.78       757
   macro avg       0.73      0.51      0.45       757
weighted avg       0.76      0.78      0.69       757


Evaluating Random Forest...
              precision    recall  f1-score   support

           0       0.98      0.

# Save model

In [7]:
best_model = best_models["Gradient Boosting"]
joblib.dump(best_model, "correct_model.pkl")
print("Model has been saved as 'correct_model.pkl'")

Model has been saved as 'correct_model.pkl'
