# Import Library

In [8]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Load dataset

In [9]:
df1 = pd.read_csv("unrelated.csv")
df2 = pd.read_csv("related.csv")
df_new = pd.concat([df1, df2])
df_new = df_new.sample(frac=1).reset_index(drop=True).drop_duplicates().dropna()
df_new

Unnamed: 0,F1,R1,R2,RL,labels
0,0.333333,0.594595,0.285714,0.486486,0
1,0.260870,0.514286,0.121212,0.285714,0
2,0.258065,0.612245,0.255319,0.408163,0
3,0.173913,0.411765,0.062500,0.235294,0
4,0.642857,0.681818,0.523810,0.590909,1
...,...,...,...,...,...
10310,0.700000,0.709677,0.689655,0.709677,1
10311,0.400000,0.654545,0.377358,0.472727,1
10312,0.444444,0.690909,0.452830,0.363636,1
10313,0.594595,0.733333,0.517241,0.600000,1


# Train, test split

In [10]:
X = df_new.drop(columns=['labels'])
y = df_new['labels']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Find best model

In [11]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier()
}

In [12]:
param_grid = {
    "Logistic Regression": {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['liblinear']
    },
    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    },
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    },
    "Gradient Boosting": {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.5],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }
}

In [13]:
best_models = {}
for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_models[model_name] = grid_search.best_estimator_

for model_name, best_model in best_models.items():
    print(f"\nEvaluating {model_name}...")
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))

Running GridSearchCV for Logistic Regression...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Running GridSearchCV for Random Forest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Running GridSearchCV for SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Running GridSearchCV for K-Nearest Neighbors...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Running GridSearchCV for Gradient Boosting...
Fitting 5 folds for each of 54 candidates, totalling 270 fits

Evaluating Logistic Regression...
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       495
           1       0.80      0.78      0.79       534

    accuracy                           0.79      1029
   macro avg       0.79      0.79      0.79      1029
weighted avg       0.79      0.79      0.79      1029


Evaluating Random Forest...
              precision    recall  f1-score   support

           0       0.75      0.

# Save model

In [15]:
best_model = best_models["Logistic Regression"]
joblib.dump(best_model, "unrelated_model.pkl")
print("Model has been saved as 'unrelated_model.pkl'")

Model has been saved as 'unrelated_model.pkl'
