## Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, classification_report
import warnings 
warnings.filterwarnings('ignore')


## Loading dataset

In [2]:
data = pd.read_csv("data_pca_REVISED.csv")

## Rebalancing dataset

In [3]:
data['Popular'].value_counts()

0    1260
1     507
Name: Popular, dtype: int64

## Splitting data to train and test splits

In [8]:
X = data.drop("Popular", axis=1)
y = data["Popular"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Applying grid search for logistic regression to maximize the F1 score

In [9]:
logistic_model = LogisticRegression(random_state=42)
param_grid_lr = {
    'C': [0.0002, 0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['svd', 'liblinear', 'sag']
}

print('Grid search in progress..')
grid_search_lr = GridSearchCV(logistic_model, param_grid=param_grid_lr, cv=5, scoring='f1')
grid_search_lr.fit(X_train, y_train)
print("Best parameters found for Logistic Regression:", grid_search_lr.best_params_)


Grid search in progress..
Best parameters found for Logistic Regression: {'C': 0.0002, 'penalty': 'l2', 'solver': 'liblinear'}


## Evaluating performance of best logistic regression model found

In [10]:
y_pred_lr = grid_search_lr.best_estimator_.predict(X_test)
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
classification_report_lr = classification_report(y_test, y_pred_lr)
print("                 Logistic regression                  ")
print('------------------------------------------------------')
print("Confusion Matrix")
print('------------------------------------------------------')
print(pd.DataFrame(conf_matrix_lr, index=['actual 0', 'actual 1'], columns=['0 predictions', '1 predictions']))
print('------------------------------------------------------')
print("Classification report")
print('------------------------------------------------------')
print(classification_report_lr)

                 Logistic regression                  
------------------------------------------------------
Confusion Matrix
------------------------------------------------------
          0 predictions  1 predictions
actual 0             63             68
actual 1             21             25
------------------------------------------------------
Classification report
------------------------------------------------------
              precision    recall  f1-score   support

           0       0.75      0.48      0.59       131
           1       0.27      0.54      0.36        46

    accuracy                           0.50       177
   macro avg       0.51      0.51      0.47       177
weighted avg       0.62      0.50      0.53       177



## Applying grid search for Random Forest to maximize the F1 score

In [11]:
rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = param_dist = {
    'n_estimators': [50,100,200,300],  
    'max_depth': [10,20,30,None],      
    'min_samples_split': list(range(2, 15,3)),     
    'min_samples_leaf': list(range(1, 15,3)),  
}

print('Grid search in progress..')
grid_search_rf = GridSearchCV(rf_model, param_grid=param_grid_rf, cv=5, scoring='f1')
grid_search_rf.fit(X_train, y_train)
print("Best parameters found for Random Forest:", grid_search_rf.best_params_)

Grid search in progress..
Best parameters found for Random Forest: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}


## Evaluating performance of best Random Forest model found

In [12]:
y_pred_rf = grid_search_rf.best_estimator_.predict(X_test)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
classification_report_rf = classification_report(y_test, y_pred_rf)

print('                    Random forest                     ')
print('------------------------------------------------------')
print("Confusion Matrix",)
print('------------------------------------------------------')
print( pd.DataFrame(conf_matrix_rf, index=['actual 0', 'actual 1'], columns=['0 predictions', '1 predictions']))
print('------------------------------------------------------')
print("Classification report")
print('------------------------------------------------------')
print(classification_report_rf)



                    Random forest                     
------------------------------------------------------
Confusion Matrix
------------------------------------------------------
          0 predictions  1 predictions
actual 0            128              3
actual 1             25             21
------------------------------------------------------
Classification report
------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.98      0.90       131
           1       0.88      0.46      0.60        46

    accuracy                           0.84       177
   macro avg       0.86      0.72      0.75       177
weighted avg       0.85      0.84      0.82       177



## Applying randomized seacrch for neural network to maximize F1-score

In [13]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV


nn_model = MLPClassifier(max_iter=100)
param_dist = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['logistic', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.000010  , 0.0005, 0.03, 1.5, 72 ,
       3727, 193069, 10000000, 517947468],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}
print('Randomized search in progress..')
random_search_nn = RandomizedSearchCV(nn_model, param_distributions=param_dist, n_iter=15, cv=5, random_state=42, n_jobs=-1,verbose=2)
random_search_nn.fit(X_train, y_train)
print("Best parameters found for Multilayer Perceptron:", random_search_nn.best_params_)



Randomized search in progress..
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best parameters found for Multilayer Perceptron: {'solver': 'adam', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (100,), 'alpha': 0.03, 'activation': 'relu'}


## Evaluating performance of best neural network model found

In [14]:
y_pred_nn = random_search_nn.best_estimator_.predict(X_test)
conf_matrix_nn = confusion_matrix(y_test, y_pred_nn)
classification_report_nn = classification_report(y_test, y_pred_nn)

print('                 Multilayer Perceptron                ')
print('------------------------------------------------------')
print("Confusion Matrix",)
print('------------------------------------------------------')
print( pd.DataFrame(conf_matrix_nn, index=['actual 0', 'actual 1'], columns=['0 predictions', '1 predictions']))
print('------------------------------------------------------')
print("Classification report")
print('------------------------------------------------------')
print(classification_report_nn)



                 Multilayer Perceptron                
------------------------------------------------------
Confusion Matrix
------------------------------------------------------
          0 predictions  1 predictions
actual 0            108             23
actual 1             21             25
------------------------------------------------------
Classification report
------------------------------------------------------
              precision    recall  f1-score   support

           0       0.84      0.82      0.83       131
           1       0.52      0.54      0.53        46

    accuracy                           0.75       177
   macro avg       0.68      0.68      0.68       177
weighted avg       0.75      0.75      0.75       177



## Applying grid seacrch for QDA to maximize F1-score

In [15]:
qda = QuadraticDiscriminantAnalysis()

param_grid_qda = {
    'reg_param': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'store_covariance': [True, False],
    'priors': [None, [0.4, 0.6], [0.3, 0.7]],
    'tol': [1e-4, 1e-3, 1e-2]
}


print('Grid search in progress..')
grid_search_qda = GridSearchCV(qda, param_grid=param_grid_qda, cv=5, scoring='f1')
grid_search_qda.fit(X_train, y_train)
print("Best parameters found for Quadratic Discriminant Analysis:", grid_search_qda.best_params_)



Grid search in progress..
Best parameters found for Quadratic Discriminant Analysis: {'priors': [0.3, 0.7], 'reg_param': 0.3, 'store_covariance': True, 'tol': 0.0001}


## Evaluating performance of best QDA model found

In [16]:
y_pred_qda = grid_search_qda.best_estimator_.predict(X_test)
conf_matrix_qda = confusion_matrix(y_test, y_pred_qda)
classification_report_qda = classification_report(y_test, y_pred_qda)

print('           Quadratic Discriminant Analysis            ')
print('------------------------------------------------------')
print("Confusion Matrix",)
print('------------------------------------------------------')
print( pd.DataFrame(conf_matrix_qda, index=['actual 0', 'actual 1'], columns=['0 predictions', '1 predictions']))
print('------------------------------------------------------')
print("Classification report")
print('------------------------------------------------------')
print(classification_report_qda)



           Quadratic Discriminant Analysis            
------------------------------------------------------
Confusion Matrix
------------------------------------------------------
          0 predictions  1 predictions
actual 0            115             16
actual 1             25             21
------------------------------------------------------
Classification report
------------------------------------------------------
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       131
           1       0.57      0.46      0.51        46

    accuracy                           0.77       177
   macro avg       0.69      0.67      0.68       177
weighted avg       0.76      0.77      0.76       177

