In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score, precision_score, recall_score, f1_score ,roc_auc_score, roc_curve
import joblib
from sklearn.svm import SVC
import matplotlib.pyplot as plt

## Loading the DataSet

In [12]:
preprocessed_df=pd.read_csv('preprocessed_data.csv')
preprocessed_df.shape

(40420, 313)

In [13]:
preprocessed_df

Unnamed: 0,rating,review_length,vector_0,vector_1,vector_2,vector_3,vector_4,vector_5,vector_6,vector_7,...,category_Clothing_Shoes_and_Jewelry_5,category_Electronics_5,category_Home_and_Kitchen_5,category_Kindle_Store_5,category_Movies_and_TV_5,category_Pet_Supplies_5,category_Sports_and_Outdoors_5,category_Tools_and_Home_Improvement_5,category_Toys_and_Games_5,label_encoded
0,5.0,12,-0.582917,0.544354,-0.216788,-0.551188,0.532213,-0.943284,0.577438,0.982709,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,5.0,16,-0.524547,0.238297,-0.316288,-0.416228,0.303663,-0.386863,0.082068,0.788470,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,5.0,14,-0.299110,0.280882,-0.297741,-0.278253,0.180033,-0.661248,0.634481,0.758312,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1.0,17,-0.817822,0.431277,-0.308483,-0.517900,0.364205,-0.314387,0.009654,0.944113,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,5.0,18,-0.617705,0.427575,-0.508889,-0.797756,0.602775,-0.263422,0.121937,1.065809,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40415,4.0,329,-0.308856,0.256705,-0.304249,-0.132848,0.005212,-0.022169,0.250727,0.593539,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
40416,5.0,270,-0.534267,0.519203,-0.480290,-0.440216,0.249422,-0.504126,0.252005,0.708434,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
40417,2.0,343,-0.278028,0.139923,-0.267886,-0.172448,0.021288,-0.186904,0.269859,0.542738,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
40418,1.0,278,-0.490687,0.529655,-0.508148,-0.494471,0.236855,-0.488244,0.413194,0.691775,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


##  Spliting the dataset into training and testing sets

In [14]:
X = preprocessed_df.drop('label_encoded', axis=1) 
y = preprocessed_df['label_encoded']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Training models such as Random Forest, SVM, and Logistic Regression 


In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


pipelines = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42,probability=True),
    'Logistic Regression': LogisticRegression(random_state=42)
}


## Evaluating each model’s performance

Saving each trained model using joblib

In [17]:
best_model = None
best_performance = None

for model_name, model in pipelines.items():
    print(f"\nTraining and evaluating {model_name}...")
    
    model.fit(X_train_pca, y_train)
    
    y_pred = model.predict(X_test_pca)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    model_filename = f"{model_name.replace(' ', '_')}_model.pkl"
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")
    




Training and evaluating Random Forest...

Accuracy: 0.8409
Precision: 0.8450
Recall: 0.8409
F1 Score: 0.8404

Confusion Matrix:
[[3165  864]
 [ 422 3633]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.79      0.83      4029
           1       0.81      0.90      0.85      4055

    accuracy                           0.84      8084
   macro avg       0.85      0.84      0.84      8084
weighted avg       0.84      0.84      0.84      8084

Model saved as Random_Forest_model.pkl

Training and evaluating SVM...

Accuracy: 0.8845
Precision: 0.8846
Recall: 0.8845
F1 Score: 0.8844

Confusion Matrix:
[[3523  506]
 [ 428 3627]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      4029
           1       0.88      0.89      0.89      4055

    accuracy                           0.88      8084
   macro avg       0.88      0.88      0.88      8084
weighted avg

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Loading Models

In [18]:
random_forest_model = joblib.load('Random_Forest_model.pkl')
svm_model = joblib.load('SVM_model.pkl')
logistic_regression_model = joblib.load('Logistic_Regression_model.pkl')


rf_predictions = random_forest_model.predict(X_test)
svm_predictions = svm_model.predict(X_test)
logistic_predictions = logistic_regression_model.predict(X_test)





## Test Predictions

In [25]:
comparison_df_SVM = pd.DataFrame({
    'Actual': y_test[:10],
    'Predicted by SVM': svm_predictions[:10]
})
print("SVM Model Predictions:")
comparison_df_SVM



SVM Model Predictions:


Unnamed: 0,Actual,Predicted by SVM
5553,1,0
21921,0,1
33592,1,0
20656,1,0
28904,0,0
22251,0,1
32289,1,1
34186,1,0
7948,1,1
30021,0,1
