In [1]:
import pandas as pd
import psycopg2
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
from sklearn.model_selection import GridSearchCV
matplotlib.use('TkAgg')

In [2]:
#load the data fronm a csv

df = pd.read_csv('all_products_reviews.csv')
user_rating = df['user_rating']
df = df.drop(columns = ['locale', 'location', 'recommended_ratio', 'overall_product_rating', 'unique_review_id', 'review_link_id'
                       , 'inexpensive_review', 'shade_range_review', 'pilling_review', 'expensive_review', 'helpful_score' ])
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

columns_to_scale = ['professional_review', 'vibe_review', 'redness_review', 'dry_review', 'light_coverage_review', 
                   'young_review', 'mother_review', 'skin_concerns_review', 'white_review', 'tan_review', 'acne_review',
                   'black_review', 'comfortable_wear_review', 'medium_coverage_review', 
                    'full_coverage_review', 'easy_use_review']
scaler = MinMaxScaler()
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Extract unique IDs
unique_ids = df['product_link_id'].unique()

# Randomly split unique IDs into training and testing sets
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)

#df.loc[df['user_rating'] != 5, 'user_rating'] = 0
#df.loc[df['user_rating'] == 5, 'user_rating'] = 1

# Filter the DataFrame based on these IDs
df_train = df[df['product_link_id'].isin(train_ids)]
df_test = df[df['product_link_id'].isin(test_ids)]
#print(df_train['user_rating'])
max_count = df_train['user_rating'].value_counts().max()
min_count = min(df_train['user_rating'].value_counts().min(), max_count)
df_train = df_train.groupby('user_rating').apply(lambda x: x.sample(min_count * 3, replace=True)).reset_index(drop=True)
df_train = df_train.sort_values(by='user_rating', ascending=True)
# Drop the product_link_id column if not needed for training
X_train = df_train.drop(columns=['product_link_id', 'user_rating'])
y_train = df_train['user_rating']

#Test Data
X_test = df_test.drop(columns=['product_link_id', 'user_rating'])
y_test = df_test['user_rating']

print("X_train Length:", len(X_train))
print("y_train Length:", len(y_train))
print("X_test Length:", len(X_test))
print("y_test Length:", len(y_test))

'''
# Creating the X and Y columns 
X = df.drop(columns='user_rating')  # Features
y = df['user_rating']  # Target variable
'''



X_train Length: 114165
y_train Length: 114165
X_test Length: 31465
y_test Length: 31465


"\n# Creating the X and Y columns \nX = df.drop(columns='user_rating')  # Features\ny = df['user_rating']  # Target variable\n"

In [3]:
print(le.classes_)

['BB & CC Creams' 'Blush' 'Bronzer' 'Color Correcting' 'Concealer'
 'Contouring' 'Face Primer' 'Foundation' 'Highlighter' 'Makeup Remover'
 'Setting Spray & Powder' 'Tinted Moisturizer' nan]


In [4]:
#print(y_train)
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
X_train = X_train.drop(columns=['predictions'], errors='ignore')
X_test = X_test.drop(columns=['predictions'], errors='ignore')
rf = RandomForestClassifier(n_estimators=200,
                            max_features = 3, 
                            max_leaf_nodes=20000,
                            n_jobs=-1, 
                            oob_score=True, 
                            random_state=42, 
                            #min_samples_split = 20, 
                            #min_samples_leaf = 2, 
                            verbose=1)

rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    7.2s finished


In [5]:
# Make predictions on the training and test data
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

# Calculate accuracy for training and testing sets
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

training_error = 1 - train_accuracy
# Print the errors
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Testing Accuracy: {test_accuracy:.4f}')
oob_error = 1-rf.oob_score_

print(f'OOB Error: {oob_error:.4f}')
print(f'Training Error: {training_error:.4f}')

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.6s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s


Training Accuracy: 0.9932
Testing Accuracy: 0.7621
OOB Error: 0.0405
Training Error: 0.0068


[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 200 out of 200 | elapsed:    0.1s finished


In [6]:

# Extract feature importances
importances = rf.feature_importances_
print(set(X_train.columns))
print(len(importances))

feature_names = X_train.columns
importances_names = list((importances))  # Assuming feature importances are indexed

# Find missing or extra features
missing_features = set(importances_names) - set(feature_names)
extra_features = set(feature_names) - set(importances_names)

print("Missing features:", missing_features)
print("Extra features:", extra_features)

feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display feature importance
print(feature_importance_df)


{'tan', 'sheer_finish', 'inexpensive', 'skin_concerns_review', 'mother', 'full_coverage_review', 'light_coverage', 'acne_review', 'young', 'num_shades', 'dry_review', 'redness_review', 'num_reviews', 'wrinkles', 'young_review', 'matte_finish', 'light_coverage_review', 'white', 'comfortable_wear', 'professional', 'mother_review', 'wrinkles_review', 'vibe_review', 'expensive', 'dry', 'redness', 'easy_use_review', 'easy_use', 'medium_coverage', 'professional_review', 'category', 'comfortable_wear_review', 'glowy_finish', 'pilling', 'skin_concerns', 'shade_range', 'black', 'full_coverage', 'vibe', 'medium_coverage_review', 'acne', 'tan_review', 'white_review', 'black_review'}
44
Missing features: {0.035902362753612416, 0.03602143129736812, 0.03639186363286234, 0.015594449429350734, 0.012021878856247505, 0.01768362877694302, 0.03534100239062122, 0.037277016370078526, 0.015492726269730313, 0.015196735250866131, 0.01580811150494661, 0.016211547647662704, 0.015826230426846177, 0.03360707894930

In [7]:
# Concatenate true labels and predictions
combined_true_labels = y_test
combined_predictions = test_predictions

# Compute overall confusion matrix
cm_overall = confusion_matrix(combined_true_labels, combined_predictions)

# Function to plot confusion matrix
def plot_confusion_matrix(cm, title):
    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(combined_true_labels), yticklabels=np.unique(combined_true_labels)  )
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(title)
    plt.show()

# Print and plot the overall confusion matrix
print("Overall Confusion Matrix (for Testing Data)")
print(cm_overall)
plot_confusion_matrix(cm_overall, 'Overall Confusion Matrix (for Testing Data)')
print("Testing Set Classification Report")
print(classification_report(y_test, test_predictions))

'''

# Print classification reports for training and testing sets
print("Training Set Classification Report")
print(classification_report(y_train, train_predictions))

print("Testing Set Classification Report")
print(classification_report(y_test, test_predictions))
'''

Overall Confusion Matrix (for Testing Data)
[[  691    33    46    18   328]
 [  177   442    81    22   429]
 [  117    35   659   101   983]
 [   78    32   116  2472  2824]
 [  547   137   182  1199 19716]]
Testing Set Classification Report
              precision    recall  f1-score   support

           1       0.43      0.62      0.51      1116
           2       0.65      0.38      0.48      1151
           3       0.61      0.35      0.44      1895
           4       0.65      0.45      0.53      5522
           5       0.81      0.91      0.86     21781

    accuracy                           0.76     31465
   macro avg       0.63      0.54      0.56     31465
weighted avg       0.75      0.76      0.75     31465



'\n\n# Print classification reports for training and testing sets\nprint("Training Set Classification Report")\nprint(classification_report(y_train, train_predictions))\n\nprint("Testing Set Classification Report")\nprint(classification_report(y_test, test_predictions))\n'

In [8]:
# Add predictions to the DataFrames for easy access
X_train['predictions'] = train_predictions
X_test['predictions'] = test_predictions



# Define a function to plot confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()

# Function to generate and print confusion matrices for each category
def print_confusion_matrix(cm, classes, title='Confusion Matrix'):
    print(title)
    print("Labels:", classes)
    print("\nConfusion Matrix:")
    print(" " * 10 + " ".join([f"{cls:>5}" for cls in classes]))
    for i, row in enumerate(cm):
        print(f"{classes[i]:>5}", " ".join([f"{val:>5}" for val in row]))

def print_confusion_matrices(df, predictions_col):
    categories = df['category'].unique()
    
    for category in categories:
        print(f"Processing category: {category}")

        # Filter data for the current category
        category_df = df[df['category'] == category].copy() 
        category_df['user_rating'] = user_rating
        y_category = category_df['user_rating']
        predictions_category = category_df[predictions_col]
        
        # Generate confusion matrix
        cm_category = confusion_matrix(y_category, predictions_category, labels=y_category.unique())
        
        # Print confusion matrix and classification report
        print(f"Confusion Matrix for Category '{category}':")
        class_names = [str(i) for i in y_category.unique()]
        print_confusion_matrix(cm_category, class_names)
        print(f"Classification Report for Category '{category}':\n", classification_report(y_category, predictions_category))
        
        # Plot confusion matrix
        #class_names = [str(i) for i in y_category.unique()]
        #plot_confusion_matrix(cm_category, class_names, title=f'Confusion Matrix for Category: {category}')

# Print confusion matrices for training and testing sets
print("Training Set Confusion Matrices")
print_confusion_matrices(X_train, 'predictions')

print("Testing Set Confusion Matrices")
print_confusion_matrices(X_test, 'predictions')


print('DONE')







Training Set Confusion Matrices
Processing category: 5
Confusion Matrix for Category '5':
Confusion Matrix
Labels: ['5', '3', '4', '1', '2']

Confusion Matrix:
              5     3     4     1     2
    5  1026  1361  1402   801  1145
    3    92   157   123    49    75
    4   305   388   384   169   293
    1    56    88   127    29    41
    2    44    81    65    25    33
Classification Report for Category '5':
               precision    recall  f1-score   support

           1       0.03      0.09      0.04       341
           2       0.02      0.13      0.04       248
           3       0.08      0.32      0.12       496
           4       0.18      0.25      0.21      1539
           5       0.67      0.18      0.28      5735

    accuracy                           0.19      8359
   macro avg       0.20      0.19      0.14      8359
weighted avg       0.50      0.19      0.24      8359

Processing category: 7
Confusion Matrix for Category '7':
Confusion Matrix
Labels: ['4', '

In [9]:
import pickle

# Save the model
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(rf, file)
