In [None]:
!conda install anaconda::seaborn 



In [None]:
import pandas as pd
import psycopg2
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
matplotlib.use('TkAgg')

In [29]:
#load the data fronm a csv

df = pd.read_csv('training_data.csv')
user_rating = df['user_rating']
df = df.drop(columns = ['locale', 'location', 'recommended_ratio', 'overall_product_rating', 'unique_review_id', 'review_link_id'])
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

# Extract unique IDs
unique_ids = df['product_link_id'].unique()

# Randomly split unique IDs into training and testing sets
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)


# Filter the DataFrame based on these IDs
df_train = df[df['product_link_id'].isin(train_ids)]
df_test = df[df['product_link_id'].isin(test_ids)]

max_count = df_train['user_rating'].value_counts().max()
min_count = min(df_train['user_rating'].value_counts().min() * 3, max_count)
df_train = df_train.groupby('user_rating').apply(lambda x: x.sample(min_count, replace=True)).reset_index(drop=True)

# Drop the product_link_id column if not needed for training
X_train = df_train.drop(columns=['product_link_id', 'user_rating'])
y_train = df_train['user_rating']

#Test Data
X_test = df_test.drop(columns=['product_link_id', 'user_rating'])
y_test = df_test['user_rating']

print("X_train Length:", len(X_train))
print("y_train Length:", len(y_train))
print("X_test Length:", len(X_test))
print("y_test Length:", len(y_test))

'''
# Creating the X and Y columns 
X = df.drop(columns='user_rating')  # Features
y = df['user_rating']  # Target variable
'''



X_train Length: 105510
y_train Length: 105510
X_test Length: 65977
y_test Length: 65977


"\n# Creating the X and Y columns \nX = df.drop(columns='user_rating')  # Features\ny = df['user_rating']  # Target variable\n"

In [30]:
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, verbose=1)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    6.9s finished


In [31]:
# Make predictions on the training and test data
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

# Calculate accuracy for training and testing sets
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Print the errors
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Testing Accuracy: {test_accuracy:.4f}')

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s


Training Accuracy: 0.9857
Testing Accuracy: 0.7083


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.2s finished


In [32]:

# Extract feature importances
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display feature importance
print(feature_importance_df)


                    Feature  Importance
3       professional_review    0.038642
21           pilling_review    0.034940
22       shade_range_review    0.034694
17       inexpensive_review    0.033188
4               vibe_review    0.033100
11           redness_review    0.032947
6                dry_review    0.032377
10               tan_review    0.031960
1              young_review    0.031938
16         expensive_review    0.031738
18     skin_concerns_review    0.031696
2             mother_review    0.031598
13    light_coverage_review    0.031523
9              white_review    0.031472
5               acne_review    0.031363
19  comfortable_wear_review    0.030261
8                poc_review    0.030177
12          coverage_review    0.029896
7           wrinkles_review    0.029558
14   medium_coverage_review    0.028768
20          easy_use_review    0.028372
25              num_reviews    0.028199
15     full_coverage_review    0.027987
47              shade_range    0.027544


In [33]:
# Add predictions to the DataFrames for easy access
X_train['predictions'] = train_predictions
X_test['predictions'] = test_predictions



# Define a function to plot confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()

# Function to generate and print confusion matrices for each category
def print_confusion_matrix(cm, classes, title='Confusion Matrix'):
    print(title)
    print("Labels:", classes)
    print("\nConfusion Matrix:")
    print(" " * 10 + " ".join([f"{cls:>5}" for cls in classes]))
    for i, row in enumerate(cm):
        print(f"{classes[i]:>5}", " ".join([f"{val:>5}" for val in row]))

def print_confusion_matrices(df, predictions_col):
    categories = df['category'].unique()
    
    for category in categories:
        print(f"Processing category: {category}")

        # Filter data for the current category
        category_df = df[df['category'] == category].copy() 
        category_df['user_rating'] = user_rating
        y_category = category_df['user_rating']
        predictions_category = category_df[predictions_col]
        
        # Generate confusion matrix
        cm_category = confusion_matrix(y_category, predictions_category, labels=y_category.unique())
        
        # Print confusion matrix and classification report
        print(f"Confusion Matrix for Category '{category}':")
        class_names = [str(i) for i in y_category.unique()]
        print_confusion_matrix(cm_category, class_names)
        print(f"Classification Report for Category '{category}':\n", classification_report(y_category, predictions_category))
        
        # Plot confusion matrix
        #class_names = [str(i) for i in y_category.unique()]
        #plot_confusion_matrix(cm_category, class_names, title=f'Confusion Matrix for Category: {category}')

# Print confusion matrices for training and testing sets
print("Training Set Confusion Matrices")
print_confusion_matrices(X_train, 'predictions')

print("Testing Set Confusion Matrices")
print_confusion_matrices(X_test, 'predictions')


print('DONE')







Training Set Confusion Matrices
Processing category: 1
Confusion Matrix for Category '1':
Confusion Matrix
Labels: ['1', '5', '4', '3', '2']

Confusion Matrix:
              1     5     4     3     2
    1    48   158   133    65    48
    5  1092  1876  1706  1488  1349
    4   220   440   435   429   367
    3    57   176   137   155   122
    2    25    99   107    49    31
Classification Report for Category '1':
               precision    recall  f1-score   support

           1       0.03      0.11      0.05       452
           2       0.02      0.10      0.03       311
           3       0.07      0.24      0.11       647
           4       0.17      0.23      0.20      1891
           5       0.68      0.25      0.37      7511

    accuracy                           0.24     10812
   macro avg       0.20      0.19      0.15     10812
weighted avg       0.51      0.24      0.30     10812

Processing category: 4
Confusion Matrix for Category '4':
Confusion Matrix
Labels: ['1', '

In [39]:
# Concatenate true labels and predictions
combined_true_labels = y_test
combined_predictions = test_predictions

# Compute overall confusion matrix
cm_overall = confusion_matrix(combined_true_labels, combined_predictions)

# Function to plot confusion matrix
def plot_confusion_matrix(cm, title):
    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(combined_true_labels), yticklabels=np.unique(combined_true_labels)  )
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(title)
    plt.show()

# Print and plot the overall confusion matrix
print("Overall Confusion Matrix (for Testing Data)")
print(cm_overall)
plot_confusion_matrix(cm_overall, 'Overall Confusion Matrix (for Testing Data)')
print("Testing Set Classification Report")
print(classification_report(y_test, test_predictions))

'''

# Print classification reports for training and testing sets
print("Training Set Classification Report")
print(classification_report(y_train, train_predictions))

print("Testing Set Classification Report")
print(classification_report(y_test, test_predictions))
'''

Overall Confusion Matrix (for Testing Data)
[[ 1243   184    70    75  1085]
 [  302   673   149   100   738]
 [  258   166  1447   199  1519]
 [  348   160   228  5308  6090]
 [ 1731   725   542  4577 38060]]
Testing Set Classification Report
              precision    recall  f1-score   support

           1       0.32      0.47      0.38      2657
           2       0.35      0.34      0.35      1962
           3       0.59      0.40      0.48      3589
           4       0.52      0.44      0.47     12134
           5       0.80      0.83      0.82     45635

    accuracy                           0.71     65977
   macro avg       0.52      0.50      0.50     65977
weighted avg       0.71      0.71      0.70     65977



'\n\n# Print classification reports for training and testing sets\nprint("Training Set Classification Report")\nprint(classification_report(y_train, train_predictions))\n\nprint("Testing Set Classification Report")\nprint(classification_report(y_test, test_predictions))\n'