In [None]:
!conda install anaconda::seaborn 



In [1]:
import pandas as pd
import psycopg2
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import seaborn as sns
matplotlib.use('TkAgg')

In [2]:
#load the data fronm a csv

df = pd.read_csv('training_data.csv')
user_rating = df['user_rating']
df = df.drop(columns = ['locale', 'location', 'recommended_ratio', 'overall_product_rating', 'unique_review_id', 'review_link_id'
                       , 'inexpensive_review', 'shade_range_review', 'pilling_review', 'expensive_review', 'helpful_score' ])
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

# Extract unique IDs
unique_ids = df['product_link_id'].unique()

# Randomly split unique IDs into training and testing sets
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42)


# Filter the DataFrame based on these IDs
df_train = df[df['product_link_id'].isin(train_ids)]
df_test = df[df['product_link_id'].isin(test_ids)]

max_count = df_train['user_rating'].value_counts().max()
min_count = min(df_train['user_rating'].value_counts().min() * 3, max_count)
df_train = df_train.groupby('user_rating').apply(lambda x: x.sample(min_count, replace=True)).reset_index(drop=True)

# Drop the product_link_id column if not needed for training
X_train = df_train.drop(columns=['product_link_id', 'user_rating'])
y_train = df_train['user_rating']

#Test Data
X_test = df_test.drop(columns=['product_link_id', 'user_rating'])
y_test = df_test['user_rating']

print("X_train Length:", len(X_train))
print("y_train Length:", len(y_train))
print("X_test Length:", len(X_test))
print("y_test Length:", len(y_test))

'''
# Creating the X and Y columns 
X = df.drop(columns='user_rating')  # Features
y = df['user_rating']  # Target variable
'''



X_train Length: 105510
y_train Length: 105510
X_test Length: 65977
y_test Length: 65977


"\n# Creating the X and Y columns \nX = df.drop(columns='user_rating')  # Features\ny = df['user_rating']  # Target variable\n"

In [3]:
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42, verbose=1)
rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.6s finished


In [4]:
# Make predictions on the training and test data
train_predictions = rf.predict(X_train)
test_predictions = rf.predict(X_test)

# Calculate accuracy for training and testing sets
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

# Print the errors
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Testing Accuracy: {test_accuracy:.4f}')

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s


Training Accuracy: 0.9844
Testing Accuracy: 0.7110


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished


In [5]:

# Extract feature importances
importances = rf.feature_importances_
print(set(X_train.columns))
print(len(importances))

feature_names = X_train.columns
importances_names = list((importances))  # Assuming feature importances are indexed

# Find missing or extra features
missing_features = set(importances_names) - set(feature_names)
extra_features = set(feature_names) - set(importances_names)

print("Missing features:", missing_features)
print("Extra features:", extra_features)

feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Sort by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display feature importance
print(feature_importance_df)


{'vibe_review', 'acne_review', 'coverage', 'young_review', 'easy_use_review', 'skin_concerns', 'mother_review', 'redness_review', 'skin_concerns_review', 'white_review', 'wrinkles', 'light_coverage_review', 'comfortable_wear', 'redness', 'young', 'white', 'professional_review', 'tan', 'full_coverage_review', 'vibe', 'professional', 'dry', 'inexpensive', 'shade_range', 'full_coverage', 'comfortable_wear_review', 'category', 'easy_use', 'poc', 'medium_coverage_review', 'mother', 'medium_coverage', 'dry_review', 'pilling', 'num_shades', 'acne', 'light_coverage', 'expensive', 'wrinkles_review', 'num_reviews', 'poc_review', 'coverage_review', 'tan_review'}
43
Missing features: {0.03650476129752367, 0.035024016497486654, 0.010998862718929584, 0.010844000199609466, 0.009612719597005109, 0.039592772490114066, 0.038016602710894916, 0.013601334936389885, 0.041408571868998614, 0.04484648928502351, 0.010201314417983068, 0.010857793517292122, 0.009018055631403307, 0.0364115634102238, 0.035646797894

In [None]:
# Concatenate true labels and predictions
combined_true_labels = y_test
combined_predictions = test_predictions

# Compute overall confusion matrix
cm_overall = confusion_matrix(combined_true_labels, combined_predictions)

# Function to plot confusion matrix
def plot_confusion_matrix(cm, title):
    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(combined_true_labels), yticklabels=np.unique(combined_true_labels)  )
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(title)
    plt.show()

# Print and plot the overall confusion matrix
print("Overall Confusion Matrix (for Testing Data)")
print(cm_overall)
plot_confusion_matrix(cm_overall, 'Overall Confusion Matrix (for Testing Data)')
print("Testing Set Classification Report")
print(classification_report(y_test, test_predictions))

'''

# Print classification reports for training and testing sets
print("Training Set Classification Report")
print(classification_report(y_train, train_predictions))

print("Testing Set Classification Report")
print(classification_report(y_test, test_predictions))
'''

Overall Confusion Matrix (for Testing Data)
[[ 1131   229    93    73  1131]
 [  291   688   107   140   736]
 [  243   181  1451   219  1495]
 [  291   160   258  5276  6149]
 [ 1623   741   577  4333 38361]]


In [None]:
# Add predictions to the DataFrames for easy access
X_train['predictions'] = train_predictions
X_test['predictions'] = test_predictions



# Define a function to plot confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, xticklabels=classes, yticklabels=classes)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()

# Function to generate and print confusion matrices for each category
def print_confusion_matrix(cm, classes, title='Confusion Matrix'):
    print(title)
    print("Labels:", classes)
    print("\nConfusion Matrix:")
    print(" " * 10 + " ".join([f"{cls:>5}" for cls in classes]))
    for i, row in enumerate(cm):
        print(f"{classes[i]:>5}", " ".join([f"{val:>5}" for val in row]))

def print_confusion_matrices(df, predictions_col):
    categories = df['category'].unique()
    
    for category in categories:
        print(f"Processing category: {category}")

        # Filter data for the current category
        category_df = df[df['category'] == category].copy() 
        category_df['user_rating'] = user_rating
        y_category = category_df['user_rating']
        predictions_category = category_df[predictions_col]
        
        # Generate confusion matrix
        cm_category = confusion_matrix(y_category, predictions_category, labels=y_category.unique())
        
        # Print confusion matrix and classification report
        print(f"Confusion Matrix for Category '{category}':")
        class_names = [str(i) for i in y_category.unique()]
        print_confusion_matrix(cm_category, class_names)
        print(f"Classification Report for Category '{category}':\n", classification_report(y_category, predictions_category))
        
        # Plot confusion matrix
        #class_names = [str(i) for i in y_category.unique()]
        #plot_confusion_matrix(cm_category, class_names, title=f'Confusion Matrix for Category: {category}')

# Print confusion matrices for training and testing sets
print("Training Set Confusion Matrices")
print_confusion_matrices(X_train, 'predictions')

print("Testing Set Confusion Matrices")
print_confusion_matrices(X_test, 'predictions')


print('DONE')





