In [2]:
# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer


In [3]:
# 📌 Step 2: Load Dataset
# Reading an .xlsx workbook through pandas needs the optional `openpyxl`
# package in the kernel's environment; without it read_excel raises ImportError.
# The 'ID' column is discarded immediately after loading.
df = pd.read_excel("skincare1000.xlsx").drop(columns='ID')
df.head()


ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

# 'Skin Concerns' holds comma-separated text. Missing entries are replaced by
# the literal label 'None' (so they end up with their own indicator column),
# then every entry becomes a list of whitespace-trimmed labels.
concern_lists = df['Skin Concerns'].fillna('None').map(
    lambda raw: [label.strip() for label in raw.split(',')]
)

# One indicator column per distinct label. The fitted binarizer stays in `mlb`
# because it is pickled later in the notebook.
mlb = MultiLabelBinarizer()
skin_concerns_encoded = pd.DataFrame(
    mlb.fit_transform(concern_lists),
    columns=mlb.classes_,
)

# Swap the raw text column for the indicator columns; both sides get a fresh
# 0..n-1 index so the column-wise concat lines rows up by position.
df = pd.concat(
    [
        df.drop(columns='Skin Concerns').reset_index(drop=True),
        skin_concerns_encoded.reset_index(drop=True),
    ],
    axis=1,
)

df.head()


Unnamed: 0,Skin Type,Age,Gender,Product Used,Soap/Brand Used,Product Category,Satisfaction (1-5),Skin Tone,Climate,Acne,...,Itching,Mild Pigmentation,None,Oiliness,Pigmentation,Pores,Rash,Redness,Scaling,Uneven Texture
0,Combination,31,Male,Eucerin Advanced Repair,Khadi Neem-Tulsi,Cleanser,5,Brown,Cold,1,...,0,0,0,0,0,0,0,0,0,0
1,Combination,35,Male,La Roche-Posay Effaclar,Dettol Soap,Cleanser,4,Fair,Tropical,0,...,0,0,0,0,0,0,0,0,1,0
2,Combination,33,Female,Simple Hydrating Gel,Lifebuoy Care,Cream,4,Medium,Humid,0,...,0,1,0,0,0,0,0,0,0,0
3,Combination,27,Female,Neutrogena Hydro Boost,Mamaearth Ubtan,Face Wash,5,Fair,Humid,0,...,0,1,0,0,0,0,0,0,0,0
4,Oily,18,Female,Aveeno Daily Moisture,Dove Men+Care,Cleanser,4,Fair,Tropical,0,...,1,0,0,0,0,0,0,0,0,0


In [15]:
# 📌 Step 4: Label Encoding for Categorical Columns
# Each listed text column is replaced in `df` by integer codes. The fitted
# encoder for every column is kept in `label_encoders` (keyed by column name)
# because the dict is pickled later in the notebook.
categorical_cols = ['Skin Type', 'Gender', 'Product Used', 'Soap/Brand Used',
                    'Product Category', 'Skin Tone', 'Climate']

# This cell overwrites its own input. Running it a second time would fit the
# encoders on the integer codes produced by the first run, so `label_encoders`
# would silently lose the original category names. Refuse to run in that state
# instead of producing encoders that can no longer map the text values.
already_encoded = [col for col in categorical_cols
                   if pd.api.types.is_numeric_dtype(df[col])]
if already_encoded:
    raise ValueError(
        f"Columns already numeric: {already_encoded}. "
        "Reload the dataset before running this cell again."
    )

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # fit_transform learns the distinct values and returns their integer codes.
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [17]:
# 📌 Step 5: Train ML Model
# The satisfaction rating is the target; every other column of `df` is a feature.
# NOTE(review): the df.head() preview recorded earlier in this notebook lists an
# 'Unnamed: 0' column — if that is a real column and not just the displayed row
# index, it is being passed to the model as a feature. Confirm.
target_col = 'Satisfaction (1-5)'
X, y = df.drop(columns=target_col), df[target_col]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("✅ Accuracy:", accuracy)
print("📊 Classification Report:\n", report)


✅ Accuracy: 0.275
📊 Classification Report:
               precision    recall  f1-score   support

           3       0.20      0.26      0.23        58
           4       0.36      0.41      0.38        78
           5       0.23      0.12      0.16        64

    accuracy                           0.28       200
   macro avg       0.26      0.26      0.26       200
weighted avg       0.27      0.28      0.27       200



In [19]:
# 📌 Step 6: Save Model & Encoders for Streamlit
# Every fitted object is written to its own pickle file, using a path relative
# to the kernel's working directory. Dict order fixes the write order.
artifacts = {
    "model_rf.pkl": model,
    "label_encoders.pkl": label_encoders,
    "mlb_skin_concerns.pkl": mlb,
}
for filename, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, filename)

print("✅ All models and encoders saved successfully.")


✅ All models and encoders saved successfully.


In [21]:
import joblib

# NOTE(review): this cell writes the same three files as the Step 6 cell, so
# one of the two is probably redundant.
# The first two dumps run in a loop; the last one stays a bare expression so the
# cell still displays joblib's return value (the list of written filenames).
for fitted_obj, filename in [(model, "model_rf.pkl"), (label_encoders, "label_encoders.pkl")]:
    joblib.dump(fitted_obj, filename)
joblib.dump(mlb, "mlb_skin_concerns.pkl")


['mlb_skin_concerns.pkl']