In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset
data = pd.read_csv('newDataset.csv')

# Ensure the target variable (Preferred_Product_Category) is a string
data['Preferred_Product_Category'] = data['Preferred_Product_Category'].astype(str)

# Set the target variable (we're assuming 'Preferred_Product_Category' is the label for product category preference)
X = data.drop(columns=['Customer_ID', 'Preferred_Product_Category'])  # Drop non-feature columns
y = data['Preferred_Product_Category']

# Define categorical and numerical columns
categorical_cols = ['Gender', 'Location', 'Preferred_Payment_Method', 'Device_Type', 'Weather',
                   'Brand_Affinities', 'Survey_Responses']
numerical_cols = ['Age', 'Previous_Purchases', 'Frequency_of_Purchases', 'Time_of_Day', 'Browsing_History',
                  'Sentiment_Analysis', 'Device_Engagement_Level']

# Ensure no strings are passed to the scaler
X[numerical_cols] = X[numerical_cols].apply(pd.to_numeric, errors='coerce')  # Convert to numeric if possible

# Create a column transformer with imputers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing categorical values with the most frequent
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
        ]), categorical_cols)
    ])

# Create a pipeline that combines the preprocessor and the classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ensure that y_train and y_test are string-encoded
y_train = y_train.astype(str)
y_test = y_test.astype(str)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)
y_pred = y_pred.astype(str)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}")

# Now to make product category recommendations, we use the trained model to predict user preferences
# Ensure 'sample_user' matches the column names and data types
sample_user = pd.DataFrame({
    'Age': [20],
    'Gender': ['Male'],
    'Location': ['San Francisco'],
    'Previous_Purchases': [10],
    'Preferred_Payment_Method': ['Credit Card'],
    'Frequency_of_Purchases': [4],
    'Time_of_Day': [13],  # Time in hours (e.g., 3:00 PM)
    'Device_Type': ['Mobile'],
    'Browsing_History': [13],
    'Weather': ['Windy'],
    'Hobbies/Interests': ['Technology'],  # Include all columns expected
    'Sentiment_Analysis': [0.2],  # Positive sentiment
    'Brand_Affinities': ['Rolex'],
    'Survey_Responses': ['Cosmetic'],
    'Device_Engagement_Level': [4]  # Make sure this column exists and has a value
})

# Ensure all columns in sample_user exist in the training data
sample_user = sample_user[categorical_cols + numerical_cols]

# Predict the product category preference for the sample user
user_prediction = pipeline.predict(sample_user)
print(f"Prediction for user: {user_prediction[0]}")


 'Browsing_History' 'Sentiment_Analysis' 'Device_Engagement_Level']. At least one non-missing value is needed for imputation with strategy='mean'.


Accuracy: 0.0
Confusion Matrix:
 [[0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0]]
Prediction for user: Auto Care


 'Browsing_History' 'Sentiment_Analysis' 'Device_Engagement_Level']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Browsing_History' 'Sentiment_Analysis' 'Device_Engagement_Level']. At least one non-missing value is needed for imputation with strategy='mean'.
