In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Load the dataset from a CSV file in the current working directory.
df = pd.read_csv('shopping_trends.csv')

# Data preprocessing
# Integer-encode every non-numeric column (values are cast to str first).
categorical_columns = [
    'Gender', 'Item Purchased', 'Category', 'Location', 'Size', 'Color',
    'Season', 'Subscription Status', 'Payment Method', 'Shipping Type',
    'Discount Applied', 'Promo Code Used', 'Preferred Payment Method',
    'Frequency of Purchases',
]  # Add all categorical columns here

# Use one encoder per column and keep it. A single shared LabelEncoder is
# re-fitted on each column and only remembers the classes of the last one, so
# the integer codes (including the 0/1 target labels printed in the reports)
# could not be mapped back to the original strings. With this dict,
# label_encoders['Subscription Status'].classes_ recovers the target labels.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column].astype(str))
    label_encoders[column] = encoder

# Backward compatibility: `label_encoder` used to end up fitted on the last
# categorical column; keep the name bound to an encoder in that same state.
label_encoder = label_encoders[categorical_columns[-1]]

# Define features and target variable.
# 'Customer ID' is dropped together with the target column itself.
features = df.drop(['Customer ID', 'Subscription Status'], axis=1)
target = df['Subscription Status']

# Split the data into training (80%) and testing (20%) sets with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize features. The scaler is fitted on the training split only and
# then applied to the test split, so test statistics never enter the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the three candidate classifiers, each with the same fixed random_state.
logistic_regression, decision_tree, random_forest = (
    LogisticRegression(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
)

# Fit every candidate on the scaled training split, predict on the scaled test
# split, and print its accuracy plus the per-class precision/recall report.
models = [logistic_regression, decision_tree, random_forest]

for model in models:
    # fit() returns the estimator itself, so prediction can be chained.
    predictions = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
    report = classification_report(y_true=y_test, y_pred=predictions)

    print(
        f"Model: {type(model).__name__}",
        f"Accuracy: {accuracy:.2f}",
        f"Classification Report:\n{report}\n",
        sep="\n",
    )

# Hyperparameter tuning for Random Forest (example)
# Search space: 3 x 3 x 3 = 27 parameter combinations, each scored with
# 5-fold cross-validation on the scaled training split.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Estimator exposed by the search as its best candidate.
best_rf_model = grid_search.best_estimator_

# Evaluate the best model on the held-out test split.
best_predictions = best_rf_model.predict(X_test_scaled)
best_accuracy = accuracy_score(y_true=y_test, y_pred=best_predictions)
best_report = classification_report(y_true=y_test, y_pred=best_predictions)

print(
    "Best Random Forest Model after Hyperparameter Tuning:",
    f"Accuracy: {best_accuracy:.2f}",
    f"Classification Report:\n{best_report}",
    sep="\n",
)


Model: LogisticRegression
Accuracy: 0.83
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.76      0.86       558
           1       0.62      1.00      0.77       222

    accuracy                           0.83       780
   macro avg       0.81      0.88      0.81       780
weighted avg       0.89      0.83      0.83       780


Model: DecisionTreeClassifier
Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.84      0.85       558
           1       0.63      0.67      0.65       222

    accuracy                           0.79       780
   macro avg       0.75      0.76      0.75       780
weighted avg       0.80      0.79      0.80       780


Model: RandomForestClassifier
Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.79      0.86       558
           1       0.63      0.89  