In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the raw sales records from the project-relative data folder.
sales_csv_path = "./data/supermarket_sales.csv"
df = pd.read_csv(sales_csv_path)

In [None]:
# Handle missing values (if any).
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases. A gap in the very first row has no
# previous value to copy from and therefore stays NaN.
df = df.ffill()

# Encode categorical features as integer codes.
# One encoder is fitted per column and kept in `label_encoders`, so the
# code -> label mapping of every column can be recovered later with
# label_encoders[col].inverse_transform(...). After the loop `le` is the
# encoder fitted on the last column ('Payment').
CATEGORICAL_COLS = ['Customer type', 'Gender', 'Product line', 'Branch', 'City', 'Payment']
label_encoders = {}
for col in CATEGORICAL_COLS:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Create binary target variable: 1 when the row's 'Total' is strictly above
# the mean 'Total' of the whole frame, 0 otherwise.
df['Above_average'] = (df['Total'] > df['Total'].mean()).astype(int)

# Drop identifier / timestamp columns that are not used as features.
# errors='ignore' lets this cell be re-run without a KeyError once the
# columns are already gone.
df = df.drop(columns=['Invoice ID', 'Date', 'Time'], errors='ignore')

In [None]:
TARGET_COL = 'Above_average'

# The target is a threshold on 'Total', so 'Total' must not be a feature:
# leaving it in lets every model read the answer straight from the input.
# 'Tax 5%', 'cogs' and 'gross income' are assumed to be deterministic
# functions of 'Total' in this dataset -- TODO confirm against the CSV schema.
# Only columns actually present are dropped, so a differing schema does not
# raise here.
LEAKY_COLS = ['Total', 'Tax 5%', 'cogs', 'gross income']
leaky_present = [col for col in LEAKY_COLS if col in df.columns]

X = df.drop(columns=[TARGET_COL] + leaky_present)
y = df[TARGET_COL]

# Hold out 20% for testing; stratify so train and test keep the same
# proportion of above-average rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Seed for the estimators that use randomness, so a fresh
# Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

# Logistic Regression
# Features are standardised first: the solver converges slowly (or stops at
# the iteration cap) when columns have very different ranges. max_iter is
# raised above the default as an extra safety margin.
model1 = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model1.fit(X_train, y_train)

# Decision Tree
# Tree splits are scale-invariant, so no scaling step is needed.
model2 = DecisionTreeClassifier(random_state=RANDOM_STATE)
model2.fit(X_train, y_train)

# Random Forest
# Kept as a bare estimator so model3.feature_importances_ stays available.
model3 = RandomForestClassifier(random_state=RANDOM_STATE)
model3.fit(X_train, y_train)

# SVM
# The default RBF kernel is distance-based, so unscaled large-magnitude
# columns would dominate; standardise before fitting.
model4 = make_pipeline(StandardScaler(), SVC())
model4.fit(X_train, y_train)

In [None]:
models = [model1, model2, model3, model4]
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM']

# (label, scoring function) pairs, listed in the order they are reported.
report_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-score", f1_score),
)

for clf, title in zip(models, model_names):
    y_pred = clf.predict(X_test)
    # Score everything first, then print, so a model's report is emitted
    # only after all of its metrics have been computed.
    scores = [(label, scorer(y_test, y_pred)) for label, scorer in report_metrics]
    print(f"{title}:")
    for label, value in scores:
        print(f"{label}: {value:.2f}")
    print()

In [None]:
# Feature ranking taken from the fitted Random Forest (model3).
features = X.columns
importances = model3.feature_importances_

# argsort orders ascending; flipping it walks from the largest importance
# down to the smallest.
sorted_indices = np.flip(np.argsort(importances))

ranking = [(features[pos], importances[pos]) for pos in sorted_indices]
for feature_name, weight in ranking:
    print(f"{feature_name}: {weight:.2f}")