# In [None]:
# online_shoppers_prediction.py

import time
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, f1_score

# Optional (requires `imblearn`):
# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline as IMBPipeline

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Steps 1-2: Load the dataset and apply basic preprocessing in one chain:
#   * cast the Weekend and Revenue columns to integers,
#   * add a 0/1 Returning_Visitor indicator derived from VisitorType,
#   * drop the original VisitorType column.
df = (
    pd.read_csv("online_shoppers_intention.csv")
    .astype({'Weekend': int, 'Revenue': int})
    .assign(
        Returning_Visitor=lambda frame: np.where(
            frame['VisitorType'] == 'Returning_Visitor', 1, 0
        )
    )
    .drop(columns=['VisitorType'])
)

# Replace the Month labels with numeric codes. With its default settings
# OrdinalEncoder numbers the categories in sorted order of their values.
df['Month'] = OrdinalEncoder().fit_transform(df[['Month']])

# Step 3: Separate the target (Revenue) from the feature columns.
# Work on a copy so that `df` itself keeps the Revenue column.
X = df.copy()
y = X.pop('Revenue')

# Step 4: Hold out 30% of the rows for testing, stratified on the target
# and seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Step 5: Column definitions
# Select every numeric column by kind rather than by the literal names
# 'int64'/'float64'. The ColumnTransformer below drops any column it is not
# given, and `astype(int)` / `np.where(..., 1, 0)` yield int32 columns on
# platforms whose default integer is 32-bit, so matching on exact dtype names
# could silently discard features there.
num_cols = X.select_dtypes(include='number').columns.tolist()

# Step 6: Preprocessing pipeline
# Median-impute missing values, then rescale each feature to [0, 1] (as fitted
# on the training data). The non-negative output is what lets chi2 be used as
# the scoring function in the feature-selection step.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Only `num_cols` are forwarded; ColumnTransformer's default remainder='drop'
# discards every other column.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, num_cols),
])

# Step 7: Full pipeline
# NOTE: SMOTE is a sampler, not a transformer, so it cannot be added as a step
# of sklearn's Pipeline. To oversample the minority class, build this object
# with imblearn's Pipeline (IMBPipeline in the optional imports above) and
# insert ('smote', SMOTE(random_state=42)) right after the preprocessor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=chi2, k=6)),
    ('model', SVC()),  # Try RandomForestClassifier() as alternative
])

# Step 8: Model training & evaluation
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# ROC AUC is a ranking metric: it needs a continuous score per sample, not the
# thresholded 0/1 predictions (which collapse the curve to a single operating
# point). Use the signed distance to the decision boundary when the final
# estimator provides it (e.g. SVC); otherwise fall back to the predicted
# probability of the positive class (e.g. RandomForestClassifier).
if hasattr(pipeline, "decision_function"):
    y_score = pipeline.decision_function(X_test)
else:
    y_score = pipeline.predict_proba(X_test)[:, 1]

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

print("\nMetrics:")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred))
print("ROC AUC Score: ", roc_auc_score(y_test, y_score))
