In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# ---------------------------------------------------------------------------
# Pipeline: load the shoppers dataset, integer-encode the categorical columns,
# hold out a stratified test set, oversample ONLY the training partition with
# SMOTE when it is imbalanced, fit a decision tree and a random forest, then
# report accuracy / classification reports and plot the random-forest
# confusion matrix.
# ---------------------------------------------------------------------------

# Load dataset
df = pd.read_csv("/content/online_shoppers_intention.csv")  # Replace with your actual file path

# Display dataset info
print("Dataset Columns:", df.columns)

# Check for missing values (reported only; nothing is imputed or dropped here)
print("\nMissing Values:\n", df.isnull().sum())

# Encode categorical features as integer codes.
# The same LabelEncoder instance is re-fitted for every column, so after the
# loop `le` only holds the mapping of the last column in the list.
categorical_columns = ['Month', 'VisitorType', 'Weekend']
le = LabelEncoder()
for col in categorical_columns:
    df[col] = le.fit_transform(df[col])

# Define features (X) and target (y)
X = df.drop(columns=['Revenue'])  # All features except the target column
y = df['Revenue'].astype(int)  # Ensure it's in integer format (0 or 1)

# Check class distribution
print("\nClass Distribution Before Balancing:\n", y.value_counts())

# Split BEFORE any resampling. Stratifying on the original labels keeps the
# real-world class ratio in the test set, so the metrics below describe how
# the models behave on untouched data rather than on synthetic samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Balance the TRAINING partition only, and only if it is actually imbalanced.
# Running SMOTE on the full dataset before the split would leak information:
# synthetic rows are interpolated from their neighbours, so test rows would
# influence training rows (and synthetic rows would end up in the test set),
# inflating every reported score.
train_class_counts = y_train.value_counts()
if train_class_counts.min() / train_class_counts.max() < 0.5:
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    print("\nClass Distribution After SMOTE:\n", y_train.value_counts())

# Train Decision Tree Classifier
dt_clf = DecisionTreeClassifier(max_depth=10, min_samples_split=5, class_weight='balanced', random_state=42)
dt_clf.fit(X_train, y_train)

# Train Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_clf.fit(X_train, y_train)

# Make predictions on the held-out (never resampled) test set
y_pred_dt = dt_clf.predict(X_test)
y_pred_rf = rf_clf.predict(X_test)

# Evaluate Decision Tree
print("\nDecision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nDecision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))

# Evaluate Random Forest
print("\nRandom Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

# Confusion matrix visualization (rows = actual class, columns = predicted class)
plt.figure(figsize=(6, 5))
sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Purchase', 'Purchase'], yticklabels=['No Purchase', 'Purchase'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Random Forest")
plt.show()
