In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import xgboost as xgb

# Location of the raw email dataset.
# NOTE(review): 'path\email.csv' looks like a placeholder — point it at the real file.
file_path = r'path\email.csv'

# Load the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Remove pandas' display limits (None = unlimited) so frames render without
# truncated columns, cell text, or rows.
for option_name in ('display.max_columns', 'display.max_colwidth', 'display.max_rows'):
    pd.set_option(option_name, None)


In [None]:
# Preview the first rows of the freshly loaded frame (rendered as the cell output).
df.head()

In [None]:
# Print a per-column summary (dtype, non-null count) to spot missing data and mistyped columns.
df.info()

In [None]:
# Show summary statistics for the numeric columns (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# Step 2: Data Cleaning
## Drop Email Columns (Not Useful for ML)
# errors='ignore' makes this a no-op when the column is absent (or the cell is re-run).
df = df.drop(columns=['email'], errors='ignore')

## Convert Date Columns to Numeric Features
# Capture "now" once so every date feature is measured against the same instant
# instead of a slightly different timestamp per column.
# NOTE(review): the features still depend on the day the notebook is run; pin a
# fixed reference date here if reproducible scores are required.
reference_time = pd.Timestamp.today()
derived_suffix = '_days_since_purchased'
# Skip already-derived features: their names contain "date" too, so re-running the
# cell would otherwise try to parse the integer day counts as dates.
date_cols = [
    col for col in df.columns
    if 'date' in col.lower() and not col.endswith(derived_suffix)
]
for col in date_cols:
    parsed = pd.to_datetime(df[col], errors='coerce')  # unparseable values become NaT
    df[col + derived_suffix] = (reference_time - parsed).dt.days  # whole days elapsed
    df = df.drop(columns=[col])  # raw date column is replaced by the numeric feature

## Handle Missing Values
df = df.fillna(df.median(numeric_only=True))  # Fill numerical with median
df = df.fillna("Unknown")  # Fill whatever is still missing (categorical) with "Unknown"


## Remove Duplicates
df = df.drop_duplicates()

print("Data Shape After Cleaning:", df.shape)

In [None]:
## Handle Outliers (Winsorization method)
# Cap every numeric feature at its 1st / 99th percentile.
# The target column is deliberately skipped: for a rare positive class (< 1% of
# rows) the 99th percentile of a 0/1 label is 0, so clipping would erase every
# positive example.
target_col = 'TRANSACTION_DONE'
lower_q, upper_q = 0.01, 0.99
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    if col == target_col:
        continue
    # Both bounds come from the untouched column, computed once, before any clipping.
    lower_bound, upper_bound = df[col].quantile([lower_q, upper_q])
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

In [None]:
# Step 3: Encode Categorical Variables
# The treatment of each text column depends on its number of distinct values:
#   more than 50 -> dropped, 11 to 50 -> one-hot encoded, 10 or fewer -> integer codes.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    n_levels = df[col].nunique()

    if n_levels > 50:
        # Too many levels to encode usefully; remove the column.
        df = df.drop(columns=[col])
        continue

    if n_levels > 10:
        # Medium cardinality: expand into indicator columns (first level dropped).
        df = pd.get_dummies(df, columns=[col], drop_first=True)
        continue

    # Low cardinality: replace each level with an integer code.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
print("Data Shape After Encoding:", df.shape)

In [None]:
# Step 4: Normalize Numerical Variables
# Standardise the numeric *features* only. The target must keep its original
# 0/1 labels: scaling it would turn the classes into arbitrary floats, which the
# classifier and the evaluation metrics cannot interpret as class labels.
target_col = 'TRANSACTION_DONE'
scaler = StandardScaler()
numerical_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop(target_col, errors='ignore')  # no-op when the target is not numeric
)
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into the features; fit on the training rows only if
# strictly unbiased evaluation is needed.
if len(numerical_cols) > 0:  # StandardScaler raises on an empty feature set
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [None]:
# Step 5: Split Data
target_col = 'TRANSACTION_DONE'
y = df[target_col]  # Target variable (1 = purchase, 0 = no purchase)

# Feature matrix: every column except the target. 'propensity_score' is also
# excluded so that re-running this cell after the scoring step does not feed the
# model's own output back in as a feature.
X = df.drop(columns=[target_col, 'propensity_score'], errors='ignore')

# 70/30 split; stratify keeps the purchase rate the same in both partitions,
# which matters when purchases are rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)

In [None]:
# Step 6: Train Model (XGBoost)
# Hyper-parameters are collected in one place so they are easy to tweak.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
}
model = xgb.XGBClassifier(**xgb_params)
# Fit on the training partition; the fitted estimator is shown as the cell output.
model.fit(X_train, y_train)

In [None]:
# Step 7: Evaluate Model
# Hard class predictions feed accuracy; positive-class probabilities feed AUC-ROC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]  # column 1 = probability of class 1

# Each metric is computed and printed in turn against the held-out labels.
for label, metric_fn, scores in (
    ("Accuracy:", accuracy_score, y_pred),
    ("AUC-ROC:", roc_auc_score, y_prob),
):
    print(label, metric_fn(y_test, scores))

In [None]:
# Confusion Matrix Plot
# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
cm = confusion_matrix(y_test, y_pred)  # rows = actual class, columns = predicted class
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Step 8: Compute Propensity Score
# Score every row using exactly the feature columns the model was fitted on.
# Selecting them by name from df (rather than reusing a separate feature frame)
# keeps this cell safe to re-run: the 'propensity_score' column added below is
# never passed back into the model.
# NOTE(review): rows that were part of the training split receive in-sample
# scores, which tend to be optimistic.
feature_cols = X_train.columns
df['propensity_score'] = model.predict_proba(df[feature_cols])[:, 1]  # P(class 1)
df.head()

In [None]:
# Save the results
# Write the processed frame, including the propensity_score column if present,
# to a CSV in the working directory; the row index is not written.
output_path = "propensity_scores.csv"
df.to_csv(output_path, index=False)
print("Propensity scores saved!")
