In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, _tree, plot_tree
# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# Load data: read both CSV files from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


: 

In [None]:
# Report the (rows, columns) shape of each split.
for split_label, split_frame in (("Train", train), ("Test", test)):
    print(f"{split_label} shape:", split_frame.shape)

In [None]:
# EDA: preview the first rows of the training data.
# Leaving the frame as the cell's last expression lets Jupyter render it as a
# rich table instead of the plain-text dump produced by print().
train.head()

In [None]:
# Count missing cells across every column of each split.
train_missing = train.isna().sum().sum()
test_missing = test.isna().sum().sum()

print("MISSING VALUES:")
for split_label, n_missing in (("Train", train_missing), ("Test", test_missing)):
    print(f"{split_label} total missing: {n_missing}")

In [None]:
# For each split that has missing cells, list the affected columns and their counts.
missing_reports = (
    (train, train_missing, "Train columns with missing values:"),
    (test, test_missing, "\nTest columns with missing values:"),
)
for split_frame, split_missing, header in missing_reports:
    if split_missing > 0:
        print(header)
        missing_cols = split_frame.isnull().sum()
        print(missing_cols[missing_cols > 0])

# Filling in the missing values with median if any exists
if train_missing > 0 or test_missing > 0:
    print(" Filling missing values with median...")
    # numeric_only=True: the frames carry an identifier column (ID_code) that is
    # presumably non-numeric, and pandas >= 2.0 raises TypeError when median()
    # meets a non-numeric column instead of silently skipping it.
    train_medians = train.median(numeric_only=True)
    # Impute BOTH splits with the training medians so no statistic derived from
    # the test set leaks into preprocessing. Columns that exist only in train
    # (e.g. target) are simply ignored when filling test.
    train = train.fillna(train_medians)
    test = test.fillna(train_medians)
    print(" Missing values filled")
else:
    print(" No missing values found")

In [None]:
# Explore the data more: class counts and the share of positive rows.

target_column = train['target']

print("\nTARGET DISTRIBUTION:")
print(target_column.value_counts())
# The mean of the target column is reported as the transaction rate.
print(f"Transaction rate: {target_column.mean():.2%}")

In [None]:
# Keep only the training rows whose target equals 1 (the sales).

train_sale_subset = train.loc[train["target"].eq(1)]

In [None]:
#train.hist()

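About 10% of customers made a transaction, so the classes are imbalanced. Accuracy would be misleading here (always predicting "no transaction" would already score about 90%), so we use ROC AUC as the evaluation metric instead.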

In [None]:
# Keep the row identifiers of both splits before they are dropped from the features.
train_ids, test_ids = train['ID_code'], test['ID_code']

In [None]:
# Separate the label vector from the feature matrix, and strip the identifier
# column from the test frame.
y = train['target']
X = train.drop(columns=['ID_code', 'target'])
X_test = test.drop(columns='ID_code')

In [None]:
# Feature engineering: row-wise summary statistics over the original features.

# Names of the engineered columns. They are excluded from the base set so that
# re-running this cell never folds them back into the statistics.
engineered_cols = ['mean', 'std', 'max', 'min']
base_cols = [col for col in X.columns if col not in engineered_cols]

# Every statistic is computed from the same untouched base columns. Assigning
# X['mean'] and then calling X.std(axis=1) on the whole frame would include the
# freshly added 'mean' column in the std (and likewise for max/min).
# X_test receives the identical columns so it stays compatible with any model
# fitted on X.
# NOTE(review): assumes X_test contains the same base columns as X - confirm.
for feature_frame in (X, X_test):
    base_values = feature_frame[base_cols]
    feature_frame['mean'] = base_values.mean(axis=1)
    feature_frame['std'] = base_values.std(axis=1)
    feature_frame['max'] = base_values.max(axis=1)
    feature_frame['min'] = base_values.min(axis=1)


print(f"Original features: {len(base_cols)}")
print(f"Features: {X.shape[1]}")
print(f"Added: mean, std, max, min")

In [None]:
# Rank every feature by the absolute value of its correlation with the target.
absolute_corr = X.corrwith(y).abs()
correlations = absolute_corr.sort_values(ascending=False)

print("Top 20 features by correlation with target:")
print(correlations.iloc[:20])

The features listed above have the strongest absolute correlation with the target variable.

In [None]:
# Visualizations: class balance plus class-wise histograms of three features.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

class_labels = ['No Transaction', 'Transaction']

# Target distribution. value_counts() orders by frequency rather than by class
# label, so reindex on the labels (0, 1) to guarantee each count sits under the
# matching hard-coded bar label.
class_counts = y.value_counts().reindex([0, 1], fill_value=0)
axes[0, 0].bar(class_labels, class_counts.values)
axes[0, 0].set_title('Target Distribution')
axes[0, 0].set_ylabel('Count')

# Top correlated feature
top_feature = correlations.index[0]

# One histogram panel per feature, split by class: the top correlated feature,
# the engineered row mean, and the engineered row standard deviation.
histogram_panels = [
    (axes[0, 1], top_feature, f'{top_feature} by Target'),
    (axes[1, 0], 'mean', 'Mean Feature by Target'),
    (axes[1, 1], 'std', 'Std Feature by Target'),
]
for panel_ax, column, panel_title in histogram_panels:
    # .loc selects only the needed column instead of copying the whole frame.
    panel_ax.hist(
        [X.loc[y == 0, column], X.loc[y == 1, column]],
        label=class_labels,
        bins=30,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(column)
    panel_ax.set_ylabel('Count')
    panel_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Heatmap of the pairwise correlations among the ten features most correlated
# with the target.
top_10 = correlations.index[:10]
corr_matrix = X[top_10].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap - Top 10 Features')

In [None]:
# Hold out a validation set, then draw a smaller subset for the grid searches.

# Split for validation. stratify=y keeps the share of transactions the same in
# the training and validation parts; the target is imbalanced, and the subset
# drawn below is stratified too.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")


# Using subset of data for faster results
subset_size = 10000  # Use only 10k samples for grid search
X_train_subset, _, y_train_subset, _ = train_test_split(
    X_train, y_train, train_size=subset_size, random_state=42, stratify=y_train
)
print(f"\nUsing subset of {subset_size} samples for grid search")

In [None]:
# Scaling -> PCA -> logistic regression, tuned jointly by the grid search.
# TODO: rename; despite its name this pipeline also contains the classifier,
# not only preprocessing steps.

preprocessing_pipe = Pipeline([
    ('scaler', StandardScaler()),  # placeholder; the grid swaps in other scalers or None
    # The PCA step itself is kept; only its n_components is tuned. random_state
    # is set because PCA's default 'auto' solver can pick a randomized SVD
    # depending on data shape and n_components, which would otherwise make
    # results vary between runs.
    ('pca', PCA(random_state=42)),
    ('model', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
])

# Define parameter grid
param_grid = {
    'scaler': [StandardScaler(), MinMaxScaler(), None],  # None = no scaling step
    'pca__n_components': [None, 5, 10, 25, 50, 100],  # None = keep all components
    'model__C': [0.1, 1]
}


In [None]:
# Configure (but do not yet run) the grid search: 3-fold cross-validation
# scored by ROC AUC, verbose progress output, all available cores.
grid_search = GridSearchCV(
    estimator=preprocessing_pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:

# Run the search on the 10k-sample subset.
grid_search.fit(X_train_subset, y_train_subset)

print("\nBest parameters:")
print(grid_search.best_params_)
print(f"\nBest CV AUC: {grid_search.best_score_:.4f}")

# Score the best pipeline on the complete held-out validation split, using the
# predicted probability of the positive class (second column).
val_positive_proba = grid_search.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_positive_proba)
print(f"Validation AUC: {val_auc:.4f}")

# Keep a handle on the winning pipeline.
best_preprocessing = grid_search.best_estimator_

In [None]:
#Trying a Decision Tree Classifier 
#Based off code from my (Mia's) lab 11, and Aarya's code above 

# random_state is fixed on both stochastic steps so that the search result is
# reproducible: the tree permutes candidate features at each split, and PCA's
# default 'auto' solver can pick a randomized SVD.
tree_pipe = Pipeline([
    ('scaler', StandardScaler()),  
    ('pca', PCA(random_state=42)),
    ('model', DecisionTreeClassifier(random_state=42))
])

tree_parms = {
    'model__max_depth': [2,3,4],
    'model__min_samples_split': [10,20,40],
    'scaler': [StandardScaler(), MinMaxScaler(), None],
    'pca__n_components': [None, 5, 10, 25, 50, 100]
}

# 162 parameter combinations x 5 folds; n_jobs=-1 runs the fits in parallel,
# as the logistic-regression grid search does.
tree_grid_res = GridSearchCV(
    estimator = tree_pipe, 
    param_grid = tree_parms, 
    cv=5, 
    scoring = 'roc_auc',
    n_jobs=-1).fit(X_train_subset, y_train_subset)

print(tree_grid_res.best_estimator_)
print(tree_grid_res.best_score_)

# Keep the tuned, fitted pipeline for later use.
sale_tree = tree_grid_res.best_estimator_

In [None]:
# Fit one fixed configuration (5 principal components, depth-4 tree) on the
# grid-search subset so the resulting tree can be plotted. random_state is set
# on both stochastic steps so the fitted tree is the same on every run.
tree_pipe_2 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5, random_state=42)),
    ('model', DecisionTreeClassifier(max_depth=4, min_samples_split=10, random_state=42))
]).fit(X_train_subset, y_train_subset)

# NOTE: sale_tree is intentionally NOT reassigned here. It already holds the
# tuned, fitted pipeline from tree_grid_res.best_estimator_; rebinding it to a
# fresh DecisionTreeClassifier(max_depth=4) would replace that with an unfitted
# estimator that cannot predict.

In [None]:
#Decision tree diagram
#Code adapted from lab 7 
#Used google's AI overview to debug this code

tree_model = tree_pipe_2.named_steps['model']

# The tree is fitted on the output of the PCA step, so its input features are
# principal components, not the original columns. Labelling the plot with
# X_train_subset.columns would show component i under the name of raw column i.
n_pca_components = tree_pipe_2.named_steps['pca'].n_components_
pca_feature_names = [f"pca{i}" for i in range(n_pca_components)]

plot_tree(
    tree_model,
    feature_names = pca_feature_names,
)
plt.show()

In [None]:
# Show the estimator's repr, followed by one empty line.

print(tree_model, end="\n\n")

In [None]:
#Gonna try re-doing this with less splits 
# Same pipeline as before but with a depth-3 tree. random_state is set on both
# stochastic steps (PCA solver, tree feature permutation) for reproducibility.
tree_pipe_3 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5, random_state=42)),
    ('model', DecisionTreeClassifier(max_depth=3, min_samples_split=10, random_state=42))
]).fit(X_train_subset, y_train_subset)

In [None]:
#Decision tree diagram
#Code adapted from lab 7 
#Used google's AI overview to debug this code

tree_model = tree_pipe_3.named_steps['model']

# The tree is fitted on the output of the PCA step, so its input features are
# principal components, not the original columns. Labelling the plot with
# X_train_subset.columns would show component i under the name of raw column i.
n_pca_components = tree_pipe_3.named_steps['pca'].n_components_
pca_feature_names = [f"pca{i}" for i in range(n_pca_components)]

plot_tree(
    tree_model,
    feature_names = pca_feature_names,
)
plt.show()

# NOTE: the split features are principal components, not raw input variables,
# so a split on e.g. pca2 cannot be read as "variable 2 is important".

In [None]:
# Same pipeline again with a depth-2 tree. random_state is set on both
# stochastic steps (PCA solver, tree feature permutation) for reproducibility.
tree_pipe_4 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5, random_state=42)),
    ('model', DecisionTreeClassifier(max_depth=2, min_samples_split=10, random_state=42))
]).fit(X_train_subset, y_train_subset)

In [None]:
#Decision tree diagram
#Code adapted from lab 7 
#Used google's AI overview to debug this code

tree_model = tree_pipe_4.named_steps['model']

# The tree is fitted on the output of the PCA step, so its input features are
# principal components, not the original columns. Labelling the plot with
# X_train_subset.columns would show component i under the name of raw column i.
n_pca_components = tree_pipe_4.named_steps['pca'].n_components_
pca_feature_names = [f"pca{i}" for i in range(n_pca_components)]

plot_tree(
    tree_model,
    feature_names = pca_feature_names,
)
plt.show()


# SVM

In [None]:
#SVM: standardize the features, then tune the kernel, C and class weighting.

svc_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC())
])

svc_parms = {
    'model__kernel': ['rbf', 'poly', 'linear'],
    'model__C' :  [0.1, 1, 10], 
    'model__class_weight': [None, 'balanced']
}

# 18 parameter combinations x 3 folds; n_jobs=-1 runs the fits in parallel,
# as the logistic-regression grid search does.
svc_grid_res = GridSearchCV(
    estimator = svc_pipe, 
    param_grid = svc_parms, 
    cv=3,
    scoring = 'roc_auc',
    n_jobs=-1).fit(X_train_subset, y_train_subset)



print(svc_grid_res.best_estimator_)
print(svc_grid_res.best_score_)