# Kickstarter Success Prediction with SMOTE, Pipelines, Logistic Regression & ANN

This notebook demonstrates a full ML workflow for Kickstarter campaign success prediction using both classical and deep learning models.

## 1️⃣ Imports

In [None]:
# 1️⃣ Imports
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

## 2️⃣ Load Data

In [None]:
# 2️⃣ Load Data
# The dataset location is machine-specific. Allow it to be overridden through the
# KICKSTARTER_DATA_PATH environment variable so the notebook can run on other
# machines; the original absolute path is kept as the default for compatibility.
DEFAULT_DATA_PATH = r'E:\ML_Project\kickstarter-success-prediction\data\ks-projects-201801.csv'
data = pd.read_csv(os.environ.get('KICKSTARTER_DATA_PATH', DEFAULT_DATA_PATH))

## 3️⃣ Initial Cleaning

In [None]:
# 3️⃣ Initial Cleaning
# Columns removed before modelling:
#   - identifier / free-text columns that are not used as features
#   - leakage columns, i.e. information only available after the campaign ends
identifier_cols = ['ID', 'name']
leakage_cols = ['pledged', 'usd pledged', 'usd_pledged_real', 'backers']

# Keep only campaigns whose final state is failed or successful, drop the
# unwanted columns in a single pass, and renumber the rows from zero.
has_final_state = data['state'].isin(['failed', 'successful'])
data = (
    data.loc[has_final_state]
    .drop(columns=identifier_cols + leakage_cols)
    .reset_index(drop=True)
)

## 4️⃣ Feature Engineering

In [None]:
# 4️⃣ Feature Engineering

# Parse the raw timestamp columns once; the parsed Series are only used to
# derive features and are never stored on the frame.
launch_ts = pd.to_datetime(data['launched'])
deadline_ts = pd.to_datetime(data['deadline'])

# Derived features:
#   duration_days    - whole days between launch and deadline
#   launch_dayofweek - day of week of the launch (pandas convention: Monday=0)
#   launch_hour      - hour of day of the launch
# The original date columns are dropped once the features exist.
data = data.assign(
    duration_days=(deadline_ts - launch_ts).dt.days,
    launch_dayofweek=launch_ts.dt.dayofweek,
    launch_hour=launch_ts.dt.hour,
).drop(columns=['launched', 'deadline'])

# Binary target: 1 for 'successful', 0 for anything else.
# int64 is requested explicitly so the dtype is the same on every platform.
data['state'] = (data['state'] == 'successful').astype('int64')

## 5️⃣ Separate Features & Target

In [None]:
# 5️⃣ Separate Features & Target
# Split the frame into the label vector (y) and the feature matrix (X).
target_col = 'state'
y = data[target_col]
X = data.drop(columns=[target_col])

## 6️⃣ Identify categorical and numeric columns

In [None]:
# 6️⃣ Identify categorical and numeric columns
# Columns that will be one-hot encoded.
cat_cols = ['category', 'main_category', 'currency', 'country']
# Every remaining feature column is treated as numeric (original column order kept).
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

## 7️⃣ Train-Test Split

In [None]:
# 7️⃣ Train-Test Split
# 70/30 hold-out split. Stratifying on y keeps the class ratio the same in both
# parts, and the fixed seed makes the split repeatable.
split_options = dict(train_size=0.7, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## 8️⃣ Preprocessing Pipeline (OneHot + Scaling)

In [None]:
# 8️⃣ Preprocessing Pipeline (OneHot + Scaling)
# Build the one-hot encoder so that it returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed `sparse` entirely, so try the new name first and fall back to the old
# one on releases that do not know it yet (unknown keyword -> TypeError).
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Numeric columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', onehot_encoder, cat_cols)
])

In [None]:
# Fit the preprocessor on the training rows only, then apply that same fitted
# transformation to the test rows (the tuple is evaluated left to right, so the
# fit happens before the test transform).
X_train_preproc, X_test_preproc = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

## 9️⃣ Apply SMOTE to balance classes after preprocessing

In [None]:
# 9️⃣ Apply SMOTE to balance classes after preprocessing
# SMOTE is applied to the already-preprocessed training matrix only; the test
# matrix is left untouched so evaluation uses the original class distribution.
# NOTE(review): the matrix contains one-hot columns, and plain SMOTE synthesises
# rows by interpolation, so synthetic rows may carry fractional dummy values -
# confirm this is acceptable or consider a categorical-aware sampler.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train_preproc, y_train)

# Per-class row counts before and after resampling.
class_counts_before = np.bincount(y_train)
class_counts_after = np.bincount(y_train_bal)
print(f'Before SMOTE: {class_counts_before}')
print(f'After SMOTE: {class_counts_after}')

## 10️⃣ Logistic Regression Model

In [None]:
# 10️⃣ Logistic Regression Model

# Fit logistic regression on the SMOTE-balanced training matrix.
lr_model = LogisticRegression(max_iter=500, random_state=42).fit(X_train_bal, y_train_bal)

# Score the untouched test matrix: positive-class probabilities and hard labels.
y_pred_prob_lr = lr_model.predict_proba(X_test_preproc)[:, 1]
y_pred_lr = lr_model.predict(X_test_preproc)

print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))

# Confusion matrix heatmap (rows = actual class, columns = predicted class).
cm_lr = confusion_matrix(y_test, y_pred_lr)
fig_cm_lr, ax_cm_lr = plt.subplots()
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', ax=ax_cm_lr)
ax_cm_lr.set_title('Logistic Regression Confusion Matrix')
ax_cm_lr.set_xlabel('Predicted')
ax_cm_lr.set_ylabel('Actual')
plt.show()

# ROC curve and area under it; fpr_lr / tpr_lr / roc_auc_lr are reused later
# for the model comparison plot.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_prob_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)

fig_roc_lr, ax_roc_lr = plt.subplots()
ax_roc_lr.plot(fpr_lr, tpr_lr, label=f'LogReg AUC = {roc_auc_lr:.3f}')
ax_roc_lr.plot([0, 1], [0, 1], '--', color='gray')  # chance-level diagonal
ax_roc_lr.set_title('Logistic Regression ROC Curve')
ax_roc_lr.set_xlabel('False Positive Rate')
ax_roc_lr.set_ylabel('True Positive Rate')
ax_roc_lr.legend()
ax_roc_lr.grid()
plt.show()

## 11️⃣ Build and Train ANN Model

In [None]:
# 11️⃣ Build and Train ANN Model

# Build, compile and train a small fully connected binary classifier on the
# SMOTE-balanced training matrix.

# Number of input features = number of columns in the preprocessed matrix.
input_shape = X_train_bal.shape[1]

# Two hidden ReLU layers with dropout, sigmoid output for the positive-class probability.
ann_model = models.Sequential([
    layers.InputLayer(input_shape=(input_shape,)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

ann_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# Stop when validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Keras' `validation_split` takes the LAST fraction of the rows, before any
# shuffling. The SMOTE output is not shuffled (synthetic minority rows are
# expected to sit at the end - verify for your imbalanced-learn version), so a
# tail slice would be dominated by one class and would bias val_loss and early
# stopping. Use an explicit shuffled, stratified 80/20 hold-out instead.
# NOTE(review): the hold-out still contains synthetic rows; carving the
# validation set out before SMOTE would give a cleaner estimate.
X_fit_ann, X_val_ann, y_fit_ann, y_val_ann = train_test_split(
    X_train_bal,
    np.asarray(y_train_bal),
    test_size=0.2,
    stratify=y_train_bal,
    random_state=42
)

history = ann_model.fit(
    X_fit_ann,
    y_fit_ann,
    validation_data=(X_val_ann, y_val_ann),
    epochs=100,
    batch_size=64,
    callbacks=[early_stop],
    verbose=2
)

## 12️⃣ ANN Evaluate on Test Set

In [None]:
# 12️⃣ ANN Evaluate on Test Set

# Positive-class probabilities for the test matrix, flattened to 1-D, then
# thresholded at 0.5 to get hard labels.
ann_test_scores = ann_model.predict(X_test_preproc)
y_pred_prob_ann = ann_test_scores.ravel()
y_pred_ann = (y_pred_prob_ann > 0.5).astype(int)

print("ANN Classification Report:")
print(classification_report(y_test, y_pred_ann))

# Confusion matrix heatmap (rows = actual class, columns = predicted class).
cm_ann = confusion_matrix(y_test, y_pred_ann)
fig_cm_ann, ax_cm_ann = plt.subplots()
sns.heatmap(cm_ann, annot=True, fmt='d', cmap='Greens', ax=ax_cm_ann)
ax_cm_ann.set_title('ANN Confusion Matrix')
ax_cm_ann.set_xlabel('Predicted')
ax_cm_ann.set_ylabel('Actual')
plt.show()

# ROC curve for the ANN, drawn together with the logistic regression curve
# computed in the earlier cell (fpr_lr, tpr_lr, roc_auc_lr).
fpr_ann, tpr_ann, _ = roc_curve(y_test, y_pred_prob_ann)
roc_auc_ann = auc(fpr_ann, tpr_ann)

fig_roc_cmp, ax_roc_cmp = plt.subplots()
ax_roc_cmp.plot(fpr_ann, tpr_ann, label=f'ANN AUC = {roc_auc_ann:.3f}', color='green')
ax_roc_cmp.plot(fpr_lr, tpr_lr, label=f'LogReg AUC = {roc_auc_lr:.3f}', color='blue', linestyle='--')
ax_roc_cmp.plot([0, 1], [0, 1], '--', color='gray')  # chance-level diagonal
ax_roc_cmp.set_title('ROC Curve Comparison')
ax_roc_cmp.set_xlabel('False Positive Rate')
ax_roc_cmp.set_ylabel('True Positive Rate')
ax_roc_cmp.legend()
ax_roc_cmp.grid()
plt.show()

## 13️⃣ Plot ANN Training History

In [None]:
# 13️⃣ Plot ANN Training History
# Build the per-epoch metrics table once and draw one figure per metric pair
# (training curve vs. validation curve).
history_df = pd.DataFrame(history.history)

for metric_columns, plot_title in (
    (['loss', 'val_loss'], 'ANN Loss'),
    (['accuracy', 'val_accuracy'], 'ANN Accuracy'),
):
    history_df[metric_columns].plot(title=plot_title)
    plt.show()