In [1]:
import pandas as pd

# Read the three cleaned CSV files into their own DataFrames.
df_marketing = pd.read_csv("marketing_campaign_cleaned.csv")
df_churn = pd.read_csv("customer_churn_cleaned.csv")
df_conversion = pd.read_csv("predict_conversion_cleaned.csv")

# Print each frame's column names under a labelled heading so the available
# feature and target columns can be inspected before modelling.
for heading, frame in (
    ("\n[Marketing Campaign] Columns:", df_marketing),
    ("\n[Customer Churn] Columns:", df_churn),
    ("\n[Predict Conversion] Columns:", df_conversion),
):
    print(heading)
    print(list(frame.columns))



[Marketing Campaign] Columns:
['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response']

[Customer Churn] Columns:
['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Subscription Type', 'Contract Length', 'Total Spend', 'Last Interaction', 'Churn']

[Predict Conversion] Columns:
['CustomerID', 'Age', 'Gender', 'Income', 'CampaignChannel', 'CampaignType', 'AdSpend', 'ClickThroughRate', 'ConversionRate', 'WebsiteVisits', 'PagesPerVisit', 'TimeOnSite', 'SocialShares', 'EmailOpens', 'EmailClicks', 'PreviousPurchases', 'LoyaltyPoints', 'AdvertisingPlatform', 'Ad

### Marketing Campaign Dataset Target = Response

In [None]:
# Marketing Campaign Logistic Regression
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# --- Data ---------------------------------------------------------------
# "Response" is the binary label (0 = no response, 1 = responded).
target = "Response"

# Load the cleaned file and leave the ID and date (Dt_Customer) columns out of
# the modelling frame. errors="ignore" makes the drop a no-op for any of these
# columns that is not present.
df_marketing = pd.read_csv("marketing_campaign_cleaned.csv").drop(
    columns=["ID", "Dt_Customer"],
    errors="ignore",
)

# The label becomes y; every other remaining column is a feature.
y = df_marketing[target]
X = df_marketing.drop(columns=[target])

# --- Column groups ------------------------------------------------------
# Numeric dtypes go through the numeric branch, everything else through the
# categorical branch.
numeric_cols = list(X.select_dtypes(include="number").columns)
categorical_cols = list(X.select_dtypes(exclude="number").columns)

# --- Preprocessing ------------------------------------------------------
# Numeric branch: median imputation followed by standardisation.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipe, numeric_cols),
    ("cat", categorical_pipe, categorical_cols),
])

# --- Model --------------------------------------------------------------
log_reg = LogisticRegression(solver="liblinear", max_iter=1000)

# Preprocessing and classifier are chained so the transformers are fitted on
# the training split only.
pipe = Pipeline(steps=[("pre", preprocessor), ("model", log_reg)])

# --- Train / test split -------------------------------------------------
# 80% train / 20% test, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Fit and evaluate ---------------------------------------------------
y_pred = pipe.fit(X_train, y_train).predict(X_test)

for heading, metric in (
    ("\n[Marketing Campaign] Confusion Matrix:", confusion_matrix),
    ("\n[Marketing Campaign] Classification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))



[Marketing Campaign] Confusion Matrix:
[[369   8]
 [ 45  22]]

[Marketing Campaign] Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       377
           1       0.73      0.33      0.45        67

    accuracy                           0.88       444
   macro avg       0.81      0.65      0.69       444
weighted avg       0.87      0.88      0.86       444



### Observations: Marketing Campaign

### Model Performance
- **Accuracy**: 0.88 overall, driven mostly by strong performance on the majority class (non-responders).  
- **Confusion Matrix**:
  - True Negatives (no response predicted correctly): 369  
  - False Positives (predicted response, but no response): 8  
  - False Negatives (missed actual responses): 45  
  - True Positives (response predicted correctly): 22  
- **Precision/Recall**:
  - Class `0` (non-response): Precision = 0.89, Recall = 0.98, F1 = 0.93  
  - Class `1` (response): Precision = 0.73, Recall = 0.33, F1 = 0.45  

### Interpretation
- The model is **highly accurate at identifying non-responders**, but struggles to capture responders: recall for the positive class is only **0.33**, meaning two-thirds of actual responders are missed.  
- This is consistent with the **class imbalance** in the dataset: only 67 of the 444 test rows (about 15%) are responders. With no class weighting and the default 0.5 decision threshold, logistic regression leans toward the dominant class.  
- The relatively high precision (0.73) for responders indicates that when the model does predict a response, it is usually correct — but it does so too rarely.  
- For marketing use cases, this means the model could **conserve resources by avoiding wasted outreach** (low false positives), but at the cost of **missing many potential customers**.  

*Note:* ChatGPT was used to help summarize these observations.



### Customer Churn Dataset — Target = Churn (0 = No, 1 = Yes)



In [None]:
# Customer Churn Logistic Regression
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# "Churn" is the binary label (0 = retained, 1 = churned).
target = "Churn"

# Load the cleaned churn file and remove the CustomerID identifier column;
# errors="ignore" keeps the drop from failing if that column is absent.
df_churn = pd.read_csv("customer_churn_cleaned.csv").drop(
    columns=["CustomerID"], errors="ignore"
)

# Separate the label from the feature columns.
X, y = df_churn.drop(columns=[target]), df_churn[target]

# Partition the feature columns by dtype: numeric vs. everything else.
numeric_cols = list(X.select_dtypes(include="number").columns)
categorical_cols = list(X.select_dtypes(exclude="number").columns)

# Numeric features: fill gaps with the median, then standardise.
numeric_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Non-numeric features: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time).
categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_cols),
        ("cat", categorical_pipe, categorical_cols),
    ]
)

# Classifier, and the full preprocessing + model pipeline.
log_reg = LogisticRegression(solver="liblinear", max_iter=1000)
pipe = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("model", log_reg),
    ]
)

# Hold out 20% of the rows for testing, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit on the training split, then score the held-out split.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("\n[Customer Churn] Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\n[Customer Churn] Classification Report:", classification_report(y_test, y_pred), sep="\n")



[Customer Churn] Confusion Matrix:
[[34492  3675]
 [ 5731 44269]]

[Customer Churn] Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.90      0.88     38167
         1.0       0.92      0.89      0.90     50000

    accuracy                           0.89     88167
   macro avg       0.89      0.89      0.89     88167
weighted avg       0.89      0.89      0.89     88167



### Observations: Customer Churn

### Model Performance
- **Accuracy**: 0.89 overall, showing strong predictive performance.  
- **Confusion Matrix**:
  - True Negatives (retained predicted correctly): 34,492  
  - False Positives (predicted churn, but retained): 3,675  
  - False Negatives (missed churners): 5,731  
  - True Positives (churn predicted correctly): 44,269  
- **Precision/Recall**:
  - Class `0` (retained): Precision = 0.86, Recall = 0.90, F1 = 0.88  
  - Class `1` (churn): Precision = 0.92, Recall = 0.89, F1 = 0.90  

### Interpretation
- The model performs well on both classes, with **balanced precision and recall**.  
- It captures most churners (recall = 0.89) while also being precise when flagging them (precision = 0.92).  
- The model misses ~5,700 churners, but correctly identifies ~44,000, making it highly useful for **retention strategies**.  
- Overall accuracy is similar to the **Marketing Campaign** model (0.89 vs. 0.88), but positive-class recall is far higher (0.89 vs. 0.33), suggesting that churn is **more structured and predictable** in this dataset.  

*Note:* ChatGPT was used to help with the observation summary.

### Predict Conversion Dataset — Target = Conversion

In [None]:
# Predict Conversion Logistic Regression 
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# "Conversion" is the binary label (0 = no conversion, 1 = converted).
target = "Conversion"

# Read the cleaned conversion data, then discard the CustomerID identifier
# (silently skipped when the column is not present).
df_conversion = pd.read_csv("predict_conversion_cleaned.csv")
df_conversion = df_conversion.drop(columns=["CustomerID"], errors="ignore")

# Label vector and feature matrix.
y = df_conversion[target]
X = df_conversion.drop(columns=target)

# Column names grouped by dtype: numeric columns vs. all remaining columns.
numeric_cols = X.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X.select_dtypes(exclude=np.number).columns.tolist()

# Preprocessing branches:
#   numeric     -> median imputation, then standard scaling
#   categorical -> most-frequent imputation, then one-hot encoding that
#                  ignores categories not seen during fit
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipe, numeric_cols),
    ("cat", categorical_pipe, categorical_cols),
])

# Logistic regression classifier placed behind the preprocessor.
log_reg = LogisticRegression(solver="liblinear", max_iter=1000)
pipe = Pipeline(steps=[("pre", preprocessor), ("model", log_reg)])

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Train, predict on the held-out rows, and report the results.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

for heading, metric in (
    ("\n[Predict Conversion] Confusion Matrix:", confusion_matrix),
    ("\n[Predict Conversion] Classification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))



[Predict Conversion] Confusion Matrix:
[[  34  164]
 [  10 1392]]

[Predict Conversion] Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.17      0.28       198
           1       0.89      0.99      0.94      1402

    accuracy                           0.89      1600
   macro avg       0.83      0.58      0.61      1600
weighted avg       0.88      0.89      0.86      1600



### Observations: Predict Conversion

### Model Performance
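- **Accuracy**: 0.89 overall, driven heavily by the dominant positive class (converted): 1,402 of the 1,600 test rows (about 88%) are conversions, so always predicting a conversion would already reach roughly 0.88 accuracy.  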
- **Confusion Matrix**:
  - True Negatives (no conversion predicted correctly): 34  
  - False Positives (predicted conversion, but no conversion): 164  
  - False Negatives (missed conversions): 10  
  - True Positives (conversion predicted correctly): 1,392  
- **Precision/Recall**:
  - Class `0` (no conversion): Precision = 0.77, Recall = 0.17, F1 = 0.28  
  - Class `1` (conversion): Precision = 0.89, Recall = 0.99, F1 = 0.94  

### Interpretation
- The model is **extremely effective at detecting conversions** (recall = 0.99) while maintaining high precision (0.89).  
- However, it performs poorly on the minority class (no conversions): recall = 0.17 means most non-converters are misclassified as converters.  
- This imbalance skews the overall accuracy upward but **limits business utility**, since identifying non-converting customers is also valuable for targeted strategies.  
- Compared to the Marketing Campaign and Churn models, this model shows the strongest **bias toward the majority class** (minority-class recall of 0.17, versus 0.33 for Marketing Campaign), highlighting the challenges of imbalanced data.  

*Note:* ChatGPT was used to help with the observation summary section.
