In [9]:
# Imports
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score


In [3]:
# Read the two raw CSV inputs (media contacts and socio-demographics) from ../data.
# NOTE(review): the merged frame displayed further down in this notebook shows an
# 'Unnamed: 0' column and '#NULL!' strings in `weight` -- presumably a saved CSV
# index and a missing-value sentinel. Consider `index_col` / `na_values` here
# once downstream usage has been checked. TODO confirm.
media_contacts_csv = "../data/media_contacts.csv"
socio_demos_csv = "../data/socio_demos.csv"

media_contacts = pd.read_csv(media_contacts_csv)
socio_demos = pd.read_csv(socio_demos_csv)

In [4]:
# Join media contacts to socio-demographics on the person identifier.
# The key is spelled differently in each frame ("PERSON ID" vs "Person ID"),
# so both key columns are kept in the result. No `how` is passed, so pandas
# uses its default inner join: rows without a match on either side are dropped.
data = media_contacts.merge(
    socio_demos,
    left_on="PERSON ID",
    right_on="Person ID",
)

In [5]:
# Feature Engineering
# Age is approximated as a calendar-year difference against a fixed reference
# year (month/day of birth are ignored). Keeping the year as a named constant
# makes the assumption visible and keeps results reproducible across re-runs.
REFERENCE_YEAR = 2025
data['Age'] = REFERENCE_YEAR - pd.to_datetime(data['BIRTHDAY'], format='%Y%m%d').dt.year

# Bucket ages into labelled groups. Intervals are right-closed: (17, 24] -> '18-24', etc.
# - include_lowest=True keeps age 0 inside 'Under 18' instead of turning it into NaN.
# - The open-ended top edge keeps ages above 100 inside '65+' instead of NaN.
# Rows with a missing Age (unparseable/missing BIRTHDAY) still get a NaN Age_Group.
data['Age_Group'] = pd.cut(
    data['Age'],
    bins=[0, 17, 24, 34, 44, 54, 64, float('inf')],
    labels=['Under 18', '18-24', '25-34', '35-44', '45-54', '55-64', '65+'],
    include_lowest=True,
)

# Row-wise sum of the channel-level contact totals; `sum(axis=1)` skips NaN by default.
data['Total_Media_Exposure'] = data[
    ['TV_Total', 'Print_Total', 'Online Total', 'TikTok', 'YouTube_Total']
].sum(axis=1)

In [8]:
# Show the merged frame with the engineered columns (Age, Age_Group,
# Total_Media_Exposure) as this cell's rich output; the notebook display
# truncates long/wide frames with "..." rather than printing every row.
data

Unnamed: 0,PERSON ID,TV_Total,FLYERS,Print_Total,Online_Video,Online_Display,Online Total,TikTok,YouTube_Total,YouTube_Mobile,...,Purchase,Person ID,weight,BIRTHDAY,Gender,Number_of children,People_in_Household,Age,Age_Group,Total_Media_Exposure
0,5326201,0.0,0.0,0.0,10.0,4.0,14.0,0.0,0.0,0.0,...,1,5326201,11032.10,19971001.0,male,0,2-HH,28.0,25-34,14.0
1,256895,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,...,1,256895,2581.77,19730701.0,female,0,2-HH,52.0,45-54,3.0
2,7894945,4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1,7894945,8219.48,19870201.0,male,1 child,3-HH,38.0,35-44,5.0
3,797459,0.0,0.0,0.0,11.0,2.0,13.0,0.0,1.0,1.0,...,1,797459,6040.79,19870401.0,male,2 Children,4-HH,38.0,35-44,14.0
4,5529182,0.0,0.0,0.0,7.0,3.0,10.0,0.0,0.0,0.0,...,0,5529182,11635.16,19971001.0,male,0,2-HH,28.0,25-34,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16043,78420591,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,78420591,6023.43,19460901.0,male,0,1-HH (male),79.0,65+,32.0
16044,1866126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1866126,#NULL!,19710901.0,male,0,3-HH,54.0,45-54,0.0
16045,1728358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1728358,#NULL!,19680201.0,male,0,2-HH,57.0,55-64,0.0
16046,68190368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,68190368,#NULL!,19440301.0,female,0,1 -HH (female),81.0,65+,0.0


In [10]:
# Prepare data: the target is the purchase flag; predictors are a small set of
# demographic and exposure columns. get_dummies one-hot encodes the
# non-numeric predictors (Gender and Number_of children appear as strings in
# the displayed frame) and drop_first=True drops one level per encoded column.
y = data['Purchase']
feature_columns = ['Age', 'Gender', 'Number_of children', 'Total_Media_Exposure']
X = pd.get_dummies(data[feature_columns], drop_first=True)

# Train/Test Split: hold out 20% of rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Model: default random forest, seeded for repeatability (fit returns the estimator).
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate: per-class precision/recall on the hold-out set, plus ROC-AUC computed
# from the predicted probability of the second class (column index 1).
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

positive_class_scores = model.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, positive_class_scores))

              precision    recall  f1-score   support

           0       0.61      0.68      0.64      1645
           1       0.62      0.54      0.58      1565

    accuracy                           0.61      3210
   macro avg       0.62      0.61      0.61      3210
weighted avg       0.62      0.61      0.61      3210

ROC-AUC: 0.648770113714713
