In [1]:
import pandas as pd

In [3]:
# Load the datasets
customers_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')

In [4]:
engagements_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')
transactions_df = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')

In [6]:
# Display the first few rows of each dataframe to understand their structure
customers_df.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury
1,2,2021-09-08,2023-10-25,,Male,Hillville
2,3,2021-06-01,2022-11-27,,,North Latoyatown
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad
4,5,2022-01-24,2023-06-02,,Male,East Matthewfort


In [7]:
engagements_df.head()

Unnamed: 0,customer_id,number_of_site_visits,number_of_emails_opened,number_of_clicks
0,1,10,15,1
1,2,285,49,51
2,3,192,73,25
3,4,110,30,17
4,5,161,2,7


In [8]:
marketing_df.head()

Unnamed: 0,campaign_id,customer_id,response,promotion_type,campaign_date
0,1,1,No,Buy One Get One,2024-02-29
1,2,1,No,Discount,2024-01-24
2,3,1,No,Free Shipping,2024-03-05
3,4,1,Yes,Buy One Get One,2024-01-10
4,5,2,Yes,Free Shipping,2022-07-08


In [9]:
 transactions_df.head()

Unnamed: 0,transaction_id,customer_id,transaction_date,transaction_amount,product_category
0,1,1,2024-02-03,165.56,Clothing
1,2,1,2024-03-02,699.01,Home Goods
2,3,1,2024-03-12,146.86,Home Goods
3,4,1,2024-01-20,927.46,Electronics
4,5,1,2024-02-25,1395.87,Electronics


In [10]:
# data cleaning
# Convert date columns to datetime format
customers_df['join_date'] = pd.to_datetime(customers_df['join_date'])
customers_df['last_purchase_date'] = pd.to_datetime(customers_df['last_purchase_date'])
transactions_df['transaction_date'] = pd.to_datetime(transactions_df['transaction_date'])
marketing_df['campaign_date'] = pd.to_datetime(marketing_df['campaign_date'])

In [11]:
# data cleaning
# Handle missing values
customers_df['age'].fillna(customers_df['age'].median(), inplace=True)
customers_df['gender'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['age'].fillna(customers_df['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['gender'].fillna('Unknown', inplace=True)


In [14]:
# Future engineering
# Calculate Recency, Frequency, Monetary, and Tenure for each customer
current_date = pd.to_datetime('2024-07-09')

# Recency: Days since the last purchase
customers_df['recency'] = (current_date - customers_df['last_purchase_date']).dt.days

# Frequency: Number of transactions
frequency_df = transactions_df.groupby('customer_id').size().reset_index(name='frequency')

# Monetary: Total amount spent
monetary_df = transactions_df.groupby('customer_id')['transaction_amount'].sum().reset_index(name='monetary')




In [16]:
# Tenure: Days since the first purchase
customers_df['tenure'] = (current_date - customers_df['join_date']).dt.days

In [17]:
# Merging the calculated features back to the customers_df
customers_df = customers_df.merge(frequency_df, on='customer_id', how='left')
customers_df = customers_df.merge(monetary_df, on='customer_id', how='left')

In [18]:
# Fill NaN values in frequency and monetary columns with 0
customers_df['frequency'].fillna(0, inplace=True)
customers_df['monetary'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['frequency'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['monetary'].fillna(0, inplace=True)


In [21]:
customers_df.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,recency,tenure,frequency_x,monetary_x,frequency_y,monetary_y,frequency,monetary
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,114,232,6,3509.48,6,3509.48,6,3509.48
1,2,2021-09-08,2023-10-25,44.0,Male,Hillville,258,1035,9,6081.32,9,6081.32,9,6081.32
2,3,2021-06-01,2022-11-27,44.0,Unknown,North Latoyatown,590,1134,6,1454.87,6,1454.87,6,1454.87
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,677,920,20,7874.68,20,7874.68,20,7874.68
4,5,2022-01-24,2023-06-02,44.0,Male,East Matthewfort,403,897,24,15524.55,24,15524.55,24,15524.55


In [22]:
# Aggregate engagement metrics
engagements_agg = engagements_df.groupby('customer_id').agg({
    'number_of_site_visits': 'sum',
    'number_of_emails_opened': 'sum',
    'number_of_clicks': 'sum'
}).reset_index()

# Merge the aggregated engagement metrics back to the customers_df
customers_df = customers_df.merge(engagements_agg, on='customer_id', how='left')

# Fill NaN values in engagement metrics with 0
customers_df['number_of_site_visits'].fillna(0, inplace=True)
customers_df['number_of_emails_opened'].fillna(0, inplace=True)
customers_df['number_of_clicks'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['number_of_site_visits'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['number_of_emails_opened'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate objec

In [23]:
customers_df.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,recency,tenure,frequency_x,monetary_x,frequency_y,monetary_y,frequency,monetary,number_of_site_visits,number_of_emails_opened,number_of_clicks
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,114,232,6,3509.48,6,3509.48,6,3509.48,10,15,1
1,2,2021-09-08,2023-10-25,44.0,Male,Hillville,258,1035,9,6081.32,9,6081.32,9,6081.32,285,49,51
2,3,2021-06-01,2022-11-27,44.0,Unknown,North Latoyatown,590,1134,6,1454.87,6,1454.87,6,1454.87,192,73,25
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,677,920,20,7874.68,20,7874.68,20,7874.68,110,30,17
4,5,2022-01-24,2023-06-02,44.0,Male,East Matthewfort,403,897,24,15524.55,24,15524.55,24,15524.55,161,2,7


In [24]:
# Aggregate marketing metrics
# Calculate the total number of campaigns each customer was targeted in
campaigns_count = marketing_df.groupby('customer_id').size().reset_index(name='number_of_campaigns')

# Calculate the response rate (percentage of campaigns the customer responded to)
response_rate = marketing_df.groupby('customer_id')['response'].apply(lambda x: (x == 'Yes').mean()).reset_index(name='response_rate')

# Merge the aggregated marketing metrics back to the customers_df
customers_df = customers_df.merge(campaigns_count, on='customer_id', how='left')
customers_df = customers_df.merge(response_rate, on='customer_id', how='left')

# Fill NaN values in marketing metrics with 0
customers_df['number_of_campaigns'].fillna(0, inplace=True)
customers_df['response_rate'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['number_of_campaigns'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers_df['response_rate'].fillna(0, inplace=True)


In [26]:
customers_df.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,recency,tenure,frequency_x,monetary_x,frequency_y,monetary_y,frequency,monetary,number_of_site_visits,number_of_emails_opened,number_of_clicks,number_of_campaigns,response_rate
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,114,232,6,3509.48,6,3509.48,6,3509.48,10,15,1,4,0.25
1,2,2021-09-08,2023-10-25,44.0,Male,Hillville,258,1035,9,6081.32,9,6081.32,9,6081.32,285,49,51,4,0.5
2,3,2021-06-01,2022-11-27,44.0,Unknown,North Latoyatown,590,1134,6,1454.87,6,1454.87,6,1454.87,192,73,25,2,0.5
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,677,920,20,7874.68,20,7874.68,20,7874.68,110,30,17,4,0.25
4,5,2022-01-24,2023-06-02,44.0,Male,East Matthewfort,403,897,24,15524.55,24,15524.55,24,15524.55,161,2,7,4,0.0


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [38]:
import seaborn as sns
import matplotlib.pyplot as plt

In [39]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [48]:
customers_df['monetary_per_year'] = (customers_df['monetary'] / customers_df['tenure'] * 360)

In [53]:
customers_df['visit_per_year'] = (customers_df['number_of_site_visits'] / customers_df['tenure'] * 360)
customers_df['click_per_year'] = (customers_df['number_of_clicks'] / customers_df['tenure'] * 360)



In [61]:
customers_df['frequency_year'] = (customers_df['frequency'] / customers_df['tenure'] * 360)

In [62]:
customers_df.head()

Unnamed: 0,customer_id,join_date,last_purchase_date,age,gender,location,recency,tenure,frequency_x,monetary_x,...,monetary,number_of_site_visits,number_of_emails_opened,number_of_clicks,number_of_campaigns,response_rate,monetary_per_year,visit_per_year,click_per_year,frequency_year
0,1,2023-11-20,2024-03-17,56.0,Female,North Shannonbury,114,232,6,3509.48,...,3509.48,10,15,1,4,0.25,5445.744828,15.517241,1.551724,9.310345
1,2,2021-09-08,2023-10-25,44.0,Male,Hillville,258,1035,9,6081.32,...,6081.32,285,49,51,4,0.5,2115.241739,99.130435,17.73913,3.130435
2,3,2021-06-01,2022-11-27,44.0,Unknown,North Latoyatown,590,1134,6,1454.87,...,1454.87,192,73,25,2,0.5,461.863492,60.952381,7.936508,1.904762
3,4,2022-01-01,2022-09-01,29.0,Male,Grossstad,677,920,20,7874.68,...,7874.68,110,30,17,4,0.25,3081.396522,43.043478,6.652174,7.826087
4,5,2022-01-24,2023-06-02,44.0,Male,East Matthewfort,403,897,24,15524.55,...,15524.55,161,2,7,4,0.0,6230.588629,64.615385,2.809365,9.632107


In [106]:
# Normalize numerical features
from sklearn.preprocessing import StandardScaler, LabelEncoder
scaler = StandardScaler()
numerical_features = ['recency', 'frequency_year', 'monetary', 'tenure', 'number_of_site_visits', 'number_of_emails_opened', 'number_of_clicks', 'number_of_campaigns']
customers_df[numerical_features] = scaler.fit_transform(customers_df[numerical_features])

In [110]:

# Prepare the features and target variable
model = ols("monetary_per_year ~ recency + tenure + number_of_site_visits + response_rate ", data=customers_df).fit()





In [111]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      monetary_per_year   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.271
Method:                 Least Squares   F-statistic:                     932.6
Date:                Wed, 10 Jul 2024   Prob (F-statistic):               0.00
Time:                        11:48:48   Log-Likelihood:            -1.0598e+05
No. Observations:               10000   AIC:                         2.120e+05
Df Residuals:                    9995   BIC:                         2.120e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept              6493.95

In [109]:
model.params

Intercept                 6581.219017
recency                     -3.187827
tenure                     -72.660641
frequency_year           10996.599179
number_of_site_visits        4.365302
dtype: float64