In [132]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor, LGBMClassifier
import warnings

pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled ML table and the raw KPI export, then keep only the
# Social-channel rows (the data used for predicting CTR and CVRSV).
# NOTE(review): both paths are absolute and machine-specific.
labeled_ml_data = pd.read_csv(r"C:/Users/weswa/Downloads/KPIcsvs/labeled_ml.csv")
raw_data = pd.read_csv(r"C:/Users/weswa/Downloads/KPIcsvs/KPI_Project_Data_0120_1221.csv")

# Filter on the channel first, then discard the channel and size columns.
is_social = raw_data['channel'] == 'Social'
social_data = raw_data.loc[is_social].drop(columns=['channel', 'size'])

In [3]:
# Load the all-channel dataset (used to predict CVRBF and CVRS) and remove
# the listed columns in the same expression.
# NOTE(review): the path is absolute and machine-specific.
dropped_cols = ['Creative ID', 'offer_group', 'asset_type', 'size', 'discount']
ml_data_2 = pd.read_csv(r"C:/Users/weswa/Downloads/KPIcsvs/ml_all_data.csv").drop(columns=dropped_cols)

In [4]:
# Build ml_data_1: Social rows joined with their labelled targets.

# Keep only the first 16 columns of the Social data.
social_data = social_data.iloc[:, :16]

# Columns removed from the labelled table before the join
# (presumably because social_data already carries them -- confirm).
to_drop = [
    'funnel', 'publisher', 'lob', 'product', 'theme', 'kpi_audience',
    'creative_versions', 'price', 'price_placement', 'discount',
    'offer_placement', 'video_type', 'length',
]
labels_only = labeled_ml_data.drop(columns=to_drop)

# pd.merge defaults to an inner join, so only creatives present in both
# tables survive (the original note: "merge away outliers").
merged = pd.merge(social_data, labels_only, left_on='Creative ID', right_on='Creative ID')

# The join key and the two descriptor columns are removed after the merge.
ml_data_1 = merged.drop(columns=['Creative ID', 'offer_group', 'asset_type'])

In [5]:
# Encode the categorical columns of ml_data_1.
cat_cols = [
    'funnel', 'publisher', 'lob', 'product', 'theme', 'kpi_audience',
    'creative_versions', 'price', 'price_placement', 'offer_placement',
    'video_type', 'length',
]

# Each column becomes the integer codes of its categories, stored with a
# categorical dtype. Codes follow pandas' category order; missing values
# are encoded as -1.
ml_data_1[cat_cols] = ml_data_1[cat_cols].apply(
    lambda col: col.astype('category').cat.codes.astype('category')
)

In [158]:
# Separate ml_data_1 by funnel code.
# The integers are the category codes assigned to 'funnel' during encoding;
# presumably 0/1/2 correspond to lower/middle/upper -- confirm against the raw labels.
funnel_code = ml_data_1['funnel']
upper_funnel = ml_data_1.loc[funnel_code == 2]
middle_funnel = ml_data_1.loc[funnel_code == 1]
lower_funnel = ml_data_1.loc[funnel_code == 0]



In [177]:
# Predict average CTR on the upper-funnel rows with a random forest.
feature_cols = ['funnel', 'publisher', 'kpi_audience', 'video_type', 'theme', 'creative_versions', 'price']
X = upper_funnel[feature_cols]
# Everything from column position 14 onward is treated as the target block.
y = upper_funnel.iloc[:, 14:]

# Hold out 20% of the rows for the final check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
ctr_train = y_train['Mean_CTR']
ctr_test = y_test['Mean_CTR']

# Preprocessing: ordinal-encode 'funnel', one-hot the listed columns,
# pass the remaining features through unchanged.
one_hot = ['publisher', 'kpi_audience', 'theme', 'creative_versions']
ct = make_column_transformer(
    (OrdinalEncoder(), ['funnel']),
    (OneHotEncoder(handle_unknown='ignore'), one_hot),
    remainder='passthrough',
)

# Preprocessing + model in one pipeline so each CV fold fits its own encoders.
rf = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_leaf=15, random_state=123)
pipe = make_pipeline(ct, rf)

# 5-fold cross-validation on the training split.
scorers = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv = cross_validate(pipe, X_train, ctr_train, cv=5, scoring=scorers, error_score='raise')

# The error scorers are negated by scikit-learn; flip the sign after rounding.
r2 = cv['test_r2'].mean().round(2)
mse = -cv['test_neg_mean_squared_error'].mean().round(3)
mae = -cv['test_neg_mean_absolute_error'].mean().round(3)
print('Mean CTR')
print(f'R2: {r2}\nMSE: {mse}\nMAE: {mae}\n')

# Refit on the whole training split and score the held-out rows.
pipe.fit(X_train, ctr_train)
y_predict = pipe.predict(X_test)

print('These are the corresponding actual values: ')
print(np.array(ctr_test))

score = r2_score(ctr_test, y_predict)
print("R2 is: ", score)
print("mean squared error is: ", mean_squared_error(ctr_test, y_predict))
print("mean absolute error is: ", mean_absolute_error(ctr_test, y_predict))

Mean CTR
R2: 0.65
MSE: 0.009
MAE: 0.054

These are the corresponding actual values: 
[0.34       0.2664     0.07333333 0.24       0.02333333 0.21666667
 0.22666667 0.07047619 0.05       0.61823529 0.4        0.276
 0.282      0.202      0.075      0.07555556 0.075      0.295
 0.76       0.05       0.055      0.07333333 0.105      0.07857143
 0.03333333 0.27888889 0.056      0.02       0.268      0.045
 0.21142857 0.076      0.47       0.67666667 0.04666667 0.06571429
 0.74727273 0.30666667 0.09       0.07333333 0.34166667 0.1575
 0.089      0.06142857 0.106      1.38       0.0726087  0.07357143
 0.13       0.078      0.14823529 0.07181818 0.158      0.078
 0.32       0.02       0.27       0.095      0.26666667 0.07
 0.06142857 0.43       0.26       0.06090909 0.21142857 0.06428571
 0.185      0.07       0.1775     0.05708333 0.056      0.12913043
 0.38       0.325      0.12352941 0.22666667 0.06333333 0.10333333
 0.29363636 0.055      0.05       0.11714286 0.35444444 0.27571429
 0.1285

In [139]:
# Predict average site-visit CVR (Mean_CVRSV) on the middle-funnel rows.
# The first 14 columns are used as features, the rest as the target block.
X = middle_funnel.iloc[:, :14]
y = middle_funnel.iloc[:, 14:]

# Hold out 20% of the rows for the final check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
cvrsv_train = y_train['Mean_CVRSV']
cvrsv_test = y_test['Mean_CVRSV']

# Preprocessing: ordinal-encode 'funnel', one-hot the listed columns,
# pass the remaining features through unchanged.
one_hot = ['publisher', 'kpi_audience', 'theme', 'creative_versions']
ct = make_column_transformer(
    (OrdinalEncoder(), ['funnel']),
    (OneHotEncoder(handle_unknown='ignore'), one_hot),
    remainder='passthrough',
)

# Preprocessing + model in one pipeline so each CV fold fits its own encoders.
rf = RandomForestRegressor(n_estimators=200, max_depth=5, min_samples_leaf=3, random_state=123)
pipe = make_pipeline(ct, rf)

# 5-fold cross-validation on the training split (metrics left unrounded).
scorers = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv = cross_validate(pipe, X_train, cvrsv_train, cv=5, scoring=scorers, error_score='raise')

# The error scorers are negated by scikit-learn; flip the sign for reporting.
r2 = cv['test_r2'].mean()
mse = -cv['test_neg_mean_squared_error'].mean()
mae = -cv['test_neg_mean_absolute_error'].mean()
print('Mean Site Visit CVR')
print(f'R2: {r2}\nMSE: {mse}\nMAE: {mae}\n')

# Refit on the whole training split and score the held-out rows.
pipe.fit(X_train, cvrsv_train)
y_predict = pipe.predict(X_test)

print('These are predicted values: ', y_predict[:5])
print('These are the corresponding actual values: ')
print(np.array(cvrsv_test))

score = r2_score(cvrsv_test, y_predict)
print("R2 is: ", score)
print("mean squared error is: ", mean_squared_error(cvrsv_test, y_predict))
print("mean absolute error is: ", mean_absolute_error(cvrsv_test, y_predict))

Mean Site Visit CVR
R2: 0.6678026847297323
MSE: 0.03222871084807666
MAE: 0.12170563308320108

These are predicted values:  [0.55405229 0.28868996 0.63212598 0.12014102 0.05391606]
These are the corresponding actual values: 
[0.39       0.321      0.79222222 0.114      0.12       0.123
 0.66       0.1825     0.102      0.815      0.         0.3325
 0.35333333 0.407      0.16863636 0.515      0.365      0.23545455
 0.9375     1.10666667 0.11090909 0.03       0.199      0.32
 0.165      0.535      0.33125    0.955      0.22285714 0.426
 0.71       0.         0.057      0.34       0.48       0.3575
 0.3225     0.69       0.61       0.         0.83       0.4575
 0.9        0.555      0.78       0.735      0.21857143 0.08
 0.225      0.6625     0.468      0.785      0.03       0.789
 0.44       0.5475     0.24       0.         0.64       0.073
 0.8525     0.8575     0.39       0.03933333 0.167      0.634
 0.64454545 0.055      0.064      0.36428571 0.05666667 0.645
 0.505      0.32       0.2

In [17]:
# Switch to ml_data_2 for predicting CVRBF and CVRS.
# NOTE: this rebinds cat_cols; from here on it also contains 'channel'.
cat_cols = [
    'channel', 'funnel', 'publisher', 'lob', 'product', 'theme', 'kpi_audience',
    'creative_versions', 'price', 'price_placement', 'offer_placement',
    'video_type', 'length',
]

# Each column becomes the integer codes of its categories, stored with a
# categorical dtype. Codes follow pandas' category order; missing values
# are encoded as -1.
ml_data_2[cat_cols] = ml_data_2[cat_cols].apply(
    lambda col: col.astype('category').cat.codes.astype('category')
)

In [108]:
# Separate ml_data_2 by funnel code.
# The integers are the category codes assigned to 'funnel' during encoding;
# presumably 0/1/2 correspond to lower/middle/upper -- confirm against the raw labels.
funnel_code_2 = ml_data_2['funnel']
upper_funnel_2 = ml_data_2.loc[funnel_code_2 == 2]
middle_funnel_2 = ml_data_2.loc[funnel_code_2 == 1]
lower_funnel_2 = ml_data_2.loc[funnel_code_2 == 0]

In [183]:
# Predict buy-flow-entry CVR (Mean_CVRBF) on the all-channel middle-funnel rows.
# The first 13 columns are used as features, the rest as the target block.
X = middle_funnel_2.iloc[:, :13]
y = middle_funnel_2.iloc[:, 13:]

# Hold out 20% of the rows for the final check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
cvrbf_train = y_train['Mean_CVRBF']
cvrbf_test = y_test['Mean_CVRBF']

# Preprocessing: ordinal-encode 'funnel', one-hot the listed columns
# (including 'channel' here), pass the remaining features through unchanged.
one_hot = ['channel', 'publisher', 'kpi_audience', 'theme', 'creative_versions']
ct = make_column_transformer(
    (OrdinalEncoder(), ['funnel']),
    (OneHotEncoder(handle_unknown='ignore'), one_hot),
    remainder='passthrough',
)

# Preprocessing + model in one pipeline so each CV fold fits its own encoders.
rf = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_leaf=5, random_state=123)
pipe = make_pipeline(ct, rf)

# 5-fold cross-validation on the training split.
scorers = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv = cross_validate(pipe, X_train, cvrbf_train, cv=5, scoring=scorers, error_score='raise')

# The error scorers are negated by scikit-learn; flip the sign after rounding.
r2 = cv['test_r2'].mean().round(2)
mse = -cv['test_neg_mean_squared_error'].mean().round(4)
mae = -cv['test_neg_mean_absolute_error'].mean().round(4)
print('Mean Buy Flow Entry CVR')
print(f'R2: {r2}\nMSE: {mse}\nMAE: {mae}\n')

# Refit on the whole training split and score the held-out rows.
pipe.fit(X_train, cvrbf_train)
y_predict = pipe.predict(X_test)

print('These are predicted values: ', y_predict[:5])
print('These are the corresponding actual values: ')
print(np.array(cvrbf_test))

score = r2_score(cvrbf_test, y_predict)
print("R2 is: ", score)
print("mean squared error is: ", mean_squared_error(cvrbf_test, y_predict))
print("mean absolute error is: ", mean_absolute_error(cvrbf_test, y_predict))

Mean Buy Flow Entry CVR
R2: 0.58
MSE: 5.7397
MAE: 1.0067

These are predicted values:  [7.79450833 0.86098837 0.27192205 0.62279061 1.253584  ]
These are the corresponding actual values: 
[7.97187500e+00 3.02916667e-01 0.00000000e+00 7.29791667e-01
 0.00000000e+00 5.17500000e-01 6.92291667e-01 1.19510417e+01
 6.36916667e+00 3.51250000e-01 1.47270833e+00 1.46145833e+00
 8.16458333e-01 3.77500000e-01 8.66875000e-01 2.21666667e+00
 2.73541667e-01 2.59645833e+00 1.56375000e+00 2.28395833e+00
 2.55625000e-01 1.34308333e+01 1.31062500e+00 9.87500000e-01
 2.83750000e-01 1.44687500e+00 3.40125000e+00 6.25000000e-02
 3.61083333e+00 3.57916667e-01 1.38383333e+01 2.37541667e+00
 8.71875000e-01 2.89791667e-01 9.35500000e+00 2.58125000e-01
 0.00000000e+00 3.61875000e+00 4.13958333e-01 8.93395833e+00
 1.36333333e+00 0.00000000e+00 0.00000000e+00 1.24416667e+00
 3.37708333e-01 6.31041667e-01 2.10916667e+00 2.71125000e+00
 7.56666667e+00 2.10354167e+00 4.57708333e-01 1.13520833e+00
 2.76041667e-01 4.8

In [256]:
# LightGBM model for predicting Sales CVR (Mean_CVRS) on the all-channel
# lower-funnel rows. Author's note: it performs slightly better than RF.
# The first 13 columns are used as features, the rest as the target block.
X = lower_funnel_2.iloc[:, :13]
y = lower_funnel_2.iloc[:, 13:]
target = 'Mean_CVRS'

# Hold out 20% of the rows for the final check.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Columns to one-hot encode, selected by NAME rather than by position.
# The previous `cat_cols[1:]` depended on which definition of cat_cols was
# live: with the all-channel list it silently skipped 'channel' and also
# one-hot encoded 'funnel', which is already ordinal-encoded below and is
# constant in this lower-funnel subset. Excluding both names yields the same
# column list under either cat_cols definition.
# NOTE(review): 'channel' is still passed through as integer codes, as in the
# recorded run; the CVRBF model one-hot encodes it -- confirm which is intended.
one_hot = [col for col in cat_cols if col not in ('channel', 'funnel')]

# Preprocessing: ordinal-encode 'funnel', one-hot the selected columns,
# pass the remaining features through unchanged.
ct = make_column_transformer(
    (OrdinalEncoder(), ['funnel']),
    (OneHotEncoder(handle_unknown='ignore'), one_hot),
    remainder='passthrough',
)

# Preprocessing + model in one pipeline so each CV fold fits its own encoders.
lgbm = LGBMRegressor(learning_rate=0.025, num_iterations=400, min_child_samples=5, random_state=123)
pipe = make_pipeline(ct, lgbm)

# 5-fold cross-validation on the training split.
scorers = ['r2', 'neg_mean_squared_error', 'neg_mean_absolute_error']
cv = cross_validate(pipe, X_train, y_train[target], cv=5, scoring=scorers, error_score='raise')

# The error scorers are negated by scikit-learn; flip the sign after rounding.
r2 = cv['test_r2'].mean().round(2)
mse = -cv['test_neg_mean_squared_error'].mean().round(4)
mae = -cv['test_neg_mean_absolute_error'].mean().round(4)
print(target)
print(f'R2: {r2}\nMSE: {mse}\nMAE: {mae}\n')

# Refit on the whole training split and score the held-out rows.
pipe.fit(X_train, y_train[target])
y_predict = pipe.predict(X_test)
print('These are predicted values: ', y_predict[0:5])

print('These are the corresponding actual values: ')
y_test1 = y_test[target]
print(np.array(y_test1[0:5]))

score = r2_score(y_test1, y_predict)
print("R2 is: ", score)
print("mean squared error is: ", mean_squared_error(y_test1, y_predict))
print("mean absolute error is: ", mean_absolute_error(y_test1, y_predict))

Mean_CVRS
R2: 0.42
MSE: 0.3407
MAE: 0.2713

These are predicted values:  [0.20739821 0.03450815 0.1596799  0.13948816 0.05511455]
These are the corresponding actual values: 
[0.038125   0.02458333 0.12229167 0.15541667 0.00416667]
R2 is:  0.37236223705549976
mean squared error is:  0.46686525063602835
mean absolute error is:  0.30209023415710035
