In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Columns carried into the regression experiments: the ad identifier followed
# by the six variables the later cells work with.
columns_to_regress = [
    'ad_id',
    'log_punish',
    'log_ad_revenue',
    'log_avg_ad_revenue',
    'log_baseline_st',
    'days_since_punish',
    'ad_run_time',
]

# Read the source workbook, keep only the columns listed above (in that order),
# and save the trimmed table as 'TrainingData.xlsx' for the following cells.
df = pd.read_excel('ads_dim3.xlsx')[columns_to_regress]
df.to_excel('TrainingData.xlsx', index=None)

In [None]:
# Columns read back from TrainingData.xlsx and kept as model variables
# (ad_id is split off below and stored separately as an identifier):
# - log_punish
# - log_ad_revenue
# - log_avg_ad_revenue
# - log_baseline_st
# - days_since_punish
# - ad_run_time
# NOTE(review): the previous version of this list left out days_since_punish,
# but nothing in this cell removes that column -- confirm it is meant to stay.

trainingData = pd.read_excel('TrainingData.xlsx')

# Per-column count of +/-inf entries in the raw table, taken before cleaning.
# Both counts come from the same data, so they are equal.
inf_count_lr = np.isinf(trainingData).sum()
inf_count_gb = np.isinf(trainingData).sum()

# Two independent cleaned tables, one for the linear-regression cells and one
# for the gradient-boosting cells: infinities become NaN, then every row that
# contains a NaN is dropped.  replace() returns a new frame, so trainingData
# itself is left untouched.
df_lr = trainingData.replace([np.inf, -np.inf], np.nan).dropna()
df_gb = trainingData.replace([np.inf, -np.inf], np.nan).dropna()

# Keep the identifiers of the surviving rows, then remove the identifier column
# so that only model variables remain in df_lr / df_gb.
ad_id_lr, df_lr = df_lr['ad_id'], df_lr.drop(columns=['ad_id'])
ad_id_gb, df_gb = df_gb['ad_id'], df_gb.drop(columns=['ad_id'])

In [None]:
# use linear regression
# Every column of df_lr takes one turn as the dependent variable and is
# regressed on all of the remaining columns.  coefs[name] collects the
# coefficient `name` received in each fit where it acted as a predictor;
# intercepts collects one intercept per fit.
coefs = {name: [] for name in df_lr.columns}
intercepts = []

for column in df_lr.columns:
    y = df_lr[column]
    X = df_lr.drop(columns=[column])
    reg = LinearRegression().fit(X, y)
    intercepts.append(reg.intercept_)

    print(f"Dependent variable: {column}")
    print("Coefficients:")
    for var, coef in zip(X.columns, reg.coef_):
        coefs[var].append(coef)
        print(f"{var}: {coef}")
    print(f"Intercept: {reg.intercept_}")
    print('\n')

# Mean coefficient per variable across the fits in which it was a predictor.
avg_coefs = {}
for var, values in coefs.items():
    avg_coefs[var] = sum(values) / len(values)

# Mean of the intercepts over all fits.
avg_intercept = sum(intercepts) / len(intercepts)

print("Average coefficients:")
for var, avg_coef in avg_coefs.items():
    print(f"{var}: {avg_coef}")
print(f"Average intercept: {avg_intercept}")

# Output recorded from a previous run of this cell
# (linear regression average coefficients):
# log_punish: 7.18014209372482
# log_ad_revenue: -0.06443096647832146
# log_avg_ad_revenue: 1.5943043804097148
# log_baseline_st: -4.4893339410854285
# days_since_punish: 0.017953594309575514
# ad_run_time: 0.015548906340517537
# Average intercept: 30.71252983647334
# Average intercept: 30.71252983647334

In [None]:
# Average coefficients and intercept for linear regression.
# These literals match the figures recorded in the comment at the end of the
# linear-regression cell; assigning them here replaces whatever avg_coefs /
# avg_intercept held before, so this cell uses exactly these numbers.
avg_coefs = {
    'log_punish': 7.18014209372482,
    'log_ad_revenue': -0.06443096647832146,
    'log_avg_ad_revenue': 1.5943043804097148,
    'log_baseline_st': -4.4893339410854285,
    'days_since_punish': 0.017953594309575514,
    'ad_run_time': 0.015548906340517537
}
avg_intercept = 30.71252983647334

# Independent variables, selected in the same order as the dict keys so that
# the positional dot product below pairs each column with its own coefficient.
X = df_lr[list(avg_coefs.keys())]

# Weighted sum of the independent variables plus the intercept (one value per row).
y_pred = X.dot(np.array(list(avg_coefs.values()))) + avg_intercept

# Attach ad_id and the predictions in a *separate* frame instead of writing
# them into df_lr.  If df_lr gained these two columns, re-running the
# regression cell afterwards would silently treat ad_id and y_pred as
# regression variables.  The exported columns and their order are unchanged.
df_lr_pred = df_lr.assign(ad_id=ad_id_lr, y_pred=y_pred)
df_lr_pred.to_excel('df_lr.xlsx', index=False)

In [None]:
# use gradient boosting regression
from sklearn.ensemble import GradientBoostingRegressor

# Fixed seed for every fit below.  Without random_state the estimator draws its
# randomness from the global NumPy state, so repeated runs of this cell are not
# guaranteed to give identical feature importances.
GB_RANDOM_STATE = 0

# importances[name] collects the feature importance `name` received in each fit
# where it acted as a predictor.
importances = {var: [] for var in df_gb.columns}
# One value per fit: the constant stored on the fitted model's initial estimator
# (reg.init_.constant_).  NOTE(review): the code calls this an "intercept";
# with the default settings it is presumably the mean of the target -- confirm.
intercepts = []

# Each column of df_gb takes one turn as the dependent variable and is modelled
# from all of the remaining columns.
for column in df_gb.columns:
    X = df_gb.drop(column, axis=1)
    y = df_gb[column]
    reg = GradientBoostingRegressor(random_state=GB_RANDOM_STATE).fit(X, y)
    print(f"Dependent variable: {column}")
    print("Feature importances:")
    for var, imp in zip(X.columns, reg.feature_importances_):
        print(f"{var}: {imp}")
        importances[var].append(imp)
    intercepts.append(reg.init_.constant_[0][0])
    print('\n')

# Calculate the average feature importance for each independent variable
avg_importances = {var: sum(importances[var])/len(importances[var]) for var in importances}
# Calculate the average intercept
avg_intercept = sum(intercepts)/len(intercepts)

print("Average feature importances:")
for var, avg_imp in avg_importances.items():
    print(f"{var}: {avg_imp}")
print(f"Average intercept: {avg_intercept}")

# Output recorded from an earlier run of this cell (presumably made before the
# seed above was fixed, so a fresh run may differ slightly):
# GradientBoostingRegressor Average feature importances:
# log_punish: 0.06812327966619008
# log_ad_revenue: 0.21249643834898407
# log_avg_ad_revenue: 0.3230820880240495
# log_baseline_st: 0.15432104027909502
# days_since_punish: 0.10371900675908725
# ad_run_time: 0.33825814692259415
# Average intercept: 4.2656448978951085

In [None]:
# Gradient boosting average feature importances and intercept.
# These literals match the figures recorded in the comment at the end of the
# gradient-boosting cell; assigning them here replaces whatever
# avg_importances / avg_intercept held before.
avg_importances = {
    'log_punish': 0.06812327966619008,
    'log_ad_revenue': 0.21249643834898407,
    'log_avg_ad_revenue': 0.3230820880240495,
    'log_baseline_st': 0.15432104027909502,
    'days_since_punish': 0.10371900675908725,
    'ad_run_time': 0.33825814692259415
}
avg_intercept = 4.2656448978951085

# Independent variables, selected in the same order as the dict keys so that
# the positional dot product below pairs each column with its own weight.
X = df_gb[list(avg_importances.keys())]

# Weighted sum of the independent variables plus the constant (one value per row).
# NOTE(review): the weights are feature importances, not regression
# coefficients, so this is not the gradient-boosting model's own prediction --
# confirm that this score is what is intended.
y_pred = X.dot(np.array(list(avg_importances.values()))) + avg_intercept

# Attach ad_id and the score in a *separate* frame instead of writing them into
# df_gb.  If df_gb gained these two columns, re-running the gradient-boosting
# cell afterwards would silently treat ad_id and y_pred as model variables.
# The exported columns and their order are unchanged.
df_gb_pred = df_gb.assign(ad_id=ad_id_gb, y_pred=y_pred)
df_gb_pred.to_excel('df_gb.xlsx', index=False)
