# Analysis of the influence of each media channel on the organic consumers using Regression Analysis and Synthetic AB testing
## Two media channels just appeared and there is not enough data to analyse their influence using regression analysis. 
## Therefore, the following algorithm is proposed for the analysis:
### 1. Regression analysis on the period where media_channel_4 and media_channel_5 were not launched.
### 2. Influence analysis of media_channel_4 on the period where media_channel_5 had not yet been launched. Using the regression fitted in step 1, organic consumers are predicted for this period and the difference in means between actual and predicted values is assessed (AB test). The average residual is estimated to test the hypothesis that media_channel_4 influences organic consumers. The effect of media_channel_4 is calculated by dividing the average residual of organic consumers by the average media_channel_4 spend.
### 3. On the last period (where both media_channel_4 and media_channel_5 were launched) the influence of media_channel_5 is analyzed. Organic consumers for this period are predicted using the regression from step 1 plus the effect of media_channel_4 (prediction = regression_prediction + media_channel_4 spend * media_channel_4 effect). Based on this forecast, the effect of media_channel_5 is calculated by dividing the average residual of organic consumers by the average media_channel_5 spend.

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [None]:
from scipy import stats
import scipy.stats as stats
from scipy.stats import uniform, binom, norm, kstest, shapiro 

import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.diagnostic as smd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
from statsmodels.stats.diagnostic import het_white
from statsmodels.stats.diagnostic import het_goldfeldquandt
from statsmodels.stats.diagnostic import acorr_breusch_godfrey

In [None]:
# Load the source dataset from the Excel workbook in the working directory.
df = pd.read_excel('Data_frame.xlsx')

# EDA (Exploratory Data Analysis)

In [None]:
# Summary statistics of the loaded data (shown as the cell output).
df.describe()

## Correlation between variables

In [None]:
# Pairwise correlations between every column except the target `organics`.
explanatory = df.drop(columns='organics')
corr = explanatory.corr()

In [None]:
# Show only the stronger correlations (>= 0.5 or <= -0.4); the remaining
# cells become NaN and are left blank in the heatmap.
strong = (corr >= 0.5) | (corr <= -0.4)

plt.figure(figsize=(8, 6))
sns.heatmap(
    corr.where(strong),
    vmin=-1.0, vmax=1.0, cmap='viridis',
    square=True, linewidths=0.1,
    annot=True, annot_kws={"size": 6},
);

## Data distribution

In [None]:
# Data distribution for detecting outliers: scatter the columns against
# `organics`, five columns per figure.
all_columns = df.columns
for start in range(0, len(all_columns), 5):
    sns.pairplot(data=df, x_vars=all_columns[start:start + 5], y_vars=['organics'])

In [None]:
# Delete the outlier rows (by index label) and renumber the index from zero.
outlier_rows = [15, 17]
df = df.drop(index=outlier_rows).reset_index(drop=True)

In [None]:
# Data distribution after deleting outliers: same five-column pair plots
# against `organics`, now on the cleaned frame.
cleaned_columns = df.columns
for offset in range(0, len(cleaned_columns), 5):
    column_group = cleaned_columns[offset:offset + 5]
    sns.pairplot(data=df, x_vars=column_group, y_vars=['organics'])

## Making square and log variables

In [None]:
# Squared and log(1 + x) versions of the first three channels, available as
# regressors if the relationship with `organics` turns out to be nonlinear.
base_channels = ['media_channel_1', 'media_channel_2', 'media_channel_3']

for channel in base_channels:
    df[f'{channel}_sqr'] = df[channel] ** 2

for channel in base_channels:
    df[f'{channel}_log'] = np.log(1 + df[channel])

In [None]:
# Summary statistics again, now including the squared and log columns added above.
df.describe()

# First model 

## 1 period
### Regression analysis on the period where media_channel_4 and media_channel_5 were not launched.

In [None]:
# Creation of dataframes.
# Rows with yyyymm before `period_nothing` form the period where media_channel_4
# and media_channel_5 were not launched; the remaining rows are the period to predict.
period_nothing = 202402
# .copy() makes each frame independent of `df`, so columns added to them later
# are plain assignments rather than writes to a slice of `df`
# (which raise SettingWithCopyWarning).
df_nothing = df.query(f"yyyymm < {period_nothing}").copy()
df_prediction = df.query(f"yyyymm >= {period_nothing}").copy()

In [None]:
# Regression: OLS of organics on the three established channels, fitted on
# `df_nothing` with heteroskedasticity-robust (HC0) standard errors.
formula = 'organics ~ media_channel_1 + media_channel_2 + media_channel_3'
result = smf.ols(formula=formula, data=df_nothing).fit(cov_type='HC0')

# Prediction: apply the fitted model to the later period and plot it
# next to the observed values.
regressors = df_prediction[['media_channel_1', 'media_channel_2', 'media_channel_3']]
predictions = result.predict(regressors)
df_prediction['organics_predict'] = predictions
df_prediction[['organics', 'organics_predict']].plot()

In [None]:
# Full statsmodels summary of the fitted regression (shown as the cell output).
result.summary()

In [None]:
# Hypothesis about linear restrictions: Wald test of H0 "coefficient = 0"
# for each regressor, decided at the 5% level.
variables = ['media_channel_1', 'media_channel_2', 'media_channel_3']
for i in variables:
    wald_test = result.wald_test(f'({i} = 0)')
    relation = '!=' if wald_test.pvalue < 0.05 else '='
    print(f'{i} {relation} 0: {wald_test.pvalue}')

In [None]:
# Hypothesis about the equality of coefficients, tested pairwise.
# p-value < 0.05: the compared variables have significantly different effects.
hypotheses = [
    "media_channel_1 = media_channel_2",
    "media_channel_1 = media_channel_3",
    "media_channel_2 = media_channel_3",
]
for hypothesis in hypotheses:
    t_test = result.t_test(hypothesis)
    print(hypothesis, t_test)

In [None]:
# Hypothesis about omitted variables (Ramsey RESET).
# The test adds powers of the fitted values to the regression; a small p-value
# points to omitted variables or a wrong functional form. It says nothing about
# missing (NaN) values, so the messages speak of omitted variables.
resettest = smd.linear_reset(res=result, power=2, test_type='fitted', use_f=True)
if resettest.pvalue < 0.05:
    print(f'result may have omitted variables (RESET rejected): {resettest.pvalue}')
else:
    print(f'result shows no evidence of omitted variables: {resettest.pvalue}')

In [None]:
# Plot actual vs. predicted organics by month and save the figure as a PDF.
# The month is turned into a string only on a plotting copy, so the dtype of
# `df_prediction['yyyymm']` is left untouched for later numeric filtering.
plot_frame = df_prediction.assign(yyyymm=df_prediction['yyyymm'].astype(str))
plot_frame.plot(x='yyyymm', y=['organics', 'organics_predict'], figsize=(45, 10), ax=plt.gca())

# One tick per row, labelled with that row's month.
plt.xticks(ticks=range(len(plot_frame)), labels=plot_frame['yyyymm'], rotation=45)
plt.grid(alpha=0.3)

plt.savefig('data_prediction.pdf', dpi=300)

# 2 period
### Influence analysis of media_channel_4 on the period where media_channel_5 had not yet been launched. Using the regression fitted on period 1, organic consumers are predicted for this period and the difference in means between actual and predicted values is assessed (AB test). The average residual is estimated to test the hypothesis that media_channel_4 influences organic consumers. The effect of media_channel_4 is calculated by dividing the average residual of organic consumers by the average media_channel_4 spend.

In [None]:
# Second period: months from `period_nothing` up to (not including) `period_1`.
period_1 = 202410
# yyyymm may have been turned into strings for plotting; restore integers so
# the numeric comparison in the query works.
df_prediction['yyyymm'] = df_prediction['yyyymm'].astype(int)
# .copy() so that the `residuals` column below is added to an independent frame
# instead of a slice of `df_prediction`.
df_2_period = df_prediction.query(f"yyyymm  >= {period_nothing} and yyyymm < {period_1}").copy()

# Effect of media_channel_4: mean residual (actual - predicted organics)
# per unit of mean media_channel_4 spend.
df_2_period['residuals'] = df_2_period.organics - df_2_period.organics_predict
period2_effect = df_2_period.residuals.mean()/df_2_period.media_channel_4_spend.mean()

print(f'media_channel_4: {period2_effect}')

In [None]:
# Residuals (actual - predicted organics) for the second period.
df_2_period.residuals

In [None]:
# Bootstrapping to check the hypothesis that media_channel_4 affects organic
# consumers: resample the period-2 residuals with replacement and keep the
# mean of every resample.
# NOTE(review): 100 resamples is few for estimating 2.5%/97.5% quantiles.
n_iterations = 100  # Number of bootstrap samples
np.random.seed(42)  # fixed seed keeps the resamples reproducible
n_obs = len(df_2_period.residuals)
sample_means = [
    np.mean(np.random.choice(df_2_period.residuals, size=n_obs, replace=True))
    for _ in range(n_iterations)
]

In [None]:
# Summary statistics of the bootstrap sample means (shown as the cell output).
sample_m = pd.DataFrame(sample_means)
sample_m.describe()

In [None]:
# Quantile values of the bootstrap means: the 2.5% and 97.5% quantiles bound a
# 95% percentile interval, the 50% quantile is the median.
# `method=` replaces the deprecated `interpolation=` keyword of np.percentile
# (requires NumPy >= 1.22); 'midpoint' is the same estimator as before.
Q1, Q2, Q3 = np.percentile(sample_means, [2.5, 50, 97.5], method='midpoint')

print(f" Q1 = {Q1},  Q2 = {Q2},  Q3 = {Q3}")

In [None]:
# Histogram (with KDE) of the bootstrap sample means of the residuals.
plt.figure(figsize=(10, 6))
axis = sns.histplot(sample_means, kde=True)
axis.set_title('Distribution of Bootstrap Sample Mean')
axis.set_xlabel("Mean of residuals")
axis.set_ylabel('Frequency')
plt.show()

![My Image](images/Residuals.png)

In [None]:
# Distribution of the bootstrap means with the 2.5%, 50% and 97.5% quantiles
# drawn as dashed vertical lines.
quantile_lines = [
    (Q1, 'r', f"2,5% квантиль = {Q1:.2f}"),
    (Q2, 'g', f"50% квантиль = {Q2:.2f}"),
    (Q3, 'r', f"97,5% квантиль = {Q3:.2f}"),
]

plt.figure(figsize=(10, 6))
plt.hist(sample_means, edgecolor='black', alpha=0.7)
plt.title('Distribution of Bootstrap Sample Means')
plt.xlabel("Mean of residuals")
plt.ylabel('Frequency')
for position, line_color, line_label in quantile_lines:
    plt.axvline(position, color=line_color, linestyle='dashed', linewidth=2, label=line_label)
plt.grid(True)
plt.legend()
plt.show()

![My Image](images/Residuals_quantiles.png)

In [None]:
# Determining the normality of the distribution of bootstrap means

means_arr = np.asarray(sample_means, dtype=float)

# Perform Kolmogorov-Smirnov test.
# kstest(x, 'norm') alone compares x with the STANDARD normal N(0, 1), which
# rejects any data not centred at 0 with unit variance. Pass the sample mean and
# standard deviation so the test is about shape only.
# NOTE: with parameters estimated from the same sample the p-value is only
# approximate (tends to be too large); the Lilliefors test is the exact correction.
ks_test = stats.kstest(means_arr, 'norm', args=(means_arr.mean(), means_arr.std(ddof=1)))
print(f'Kolmogorov-Smirnov Test: Statistic={ks_test.statistic}, p-value={ks_test.pvalue}')

# Perform Shapiro-Wilk test (location/scale invariant, needs no standardising)
shapiro_test = stats.shapiro(sample_means)
print(f'Shapiro-Wilk Test: Statistic={shapiro_test.statistic}, p-value={shapiro_test.pvalue}')

In [None]:
# Residuals that remain once the estimated effect of media_channel_4 is added
# to the regression prediction.
channel_4_uplift = df_2_period.media_channel_4_spend * period2_effect
adjusted_prediction = df_2_period.organics_predict + channel_4_uplift
df_2_period['residuals_media_channel_4'] = df_2_period.organics - adjusted_prediction

# 3 period
### On the last period (where both media_channel_4 and media_channel_5 were launched) the influence of media_channel_5 is analyzed. Organic consumers for this period are predicted using the regression from period 1 plus the effect of media_channel_4 (prediction = regression_prediction + media_channel_4 spend * media_channel_4 effect). Based on this forecast, the effect of media_channel_5 is calculated by dividing the average residual of organic consumers by the average media_channel_5 spend.

In [None]:
# Third period: months from `period_1` onward.
# .copy() so the two columns added below go to an independent frame instead of
# a slice of `df_prediction`.
df_both = df_prediction.query(f"yyyymm >= {period_1}").copy()
# Prediction = regression prediction + media_channel_4 spend * media_channel_4 effect.
df_both['new_organics_predict'] = df_both.organics_predict + (df_both.media_channel_4_spend * period2_effect)
df_both['residuals'] = df_both.organics - df_both.new_organics_predict

# Effect of media_channel_5: mean remaining residual per unit of mean
# media_channel_5 spend.
period3_effect = df_both.residuals.mean()/df_both.media_channel_5_spend.mean()
print(f'media_channel_5: {period3_effect}')

# all periods

In [None]:
# Build one dataframe holding the prediction for all periods.
# assign() returns a new frame, so the fitted values are not written onto a
# slice of `df` (the original column assignment raised SettingWithCopyWarning).
df_nothing = df_nothing.assign(
    organics_predict=result.predict(
        df_nothing[['media_channel_1', 'media_channel_2', 'media_channel_3']]
    )
)
df_prediction = pd.concat([df_nothing, df_prediction])

# Prediction including the estimated effects of media_channel_4 and media_channel_5.
# NOTE(review): if spend is NaN (rather than 0) before a channel launched, the
# new prediction is NaN for those rows - confirm how pre-launch spend is stored.
df_prediction['organics_predict_new'] = (
    df_prediction.organics_predict
    + (df_prediction.media_channel_4_spend * period2_effect)
    + (df_prediction.media_channel_5_spend * period3_effect)
)

In [None]:
# Real values, regression predictions, and predictions that include the
# estimated influence of media_channel_4 and media_channel_5.
columns_to_plot = ['organics', 'organics_predict', 'organics_predict_new']
df_prediction[columns_to_plot].plot()

![My Image](images/Prediction_graph.png)

In [None]:
# Plot actual, predicted, and effect-adjusted predicted organics for all
# periods and save the figure as a PDF.
df_prediction['yyyymm'] = df_prediction['yyyymm'].astype(str)
df_prediction.plot(x='yyyymm', y=['organics', 'organics_predict', 'organics_predict_new'], figsize=(45, 10), ax=plt.gca())

# Tick positions and labels both come from the plotted frame; counting ticks
# from `df` only works while the two frames have the same number of rows.
plt.xticks(ticks=range(len(df_prediction)), labels=df_prediction['yyyymm'], rotation=45)
plt.grid(alpha=0.3)

plt.savefig('data_prediction_all.pdf', dpi=300)

# Regression Diagnostics and Hypothesis Testing

## Robust errors

In [None]:
# Robust (HC0) standard errors are used before testing heteroskedasticity so
# that the reported standard errors of the model remain valid.
robust = result.get_robustcov_results(cov_type='HC0')

print("Coefficients with Robust Standard Errors:")
# get_robustcov_results returns plain arrays, so the rows are labelled with the
# regressor names from the model instead of being numbered 0..k.
summary_df = pd.DataFrame(
    {
        'Coefficient': robust.params,
        'Robust Std Err': robust.bse,
        't-value': robust.tvalues,
        'p-value': robust.pvalues,
    },
    index=robust.model.exog_names,
)
print(summary_df)

In [None]:
# Same specification with HC0 robust errors, fitted on the full `df`.
# NOTE(review): `result` was fitted on `df_nothing`; confirm that using all of
# `df` here is intended.
formula_hc = 'organics ~ media_channel_1 + media_channel_2 + media_channel_3 '
model_hc = smf.ols(formula=formula_hc, data=df)
result_HC = model_hc.fit(cov_type='HC0')
result_HC.summary()

## Hypothesis about linear restrictions

In [None]:
# Wald test of H0 "coefficient = 0" for each regressor at the 5% level.
variables = ['media_channel_1',  'media_channel_2',  'media_channel_3']
for i in variables:
    wald_test = result.wald_test(f'({i} = 0)')
    verdict = '!=' if wald_test.pvalue < 0.05 else '='
    print(f'{i} {verdict} 0: {wald_test.pvalue}')

## Hypothesis about the equality of variables

In [None]:
# p-value < 0.05: the compared variables have significantly different effects
channel_pairs = [
    ('media_channel_1', 'media_channel_2'),
    ('media_channel_1', 'media_channel_3'),
    ('media_channel_2', 'media_channel_3'),
]
for left, right in channel_pairs:
    restriction = f"{left} = {right}"
    t_test = result.t_test(restriction)
    print(restriction, t_test)

## Hypothesis about missing variables

In [None]:
# Regression summary shown again as context for the RESET test below.
result.summary()

In [None]:
# Ramsey RESET test: adds powers of the fitted values to the regression.
# A small p-value points to omitted variables or a wrong functional form, not
# to missing (NaN) values, so the messages speak of omitted variables.
resettest = smd.linear_reset(res=result, power=2, test_type='fitted', use_f=True)
if resettest.pvalue < 0.05:
    print(f'result may have omitted variables (RESET rejected): {resettest.pvalue}')
else:
    print(f'result shows no evidence of omitted variables: {resettest.pvalue}')

## Multicollinearity

In [None]:
# Variance inflation factors of the media channels.
# NOTE(review): other cells use `media_channel_4_spend` / `media_channel_5_spend`;
# confirm that `media_channel_4` / `media_channel_5` are the intended columns and
# that they contain no NaN (VIF is undefined with missing values).
variables = ['media_channel_1', 'media_channel_2', 'media_channel_3', 'media_channel_4', 'media_channel_5']

X = df[variables]
# The auxiliary regressions behind VIF need an intercept; without it the values
# are uncentered and typically inflated. The constant is column 0 and is skipped
# when reporting.
exog = sm.add_constant(X, has_constant='add')

vif_data = pd.DataFrame()
vif_data['feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(exog.values, i) for i in range(1, exog.shape[1])]
print(vif_data)

## Heteroskedasticity

In [None]:
# White test for heteroskedasticity of the regression residuals.
# H0: homoscedasticity is present.
white_test = het_white(result.resid, result.model.exog)
lm_stat, lm_pvalue, f_stat, f_pvalue = white_test

print("White test results:")
white_report = [
    ("Lagrange Multiplier Statistic", lm_stat),
    ("P-value (LM test)", lm_pvalue),
    ("F-statistic", f_stat),
    ("P-value (F test)", f_pvalue),
]
for caption, number in white_report:
    print(f"{caption}: {number:.4f}")

In [None]:
# Goldfeld-Quandt test
# Preferred here because there are few observations.
# NOTE(review): the test assumes normally distributed errors - see the
# normality checks on the residuals before relying on it.
# H0: Homoscedasticity is present.
test = het_goldfeldquandt(result.resid, result.model.exog)
# Third element is the alternative used by the test (name was misspelled before).
test_stat, p_value, alternative = test
print("Goldfeld-Quandt test results:")
print(f"Test statistic: {test_stat:.4f}")
print(f"P-value: {p_value:.4f}")

## Autocorrelation

In [None]:
# Scatter of each residual against the previous one; a visible pattern
# would suggest first-order autocorrelation.
previous_error = result.resid.shift(1)
plt.scatter(previous_error, result.resid)
plt.grid(True)
plt.xlabel('error before')
plt.ylabel('error now')

In [None]:
# Scatter of each residual against the residual two observations earlier,
# to look for second-order autocorrelation.
error_two_back = result.resid.shift(2)
plt.scatter(error_two_back, result.resid)
plt.grid(True)
plt.xlabel('error before 2')
plt.ylabel('error now')

In [None]:
# Coefficient covariance matrix that accounts for autocorrelation
# (HAC estimator with one lag).
hac_results = result.get_robustcov_results(cov_type='HAC', maxlags=1)
hac_cov_matrix = hac_results.cov_params()
print(hac_cov_matrix)

In [None]:
# Same specification with HAC (one lag) standard errors, fitted on the full `df`.
# NOTE(review): `result` was fitted on `df_nothing`; confirm that using all of
# `df` here is intended.
formula_hac = 'organics ~ media_channel_1 + media_channel_2 + media_channel_3'
hac_options = {'maxlags':1}
result_HAC = smf.ols(formula=formula_hac, data=df).fit(cov_type='HAC', cov_kwds=hac_options)
result_HAC.summary()

In [None]:
# Durbin-Watson test on the regression residuals
# H0: no autocorrelation
# H1: first-order autocorrelation (p = 1)
# The statistic lies in [0, 4]: about 2 means no autocorrelation, values toward
# 0 indicate positive and values toward 4 negative autocorrelation.
dw_statistic = durbin_watson(result.resid)
print(f'Durbin-Watson statistic: {dw_statistic}')

In [None]:
# Breusch-Godfrey test
# H0: no autocorrelation
# H1: autocorrelation up to the tested lag order (2 lags here)
bg_test = acorr_breusch_godfrey(result, nlags=2)
bg_lm_stat, bg_lm_pvalue = bg_test[0], bg_test[1]
print(f'Breusch-Godfrey test statistic: {bg_lm_stat}')
print(f'p-value: {bg_lm_pvalue}')

## Errors

In [None]:
# Residuals of the fitted regression
residuals = result.resid

# Histogram of the residuals with a KDE overlay
plt.figure(figsize=(10, 6))
hist_axis = sns.histplot(residuals, kde=True)
hist_axis.set_title('Distribution of Residuals')
hist_axis.set_xlabel('Residuals')
hist_axis.set_ylabel('Frequency')
plt.show()

# Q-Q plot of the residuals with a standardized reference line
sm.qqplot(residuals, line='s')
plt.title('Q-Q Plot of Residuals')
plt.show()

## Testing for normality of the distribution

In [None]:
# Perform Kolmogorov-Smirnov test.
# kstest(x, 'norm') alone compares x with the STANDARD normal N(0, 1); regression
# residuals are in the units of the target, so that comparison rejects because of
# scale alone. Pass the residual mean and standard deviation so the test is
# about shape only.
# NOTE: with parameters estimated from the same sample the p-value is only
# approximate (tends to be too large); the Lilliefors test is the exact correction.
ks_test = stats.kstest(residuals, 'norm', args=(residuals.mean(), residuals.std(ddof=1)))
print(f'Kolmogorov-Smirnov Test: Statistic={ks_test.statistic}, p-value={ks_test.pvalue}')

# Perform Shapiro-Wilk test (location/scale invariant, needs no standardising)
shapiro_test = stats.shapiro(residuals)
print(f'Shapiro-Wilk Test: Statistic={shapiro_test.statistic}, p-value={shapiro_test.pvalue}')