In [1]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm

In [2]:
data = pd.read_csv('website_ab_test.csv')
data.head()

Unnamed: 0,Theme,Click Through Rate,Conversion Rate,Bounce Rate,Scroll_Depth,Age,Location,Session_Duration,Purchases,Added_to_Cart
0,Light Theme,0.05492,0.282367,0.405085,72.489458,25,Chennai,1535,No,Yes
1,Light Theme,0.113932,0.032973,0.732759,61.858568,19,Pune,303,No,Yes
2,Dark Theme,0.323352,0.178763,0.296543,45.737376,47,Chennai,563,Yes,Yes
3,Light Theme,0.485836,0.325225,0.245001,76.305298,58,Pune,385,Yes,No
4,Light Theme,0.034783,0.196766,0.7651,48.927407,25,New Delhi,1437,No,No


In [6]:
X = data[['Click Through Rate','Conversion Rate', 'Bounce Rate', 'Scroll_Depth', 'Age',
         'Session_Duration']]
y = pd.to_numeric(data['Purchases'], errors='coerce')

x = sm.add_constant(X)
model = sm.OLS(y, x).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Purchases   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 13 Aug 2023   Prob (F-statistic):                nan
Time:                        21:12:03   Log-Likelihood:                    nan
No. Observations:                1000   AIC:                               nan
Df Residuals:                     993   BIC:                               nan
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                     nan       

In [7]:
data.isnull().sum()

Theme                 0
Click Through Rate    0
Conversion Rate       0
Bounce Rate           0
Scroll_Depth          0
Age                   0
Location              0
Session_Duration      0
Purchases             0
Added_to_Cart         0
dtype: int64

In [8]:
corr_matrix = X.corr()
corr_matrix

Unnamed: 0,Click Through Rate,Conversion Rate,Bounce Rate,Scroll_Depth,Age,Session_Duration
Click Through Rate,1.0,-0.03996,0.038327,0.00997,-0.001747,-0.010691
Conversion Rate,-0.03996,1.0,-0.045985,-0.002275,0.016712,0.008822
Bounce Rate,0.038327,-0.045985,1.0,0.002801,0.052069,-0.009579
Scroll_Depth,0.00997,-0.002275,0.002801,1.0,0.00793,0.017199
Age,-0.001747,0.016712,0.052069,0.00793,1.0,0.013852
Session_Duration,-0.010691,0.008822,-0.009579,0.017199,0.013852,1.0


In [11]:
vif = pd.DataFrame()
vif['VIF Factor'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['features'] = X.columns
vif

Unnamed: 0,VIF Factor,features
0,4.002019,Click Through Rate
1,3.872932,Conversion Rate
2,7.562913,Bounce Rate
3,7.571454,Scroll_Depth
4,7.717645,Age
5,3.947746,Session_Duration


The VIF values suggest that there might be some multicollinearity present in the input data, especially for the `Bounce Rate`, `Scroll_Depth`, and `Age` variables, which have VIF values greater than 5. 

To address this issue, you can try removing one or more of these variables from the model and see if it improves the performance. Alternatively, you can try using regularization techniques such as Ridge or Lasso regression to reduce the impact of multicollinearity on the model.


This code removes the `Bounce Rate` variable from the input data, adds a constant to the independent variables, fits the model, and prints the summary of the model. You can modify the input data and model as necessary to suit your specific use case.

In [10]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [12]:
# Convert the data types of the columns in X to numeric
X = data[['Click Through Rate', 'Conversion Rate', 'Scroll_Depth', 'Age', 'Session_Duration']].apply(pd.to_numeric)

# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
y = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
X = X.dropna()
y = y[X.index]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Purchases   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 13 Aug 2023   Prob (F-statistic):                nan
Time:                        21:29:55   Log-Likelihood:                    nan
No. Observations:                1000   AIC:                               nan
Df Residuals:                     994   BIC:                               nan
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                     nan       

In [13]:
# Convert the data types of the columns in X to numeric
X = data[['Click Through Rate', 'Conversion Rate', 'Scroll_Depth', 'Session_Duration']].apply(pd.to_numeric)

# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
y = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
X = X.dropna()
y = y[X.index]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Purchases   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 13 Aug 2023   Prob (F-statistic):                nan
Time:                        21:42:05   Log-Likelihood:                    nan
No. Observations:                1000   AIC:                               nan
Df Residuals:                     995   BIC:                               nan
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                     nan       

In [14]:
# Convert the data types of the columns in X to numeric
X = data[['Conversion Rate', 'Scroll_Depth', 'Session_Duration']].apply(pd.to_numeric)

# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
y = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
X = X.dropna()
y = y[X.index]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Purchases   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 13 Aug 2023   Prob (F-statistic):                nan
Time:                        21:43:13   Log-Likelihood:                    nan
No. Observations:                1000   AIC:                               nan
Df Residuals:                     996   BIC:                               nan
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                   nan        nan  

In [15]:
# Convert the data types of the columns in X to numeric
X = data[['Conversion Rate', 'Session_Duration']].apply(pd.to_numeric)

# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
y = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
X = X.dropna()
y = y[X.index]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Purchases   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 13 Aug 2023   Prob (F-statistic):                nan
Time:                        21:44:07   Log-Likelihood:                    nan
No. Observations:                1000   AIC:                               nan
Df Residuals:                     997   BIC:                               nan
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                   nan        nan  

In [16]:
# Convert the data types of the columns in X to numeric
X = data[['Conversion Rate', 'Session_Duration']].apply(pd.to_numeric)

# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
y = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
X = X.dropna()
y = y[X.index]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Purchases   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Sun, 13 Aug 2023   Prob (F-statistic):                nan
Time:                        21:45:04   Log-Likelihood:                    nan
No. Observations:                1000   AIC:                               nan
Df Residuals:                     997   BIC:                               nan
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                   nan        nan  

In [17]:
data.dtypes

Theme                  object
Click Through Rate    float64
Conversion Rate       float64
Bounce Rate           float64
Scroll_Depth          float64
Age                     int64
Location               object
Session_Duration        int64
Purchases              object
Added_to_Cart          object
dtype: object

In [18]:
# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
data['Purchases'] = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
data = data.dropna()

# Define the independent and dependent variables
X = data[['Click Through Rate', 'Conversion Rate', 'Bounce Rate', 'Scroll_Depth', 'Age', 'Session_Duration']]
y = data['Purchases']

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

ValueError: zero-size array to reduction operation maximum which has no identity

In [19]:
# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
data['Purchases'] = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
data = data.dropna()

# Define the independent and dependent variables
X = data[['Click Through Rate', 'Conversion Rate', 'Bounce Rate', 'Scroll_Depth', 'Age', 'Session_Duration']]
y = data['Purchases']

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

ValueError: zero-size array to reduction operation maximum which has no identity

In [20]:
# Convert the data types of the columns in X to numeric
X = data[['Conversion Rate', 'Session_Duration']].apply(pd.to_numeric)

# Convert the data type of the 'Purchases' column to numeric with NaN values for non-numeric values
y = pd.to_numeric(data['Purchases'], errors='coerce')

# Remove any rows with missing or invalid values
X = X.dropna()
y = y[X.index]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

ValueError: zero-size array to reduction operation maximum which has no identity