# Regression Notebook

Regression modle with all data to estimating purchase value ('total_revenue').

#### Importing all the libraries needed for the workbook

In [2]:
# Main Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, explained_variance_score, r2_score
import statsmodels.api as sm



#### Providing a table of contents for the definitions of each feature in the dataset

In [3]:
# Define the data for the DataFrame
data = {
    "Variable": [
        "user_pseudo_id", "country", "city", "device_category", "operating_system",
        "sourcemedium", "new_user", "first_event", "last_event", "event_count",
        "items_viewed", "add_to_carts", "checkouts", "purchases", "total_revenue",
        "quantity", "sessions", "coupons", "click_jetzt_einkaufen", "Click_gewinne",
        "click_primary_header_wrapper", "click_noch_primary", "click_link_noch",
        "click_dreh", "click_jetzt_spielen", "click_gluecksrad_drehe", "click_code_anzeigen"
    ],
    "Type": [
        "Float64", "Object", "Object", "Object", "Object", "Object", "Int64", 
        "datetime64[ns]", "datetime64[ns]", "Int64", "Int64", "Int64", "Int64", 
        "Int64", "Float64", "Int64", "Int64", "Object", "Int64", "Int64", 
        "Int64", "Int64", "Int64", "Int64", "Int64", "Int64", "Int64"
    ],
    "Definition": [
        "A numerical (float) identifier for users that has been anonymized.",
        "Categorical variable indicating the user's country.",
        "Categorical variable indicating the user's city.",
        "Categorical variable indicating the type of device (e.g., mobile, desktop).",
        "Categorical variable indicating the operating system of the user's device.",
        "Categorical variable indicating the source/medium through which the user accessed the site/app.",
        "Binary indicator whether the user is new (1) or returning (0).",
        "Datetime variable indicating the time of the first event recorded.",
        "Datetime variable indicating the time of the last event recorded.",
        "Numerical variable indicating the total number of events recorded.",
        "Numerical variable indicating the number of items viewed.",
        "Numerical variable indicating how many times items were added to the cart.",
        "Numerical variable indicating how many times a checkout process was initiated.",
        "Numerical variable indicating the number of purchases made.",
        "Numerical (float) indicating the total revenue generated from the user.",
        "Numerical variable indicating the quantity of items involved in transactions.",
        "Numerical variable indicating the number of sessions.",
        "Categorical variable indicating the use of coupons.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website.",
        "Numerical variable attributed to a specific click event recorded on the website."
    ]
}

# Create the DataFrame
variable_info_df = pd.DataFrame(data)

# Set display options
pd.set_option('display.max_colwidth', None)  # or use a large number like 1000 instead of None for older Pandas versions
pd.set_option('display.max_columns', None)  # Ensures all columns are displayed

# Display the DataFrame to verify
variable_info_df

Unnamed: 0,Variable,Type,Definition
0,user_pseudo_id,Float64,A numerical (float) identifier for users that has been anonymized.
1,country,Object,Categorical variable indicating the user's country.
2,city,Object,Categorical variable indicating the user's city.
3,device_category,Object,"Categorical variable indicating the type of device (e.g., mobile, desktop)."
4,operating_system,Object,Categorical variable indicating the operating system of the user's device.
5,sourcemedium,Object,Categorical variable indicating the source/medium through which the user accessed the site/app.
6,new_user,Int64,Binary indicator whether the user is new (1) or returning (0).
7,first_event,datetime64[ns],Datetime variable indicating the time of the first event recorded.
8,last_event,datetime64[ns],Datetime variable indicating the time of the last event recorded.
9,event_count,Int64,Numerical variable indicating the total number of events recorded.


#### Manipulating and cleaning the dataset to be used for classification modeling. 

Here we will import the 2 CSV files that contain the data. I create new columns for the 2 types of gamification options that were run, 'gluecksrad_engagement' & 'wbyo_engagement', and then merget the datasets where then add the overall engagement of gamification 'gamification_engagement'.

In [4]:
#importing data wheel
file_path1 = 'nov24.csv'
wheel = pd.read_csv(file_path1)
#importing data wheel
file_path2 = 'nov17.csv'
wbyo = pd.read_csv(file_path2)

In [5]:
#adding the date each promotion was run.
wheel['date'] = pd.to_datetime('2023-11-24')
wbyo['date'] = pd.to_datetime('2023-11-17')

# Specify the columns to check for the condition on the wheel
columns_to_check1 = ['click_noch_primary', 'click_dreh', 'click_jetzt_spielen', 'click_gluecksrad_drehe', 'click_code_anzeigen']

# Use np.where to create the new column based on the condition on the wheel
wheel['gluecksrad_engagement'] = np.where(wheel[columns_to_check1].gt(0).any(axis=1), 1, 0)

# Specify the columns to check for the condition on the wbyo
columns_to_check2 = ['click_noch_primary', 'click_jetzt_einkaufen', 'click_gewinne', 'click_link_noch']

# Use np.where to create the new column based on the condition on the wbyo
wbyo['wbyo_engagement'] = np.where(wbyo[columns_to_check2].gt(0).any(axis=1), 1, 0)

#adding 0 to each dataframe for the new engagement columns so that the 2 dataframes can be merged.
wheel['wbyo_engagement'] = 0
wbyo['gluecksrad_engagement'] = 0

# Concatenate the DataFrames by adding the rows of 'wheel' to 'wbyo'
df = pd.concat([wheel, wbyo], ignore_index=True)

# Specify the columns to check for the condition on the wbyo
columns_to_check3 = ['wbyo_engagement', 'gluecksrad_engagement']

# Use np.where to create the new column based on the condition on the wbyo
df['gamification_engagement'] = np.where(df[columns_to_check3].gt(0).any(axis=1), 1, 0)

df

Unnamed: 0,user_pseudo_id,country,city,device_category,operating_system,sourcemedium,new_user,first_event,last_event,event_count,items_viewed,add_to_carts,checkouts,purchases,total_revenue,quantity,sessions,coupons,click_jetzt_einkaufen,click_gewinne,click_primary_header_wrapper,click_noch_primary,click_link_noch,click_dreh,click_jetzt_spielen,click_gluecksrad_drehe,click_code_anzeigen,date,gluecksrad_engagement,wbyo_engagement,gamification_engagement
0,1.653411e+09,Germany,Berlin,mobile,iOS,TBD,1,1700842391176249,1700842456749569,16,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0
1,1.147704e+09,Germany,Mindelheim,mobile,iOS,TBD,1,1700848653518514,1700848692399166,19,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0
2,1.582437e+09,Switzerland,Bulle,mobile,iOS,TBD,1,1700852395847646,1700852628033297,16,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0
3,1.800091e+09,Germany,Frankfurt,mobile,iOS,TBD,1,1700828032485232,1700828308513799,65,1,1,1,1,46.56,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0
4,3.656791e+08,Germany,,mobile,iOS,TBD,1,1700823107022159,1700823183682287,9,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195586,2.082985e+09,Germany,Cuxhaven,mobile,Android,TBD,1,1700220385242808,1700220398841543,6,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0
195587,2.129320e+09,Germany,Bielefeld,mobile,iOS,TBD,1,1700222757561626,1700222798355484,13,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0
195588,2.130216e+09,Germany,Hildesheim,mobile,iOS,TBD,1,1700220383090692,1700220414150397,12,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0
195589,2.131727e+09,Germany,Siegen,mobile,Android,TBD,1,1700220929845757,1700220929845757,3,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0


In [6]:
# Count the number of rows where 'wbyo_engagement' is 1
num_rows_value_1_wbyo_df = (df['wbyo_engagement'] == 1).sum()

print(f"Number of rows where 'wbyo_engagement' is 1 in df: {num_rows_value_1_wbyo_df}")

# Count the number of rows where 'gluecksrad_engagement' is 1
num_rows_value_1_wheel_df = (df['gluecksrad_engagement'] == 1).sum()

print(f"Number of rows where 'gluecksrad_engagement' is 1 in df: {num_rows_value_1_wheel_df}")

# Count the number of rows where 'gamification_engagement' is 1
num_rows_value_1_gamification_df = (df['gamification_engagement'] == 1).sum()

print(f"Number of rows where 'gamification_engagement' is 1 in df: {num_rows_value_1_gamification_df}")

Number of rows where 'wbyo_engagement' is 1 in df: 1826
Number of rows where 'gluecksrad_engagement' is 1 in df: 22056
Number of rows where 'gamification_engagement' is 1 in df: 23882


In [7]:
#rounding off decimal place to 2 decimals for code like "describe", ect
pd.set_option('display.float_format', lambda x: '%.2f' % x,)

Below I take the first and last event to be able to find the amount of time spent on the website.

In [8]:
# Convert 'first_event' and 'last_event' from Unix timestamps in microseconds to datetime
df['first_event'] = pd.to_datetime(df['first_event'], unit='us')
df['last_event'] = pd.to_datetime(df['last_event'], unit='us')

# Subtract 'first_event' from 'last_event' to get the time difference
df['time_diff'] = (df['last_event'] - df['first_event'])

# Convert the time difference to total seconds, creating a numerical column
df['time_spent_seconds'] = df['time_diff'].dt.total_seconds()

df

Unnamed: 0,user_pseudo_id,country,city,device_category,operating_system,sourcemedium,new_user,first_event,last_event,event_count,items_viewed,add_to_carts,checkouts,purchases,total_revenue,quantity,sessions,coupons,click_jetzt_einkaufen,click_gewinne,click_primary_header_wrapper,click_noch_primary,click_link_noch,click_dreh,click_jetzt_spielen,click_gluecksrad_drehe,click_code_anzeigen,date,gluecksrad_engagement,wbyo_engagement,gamification_engagement,time_diff,time_spent_seconds
0,1653410570.17,Germany,Berlin,mobile,iOS,TBD,1,2023-11-24 16:13:11.176249,2023-11-24 16:14:16.749569,16,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0,0 days 00:01:05.573320,65.57
1,1147704469.17,Germany,Mindelheim,mobile,iOS,TBD,1,2023-11-24 17:57:33.518514,2023-11-24 17:58:12.399166,19,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0,0 days 00:00:38.880652,38.88
2,1582437117.17,Switzerland,Bulle,mobile,iOS,TBD,1,2023-11-24 18:59:55.847646,2023-11-24 19:03:48.033297,16,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0,0 days 00:03:52.185651,232.19
3,1800090715.17,Germany,Frankfurt,mobile,iOS,TBD,1,2023-11-24 12:13:52.485232,2023-11-24 12:18:28.513799,65,1,1,1,1,46.56,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0,0 days 00:04:36.028567,276.03
4,365679149.17,Germany,,mobile,iOS,TBD,1,2023-11-24 10:51:47.022159,2023-11-24 10:53:03.682287,9,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-24,0,0,0,0 days 00:01:16.660128,76.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195586,2082984818.17,Germany,Cuxhaven,mobile,Android,TBD,1,2023-11-17 11:26:25.242808,2023-11-17 11:26:38.841543,6,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0,0 days 00:00:13.598735,13.60
195587,2129320152.17,Germany,Bielefeld,mobile,iOS,TBD,1,2023-11-17 12:05:57.561626,2023-11-17 12:06:38.355484,13,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0,0 days 00:00:40.793858,40.79
195588,2130215726.17,Germany,Hildesheim,mobile,iOS,TBD,1,2023-11-17 11:26:23.090692,2023-11-17 11:26:54.150397,12,1,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0,0 days 00:00:31.059705,31.06
195589,2131727157.17,Germany,Siegen,mobile,Android,TBD,1,2023-11-17 11:35:29.845757,2023-11-17 11:35:29.845757,3,0,0,0,0,0.00,TBD,1,TBD,0,0,0,0,0,0,0,0,0,2023-11-17,0,0,0,0 days 00:00:00,0.00


To follow, I then drop the columns that either are no longer applicable as new columns have been created off the raw data, or the features contain no data points and cannot be used in the analysis.

In [9]:
# List of specific columns to drop
columns_to_drop = ['time_diff', 'coupons', 'quantity', 'sourcemedium', 'first_event', 'last_event', 'click_jetzt_einkaufen', 'click_gewinne', 'click_primary_header_wrapper', 'click_noch_primary', 'click_link_noch', 'click_dreh', 'click_jetzt_spielen', 'click_gluecksrad_drehe', 'click_code_anzeigen']

# Dropping the columns from the DataFrame
df = df.drop(columns=columns_to_drop, errors='ignore')  # errors='ignore' to avoid errors if a column is missing

# Display the DataFrame to verify the columns have been dropped
df.info(verbose=True)

df1 = df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195591 entries, 0 to 195590
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   user_pseudo_id           195591 non-null  float64       
 1   country                  195588 non-null  object        
 2   city                     176991 non-null  object        
 3   device_category          195591 non-null  object        
 4   operating_system         195590 non-null  object        
 5   new_user                 195591 non-null  int64         
 6   event_count              195591 non-null  int64         
 7   items_viewed             195591 non-null  int64         
 8   add_to_carts             195591 non-null  int64         
 9   checkouts                195591 non-null  int64         
 10  purchases                195591 non-null  int64         
 11  total_revenue            195591 non-null  float64       
 12  sessions        

This code is to create dummies for the different type of device categories that can then be used for the analysis. Then the original feature is dropped.

In [10]:
# Create dummy variables for 'device_category'
device_category_dummies = pd.get_dummies(df['device_category'], prefix='device_category')

# Drop the original 'device_category' column from 'df'
df = df.drop('device_category', axis=1)

# Concatenate the dummy variables DataFrame with the original 'df' DataFrame
df = pd.concat([df, device_category_dummies], axis=1)

Here we filter out the data to only have data on users that were in Germany since we do not ship product outside of DE. There might be the small number of users that might be outside of DE that are trying to purchase whilst on holiday or traveling, but since the company has multiple domains, there are a lot of users who accendentally arrive on the DE store when they meant to be on the Austrian or Swiss website, and thus eliminating all other countries helps to have more clean data.

Additionally I also created dummy features for the type of operating systems, like done for devices, and then dropped the original feature.

Furthermore I dropped the Country, city and user ID since these features would not be able to be used for the models.

Finally I created a converted the Date Column as there were two types of gamification options, each run on a single day (different dates), thus making it a binary variable and allowing it to be used for the models.

In [11]:
# Filter 'df' to keep only rows where the 'country' column is 'Germany'
#This is becuase purchases can only be made in germany for this specific store.
df = df[df['country'] == 'Germany']

# Create dummy variables for 'operating_system'
operating_system_dummies = pd.get_dummies(df['operating_system'], prefix='operating_system')

# Drop the original 'operating_system' column from 'df'
df = df.drop('operating_system', axis=1)

# Concatenate the dummy variables DataFrame with the original 'df' DataFrame
df = pd.concat([df, operating_system_dummies], axis=1)

#Dropping the Country and City due to country only being germany and city scope being to wide
#user_pseudo_id is removed as it is not needed either for the OLS

# List of specific columns to drop
columns_to_drop = ['country', 'city', 'user_pseudo_id']

# Dropping the columns from the DataFrame
df = df.drop(columns=columns_to_drop, errors='ignore')  # errors='ignore' to avoid errors if a column is missing

# Convert 'date' column to dummy variable: 0 for '2023-11-17' and 1 for '2023-11-24'
df['date'] = df['date'].apply(lambda x: 1 if x == pd.Timestamp('2023-11-24') else 0)

# Display the DataFrame to verify the columns have been dropped
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187930 entries, 0 to 195590
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   new_user                     187930 non-null  int64  
 1   event_count                  187930 non-null  int64  
 2   items_viewed                 187930 non-null  int64  
 3   add_to_carts                 187930 non-null  int64  
 4   checkouts                    187930 non-null  int64  
 5   purchases                    187930 non-null  int64  
 6   total_revenue                187930 non-null  float64
 7   sessions                     187930 non-null  int64  
 8   date                         187930 non-null  int64  
 9   gluecksrad_engagement        187930 non-null  int64  
 10  wbyo_engagement              187930 non-null  int64  
 11  gamification_engagement      187930 non-null  int64  
 12  time_spent_seconds           187930 non-null  float64
 13 

In [12]:
df2 = df

### Building the Reggression Models

#### Data Preprocessing
Assuming that df1 is already loaded and it contains the features and target mentioned:

In [13]:
# Dropping columns that are not useful or cannot be processed (like IDs, dates, or other non-predictive information)
X = df1.drop(columns=['user_pseudo_id', 'date', 'total_revenue'])
y = df1['total_revenue']

# Handling categorical data and missing values
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Create a transformer for numerical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Fill missing values with mean
    ('scaler', StandardScaler())  # Scale data
])

# Create a transformer for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Fill missing categories with 'missing'
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One hot encode categories
])

# Combine transformers into a preprocessor step
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


#### Define Models and Pipelines

In [14]:
# Dictionary of regression models
regression_models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Gradient Boosting Regressor': GradientBoostingRegressor(),
    #'SVR': SVR()
}

# Create pipelines for each model
model_pipelines = {name: Pipeline([('preprocessor', preprocessor),
                                   ('regressor', model)]) for name, model in regression_models.items()}

#### Split Data, Train, and Evaluate Models

In [16]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Prepare a list to hold all results
results = []

# Function to evaluate models and store results in a list
def evaluate_model(model, X_test, y_test, name):
    y_pred = model.predict(X_test)
    results.append({
        "Model": name,
        "MAE": round(mean_absolute_error(y_test, y_pred), 2),
        "MSE": round(mean_squared_error(y_test, y_pred), 2),
        "MedAE": round(median_absolute_error(y_test, y_pred), 2),
        "EVS": round(explained_variance_score(y_test, y_pred), 2),
        "R2": round(r2_score(y_test, y_pred), 2)
    })

# Train and evaluate each model
for name, pipeline in model_pipelines.items():
    pipeline.fit(X_train, y_train)
    evaluate_model(pipeline, X_test, y_test, name)

# Convert results list to DataFrame for nice table format
results_df = pd.DataFrame(results)

# Optional: Format the 'MSE' column to show commas for thousands
results_df['MSE'] = results_df['MSE'].apply(lambda x: f"{x:,.2f}")

# Display the results DataFrame
results_df


Unnamed: 0,Model,MAE,MSE,MedAE,EVS,R2
0,Linear Regression,23.0,8067.67,6.66,0.51,0.51
1,Ridge,22.82,8020.96,6.66,0.51,0.51
2,Lasso,20.17,7915.41,5.05,0.52,0.52
3,ElasticNet,20.85,8507.14,3.47,0.48,0.48
4,Decision Tree Regressor,18.19,14414.1,0.0,0.12,0.12
5,Random Forest Regressor,13.64,7683.48,0.0,0.53,0.53
6,Gradient Boosting Regressor,13.62,7160.75,0.08,0.56,0.56


### OLS

In [18]:
# One-hot encode 'device_category' and 'operating_system'
df1 = pd.get_dummies(df1, columns=['device_category', 'operating_system'], drop_first=True)

# Drop unnecessary features
df1 = df1.drop(columns=['user_pseudo_id', 'country', 'city', 'date'], errors='ignore')

# Check for NaN values and handle them if necessary
df1 = df1.fillna(0)  # Simple strategy, replace NaNs with 0 or another strategy as needed

# Ensure all data types are numeric
print(df.dtypes)


# Separate independent variables (X) and dependent variable (y)
X = df1.drop(columns=['total_revenue'])
y = df1['total_revenue']

# Add a constant term to the model for the intercept
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Display the summary which includes coefficients, p-values, R-squared, etc.
print(model.summary())


new_user                         int64
event_count                      int64
items_viewed                     int64
add_to_carts                     int64
checkouts                        int64
purchases                        int64
total_revenue                  float64
sessions                         int64
date                             int64
gluecksrad_engagement            int64
wbyo_engagement                  int64
gamification_engagement          int64
time_spent_seconds             float64
device_category_desktop          uint8
device_category_mobile           uint8
device_category_tablet           uint8
operating_system_Android         uint8
operating_system_BlackBerry      uint8
operating_system_Chrome OS       uint8
operating_system_Linux           uint8
operating_system_Macintosh       uint8
operating_system_Windows         uint8
operating_system_iOS             uint8
dtype: object
                            OLS Regression Results                            
Dep. Varia

In [23]:
# Separate independent variables (X) and dependent variable (y)
X = df1.drop(columns=['checkouts'])
y = df1['checkouts']

# Add a constant term to the model for the intercept
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Display the summary which includes coefficients, p-values, R-squared, etc.
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              checkouts   R-squared:                       0.616
Model:                            OLS   Adj. R-squared:                  0.616
Method:                 Least Squares   F-statistic:                 1.743e+04
Date:                Sat, 24 Aug 2024   Prob (F-statistic):               0.00
Time:                        15:23:23   Log-Likelihood:                -1612.4
No. Observations:              195591   AIC:                             3263.
Df Residuals:                  195572   BIC:                             3456.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             