In [5]:
import pandas as pd
import numpy as np
import scipy.stats as stats 
import matplotlib.pyplot as plt
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm
# Use a white-grid plotting style for all figures in this notebook
plt.style.use('seaborn-v0_8-whitegrid')

# Load the Walmart purchase data and lower-case every column name
df = pd.read_csv('./datasets/walmart_data.csv').rename(columns=str.lower)
df.head()

Unnamed: 0,user_id,product_id,gender,age,occupation,city_category,stay_in_current_city_years,marital_status,product_category,purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,7969


In [22]:
# Collapse the purchase rows to one row per user:
#   product_id -> count of (non-null) product_id entries for the user
#   purchase   -> total amount purchased by the user
# Every demographic column is reduced with 'min' to a single value per user.
user_level_aggs = {'product_id': 'count', 'purchase': 'sum'}
for demographic_col in ('occupation', 'gender', 'age', 'city_category',
                        'stay_in_current_city_years', 'marital_status'):
    user_level_aggs[demographic_col] = 'min'

user_agg_df = df.groupby('user_id').agg(user_level_aggs).reset_index()

user_agg_df.head()

Unnamed: 0,user_id,product_id,purchase,occupation,gender,age,city_category,stay_in_current_city_years,marital_status
0,1000001,35,334093,10,F,0-17,A,2,0
1,1000002,77,810472,16,M,55+,C,4+,0
2,1000003,29,341635,15,M,26-35,A,3,0
3,1000004,14,206468,7,M,46-50,B,2,1
4,1000005,106,821001,20,M,26-35,A,1,1


In [28]:
# Show the per-user aggregate table; for a long frame the notebook display
# is truncated to the first and last rows.
user_agg_df

Unnamed: 0,user_id,product_id,purchase,occupation,gender,age,city_category,stay_in_current_city_years,marital_status
0,1000001,35,334093,10,F,0-17,A,2,0
1,1000002,77,810472,16,M,55+,C,4+,0
2,1000003,29,341635,15,M,26-35,A,3,0
3,1000004,14,206468,7,M,46-50,B,2,1
4,1000005,106,821001,20,M,26-35,A,1,1
...,...,...,...,...,...,...,...,...,...
5886,1006036,514,4116058,15,F,26-35,B,4+,1
5887,1006037,122,1119538,1,F,46-50,C,4+,0
5888,1006038,12,90034,1,F,55+,C,2,0
5889,1006039,74,590319,0,F,46-50,B,4+,1


In [15]:
# Average total purchase per user, split by gender
user_agg_df.groupby('gender')['purchase'].mean()

gender
F    712024.394958
M    925344.402367
Name: purchase, dtype: float64

In [17]:
# Simple regression of each user's total purchase on a female indicator.
# With a single 0/1 regressor the intercept is the mean of the group coded 0
# (non-female users) and the 'Female' coefficient is the difference in means.
Y = user_agg_df['purchase']
# Build the dummy from the gender Series and name it for what it encodes, so
# the summary row reads 'Female' instead of the ambiguous 'gender'.
X = pd.DataFrame({'Female': (user_agg_df['gender'] == 'F').astype(int)})
X = sm.add_constant(X)
model = sm.OLS(Y, X)
results = model.fit()
# Print the summary of the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               purchase   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     61.69
Date:                Thu, 13 Feb 2025   Prob (F-statistic):           4.75e-15
Time:                        19:06:40   Log-Likelihood:                -89373.
No. Observations:                5891   AIC:                         1.788e+05
Df Residuals:                    5889   BIC:                         1.788e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       9.253e+05   1.44e+04     64.067      0.0

In [19]:
# Add 0/1 indicator columns to the per-user table for use as regressors.
# Users in neither city 'B' nor city 'C' have both city indicators equal to 0.
user_agg_df['CityC'] = (user_agg_df['city_category'] == 'C').astype(int)
user_agg_df['CityB'] = (user_agg_df['city_category'] == 'B').astype(int)
# Compare the gender Series (single brackets), like the two lines above,
# rather than assigning a one-column DataFrame to a single column.
user_agg_df['Female'] = (user_agg_df['gender'] == 'F').astype(int)

In [20]:
# Multiple regression: each user's total purchase on the city and gender
# indicator columns, with an intercept term.
regressors = ['CityC', 'CityB', 'Female']
Y = user_agg_df['purchase']
X = sm.add_constant(user_agg_df[regressors])
model = sm.OLS(Y, X)
results = model.fit()
# Show the fitted coefficients and fit statistics
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               purchase   R-squared:                       0.155
Model:                            OLS   Adj. R-squared:                  0.155
Method:                 Least Squares   F-statistic:                     360.9
Date:                Thu, 13 Feb 2025   Prob (F-statistic):          3.68e-215
Time:                        19:11:00   Log-Likelihood:                -88907.
No. Observations:                5891   AIC:                         1.778e+05
Df Residuals:                    5887   BIC:                         1.778e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.323e+06   2.78e+04     47.679      0.0

In [31]:
# Recompute the per-user table from df. This replaces the previous
# user_agg_df, including any indicator columns that were added to it.
demographic_cols = ['occupation', 'gender', 'age', 'city_category',
                    'stay_in_current_city_years', 'marital_status']
per_user = df.groupby('user_id').agg({
    'product_id': 'count',   # count of product_id entries per user
    'purchase': 'sum',       # total purchase amount per user
    **dict.fromkeys(demographic_cols, 'min'),
})
user_agg_df = per_user.reset_index()

# Keep the aggregated column labels for use by later cells
columns = user_agg_df.columns


In [None]:
# Regress each user's total purchase on the number of products bought plus
# one-hot encoded gender and age.
features = ['gender', 'age']
# drop_first=True omits one level of each feature, which acts as the baseline
df_encoded = pd.get_dummies(user_agg_df, columns=features, drop_first=True)

Y = df_encoded['purchase']
# get_dummies replaces each encoded feature with new indicator columns, so the
# dummies are exactly the columns that are not present in user_agg_df.
# Selecting the regressors explicitly keeps this cell independent of
# variables defined in other cells and stops unrelated columns leaking into X.
dummy_cols = [c for c in df_encoded.columns if c not in user_agg_df.columns]
# product_id is the number of products per user (a count, not an identifier);
# cast everything to float so the indicator columns are numeric for OLS.
X = df_encoded[['product_id'] + dummy_cols].astype(float)
X = sm.add_constant(X)

model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               purchase   R-squared:                       0.960
Model:                            OLS   Adj. R-squared:                  0.960
Method:                 Least Squares   F-statistic:                 1.749e+04
Date:                Thu, 13 Feb 2025   Prob (F-statistic):               0.00
Time:                        19:39:17   Log-Likelihood:                -79947.
No. Observations:                5891   AIC:                         1.599e+05
Df Residuals:                    5882   BIC:                         1.600e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.163e+04   1.34e+04     -1.615      0.1