In [6]:
import pandas as pd

thisdata = pd.read_csv("../data/raw/shopping_trends_updated.csv")
print(thisdata.head())
print(thisdata.info())

   Customer ID  Age Gender Item Purchased  Category  Purchase Amount (USD)  \
0            1   55   Male         Blouse  Clothing                     53   
1            2   19   Male        Sweater  Clothing                     64   
2            3   50   Male          Jeans  Clothing                     73   
3            4   21   Male        Sandals  Footwear                     90   
4            5   45   Male         Blouse  Clothing                     49   

        Location Size      Color  Season  Review Rating Subscription Status  \
0       Kentucky    L       Gray  Winter            3.1                 Yes   
1          Maine    L     Maroon  Winter            3.1                 Yes   
2  Massachusetts    S     Maroon  Spring            3.1                 Yes   
3   Rhode Island    M     Maroon  Spring            3.5                 Yes   
4         Oregon    M  Turquoise  Spring            2.7                 Yes   

   Shipping Type Discount Applied Promo Code Used  Previ

In [7]:
# Check missing values
print(thisdata.isnull().sum())

thisdata.dropna(inplace=True)

# Categorical variables
thisdata['Payment Method'] = thisdata['Payment Method'].astype('category')
thisdata['Frequency of Purchases'] = thisdata['Frequency of Purchases'].astype('category')

print(thisdata.dtypes)

Customer ID               0
Age                       0
Gender                    0
Item Purchased            0
Category                  0
Purchase Amount (USD)     0
Location                  0
Size                      0
Color                     0
Season                    0
Review Rating             0
Subscription Status       0
Shipping Type             0
Discount Applied          0
Promo Code Used           0
Previous Purchases        0
Payment Method            0
Frequency of Purchases    0
dtype: int64
Customer ID                  int64
Age                          int64
Gender                      object
Item Purchased              object
Category                    object
Purchase Amount (USD)        int64
Location                    object
Size                        object
Color                       object
Season                      object
Review Rating              float64
Subscription Status         object
Shipping Type               object
Discount Applied            

In [8]:
# Rename columns because of space in names (BIG PROBLEM)
thisdata.rename(columns={
    'Purchase Amount (USD)': 'Purchase_Amount_USD',
    'Frequency of Purchases': 'Frequency_of_Purchases',
    'Payment Method': 'Payment_Method'
}, inplace=True)

PART 1. Correlation between payment method and frequency of purchases

In [9]:
from scipy.stats import chi2_contingency
# SINCE BOTH VARIABLES PAYMENT METHOD & FREQUENY OF PURCHASES ARE CATEGORICAL VARIABLES. Hence used chi-square test.
contingency_table = pd.crosstab(thisdata['Payment_Method'], thisdata['Frequency_of_Purchases'])
# Chi-square
chi2, p, dof, ex = chi2_contingency(contingency_table)
print("Chi-square test statistic:", chi2)
print("p-value:",p)

Chi-square test statistic: 34.490501807370485
p-value: 0.26165013012971255


PART 2.Correlation between payment amount and frequency of purchases.

In [10]:
# One variable is numeric one is categorical. Hence used ANOVA.

import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('Purchase_Amount_USD ~ C(Frequency_of_Purchases)', data=thisdata).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

                                 sum_sq      df         F    PR(>F)
C(Frequency_of_Purchases)  1.371183e+03     6.0  0.406993  0.874853
Residual                   2.185959e+06  3893.0       NaN       NaN


Part 3. Correlation between payment amount and payment method.

In [11]:
model = ols('Purchase_Amount_USD ~ C(Payment_Method)', data=thisdata).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

                         sum_sq      df         F   PR(>F)
C(Payment_Method)  1.514203e+03     5.0  0.539506  0.74648
Residual           2.185816e+06  3894.0       NaN      NaN


Hypothesis - Customer spends more money when using credit card/paypal/venmo than when using bank transfer/cash

In [12]:
import scipy.stats as stats

credit_card = thisdata[thisdata['Payment_Method'] == 'Credit Card']['Purchase_Amount_USD']
paypal = thisdata[thisdata['Payment_Method'] == 'PayPal']['Purchase_Amount_USD']
venmo = thisdata[thisdata['Payment_Method'] == 'Venmo']['Purchase_Amount_USD']
bank_transfer = thisdata[thisdata['Payment_Method'] == 'Bank Transfer']['Purchase_Amount_USD']
cash = thisdata[thisdata['Payment_Method'] == 'Cash']['Purchase_Amount_USD']

# T TEsts
t_stat, p_value = stats.ttest_ind(credit_card, bank_transfer)
print(f'Credit Card vs Bank Transfer: t-statistic = {t_stat}, p-value = {p_value}')

t_stat, p_value = stats.ttest_ind(credit_card, cash)
print(f'Credit Card vs Cash: t-statistic = {t_stat}, p-value = {p_value}')

t_stat, p_value = stats.ttest_ind(paypal, bank_transfer)
print(f'PayPal vs Bank Transfer: t-statistic = {t_stat}, p-value = {p_value}')

t_stat, p_value = stats.ttest_ind(paypal, cash)
print(f'PayPal vs Cash: t-statistic = {t_stat}, p-value = {p_value}')

t_stat, p_value = stats.ttest_ind(venmo, bank_transfer)
print(f'Venmo vs Bank Transfer: t-statistic = {t_stat}, p-value = {p_value}')

t_stat, p_value = stats.ttest_ind(venmo, cash)
print(f'Venmo vs Cash: t-statistic = {t_stat}, p-value = {p_value}')

Credit Card vs Bank Transfer: t-statistic = 0.2768772161162005, p-value = 0.7819190567483248
Credit Card vs Cash: t-statistic = 0.29182924247799114, p-value = 0.7704623907835844
PayPal vs Bank Transfer: t-statistic = -0.35027713275144834, p-value = 0.726188042230721
PayPal vs Cash: t-statistic = -0.35536861949469034, p-value = 0.7223691755747064
Venmo vs Bank Transfer: t-statistic = -0.5684275128089873, p-value = 0.5698473043590351
Venmo vs Cash: t-statistic = -0.580125207015135, p-value = 0.561930564342868


Regression analysis - Dependant variable purchase_amount_usd / Independant variables are Age, Previous Purchases, Frequecy_of_Purchases and Payment_method

In [16]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

thisdata['Payment_Method'] = thisdata['Payment_Method'].astype('category')
thisdata['Frequency_of_Purchases'] = thisdata['Frequency_of_Purchases'].astype('category')

# Categorical datatype
thisdata['Payment_Method'] = thisdata['Payment_Method'].cat.codes
thisdata['Frequency_of_Purchases'] = thisdata['Frequency_of_Purchases'].cat.codes

X = thisdata[['Age', 'Previous Purchases', 'Frequency_of_Purchases', 'Payment_Method']]
y = thisdata['Purchase_Amount_USD']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Prediction
y_pred = regressor.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Summary
X_train_sm = sm.add_constant(X_train)  # Adds a constant term to the predictor
model = sm.OLS(y_train, X_train_sm).fit()
print(model.summary())

# Pvalues
p_values = model.pvalues
print('P-values of the predictors:')
print(p_values)

Mean Squared Error: 559.1233236587796
R^2 Score: -0.0033623922396965877
                             OLS Regression Results                            
Dep. Variable:     Purchase_Amount_USD   R-squared:                       0.001
Model:                             OLS   Adj. R-squared:                 -0.001
Method:                  Least Squares   F-statistic:                    0.4399
Date:                 Sat, 22 Jun 2024   Prob (F-statistic):              0.780
Time:                         11:42:51   Log-Likelihood:                -12515.
No. Observations:                 2730   AIC:                         2.504e+04
Df Residuals:                     2725   BIC:                         2.507e+04
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------