In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
file_path = './../data/processed/Ecommerce_final.csv'
df = pd.read_csv(file_path)

In [5]:
df.head()

Unnamed: 0,CID,TID,Gender,Age Group,Purchase Date,Product Category,Discount Availed,Discount Name,Discount Amount (INR),Gross Amount,Net Amount,Purchase Method,Location,is_discount_overflow
0,943146,5876328741,Female,25-45,2023-08-30 20:27:08,Electronics,Yes,FESTIVE50,64.3,725.304,661.004,Credit Card,Ahmedabad,False
1,180079,1018503182,Male,25-45,2024-02-23 09:33:46,Electronics,Yes,SEASONALOFFER21,175.19,4638.991875,4463.801875,Credit Card,Bangalore,False
2,337580,3814082218,Other,60 and above,2022-03-06 09:09:50,Clothing,Yes,SEASONALOFFER21,211.54,1986.372575,1774.832575,Credit Card,Delhi,False
3,180333,1395204173,Other,60 and above,2020-11-04 04:41:57,Sports & Fitness,No,No Discount,0.0,5695.61265,5695.61265,Debit Card,Delhi,False
4,447553,8009390577,Male,18-25,2022-05-31 17:00:32,Sports & Fitness,Yes,WELCOME5,439.92,2292.6515,1852.7315,Credit Card,Delhi,False


In [6]:
df['Age Group'].value_counts(normalize=True)

Age Group
25-45           0.400182
18-25           0.298745
45-60           0.201891
under 18        0.050382
60 and above    0.048800
Name: proportion, dtype: float64

In [7]:
df['Age Group'] = pd.Categorical(
    df['Age Group'], 
    categories=['under 18', '18-25', '25-45', '45-60', '60 and above'],
    ordered=True
)
df_final = pd.get_dummies(df, columns=["Age Group"], drop_first=True)

In [8]:
df_final.head()

Unnamed: 0,CID,TID,Gender,Purchase Date,Product Category,Discount Availed,Discount Name,Discount Amount (INR),Gross Amount,Net Amount,Purchase Method,Location,is_discount_overflow,Age Group_18-25,Age Group_25-45,Age Group_45-60,Age Group_60 and above
0,943146,5876328741,Female,2023-08-30 20:27:08,Electronics,Yes,FESTIVE50,64.3,725.304,661.004,Credit Card,Ahmedabad,False,False,True,False,False
1,180079,1018503182,Male,2024-02-23 09:33:46,Electronics,Yes,SEASONALOFFER21,175.19,4638.991875,4463.801875,Credit Card,Bangalore,False,False,True,False,False
2,337580,3814082218,Other,2022-03-06 09:09:50,Clothing,Yes,SEASONALOFFER21,211.54,1986.372575,1774.832575,Credit Card,Delhi,False,False,False,False,True
3,180333,1395204173,Other,2020-11-04 04:41:57,Sports & Fitness,No,No Discount,0.0,5695.61265,5695.61265,Debit Card,Delhi,False,False,False,False,True
4,447553,8009390577,Male,2022-05-31 17:00:32,Sports & Fitness,Yes,WELCOME5,439.92,2292.6515,1852.7315,Credit Card,Delhi,False,True,False,False,False


In [9]:
X_cols = [col for col in df_final.columns if "Age Group_" in col]
X = df_final[X_cols]
y = df['Gross Amount']

In [10]:
model_raw = LinearRegression()
model_raw.fit(X,y)
y_pred_raw = model_raw.predict(X)


In [11]:
print("Intercept (середня витрата для <18):", model_raw.intercept_)

coef = pd.Series(model_raw.coef_, index=X.columns)
print("\nКоефіцієнти для кожної вікової групи (відносно <18):")
print(coef)

Intercept (середня витрата для <18): 3026.510347257321

Коефіцієнти для кожної вікової групи (відносно <18):
Age Group_18-25          -41.829435
Age Group_25-45           14.577409
Age Group_45-60          -34.110870
Age Group_60 and above    -0.498505
dtype: float64


In [12]:
mse = mean_squared_error(y, y_pred_raw)
r2 = r2_score(y, y_pred_raw)
print("\nMSE:", mse)
print("R²:", r2)


MSE: 2952293.2058233977
R²: 0.00022297369370594122


In [13]:
import statsmodels.api as sm

In [14]:
X = df_final[X_cols].astype(int)

# 2️⃣ Додаємо константу для intercept
X_sm = sm.add_constant(X)
# 3️⃣ Створюємо та навчаємо модель OLS
model_sm = sm.OLS(y, X_sm).fit()

# 4️⃣ Виводимо тільки p-values
print("P-values для вікових груп (відносно <18):")
print(model_sm.pvalues)

P-values для вікових груп (відносно <18):
const                     0.000000
Age Group_18-25           0.235870
Age Group_25-45           0.673850
Age Group_45-60           0.349878
Age Group_60 and above    0.991453
dtype: float64


In [15]:

X

Unnamed: 0,Age Group_18-25,Age Group_25-45,Age Group_45-60,Age Group_60 and above
0,0,1,0,0
1,0,1,0,0
2,0,0,0,1
3,0,0,0,1
4,1,0,0,0
...,...,...,...,...
54995,0,1,0,0
54996,1,0,0,0
54997,0,0,1,0
54998,0,0,1,0


In [None]:
#Подивимось які зі зв'язків інших колонок з Gross Amount не є випадковими

In [17]:
drop_cols = ['Gross Amount', 'CID', 'TID', 'Purchase Date']

X = df_final.drop(columns=drop_cols)
y = df_final['Gross Amount'].astype(float)

In [18]:
categorical_cols = ['Gender',
                    'Product Category',
                    'Discount Availed',
                    'Discount Name',
                    'Purchase Method',
                    'Location']

In [None]:
numeric_cols = [col for col in numeric_cols if col != 'Gross Amount']
df_new = df_final[numeric_cols]
df_new.head()

In [19]:
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [20]:
bool_cols = X.select_dtypes(include='bool').columns
X[bool_cols] = X[bool_cols].astype(int)

In [21]:
X_sm = sm.add_constant(X)
model_sm = sm.OLS(y, X_sm).fit()

In [24]:
print("P-values для всіх змінних:")
significant_vars = model_sm.pvalues[model_sm.pvalues < 0.05]
significant_vars

P-values для всіх змінних:


const                        1.626091e-161
Discount Amount (INR)         0.000000e+00
Net Amount                    0.000000e+00
Age Group_18-25               2.161934e-02
Age Group_45-60               3.755546e-02
Gender_Male                   3.119110e-02
Product Category_Clothing     3.773270e-02
Discount Availed_Yes          2.988961e-59
Discount Name_No Discount    5.293314e-150
dtype: float64