In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Location of the processed e-commerce dataset, relative to this notebook.
file_path = './../data/processed/Ecommerce_final.csv'

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Relative frequency (proportion of rows) of each age group.
(
    df['Age Group']
    .value_counts(normalize=True)
)

In [None]:
# Age brackets from youngest to oldest. The first entry ('under 18') is the
# baseline level that drop_first=True removes from the dummy columns.
age_group_order = ['under 18', '18-25', '25-45', '45-60', '60 and above']

# pd.Categorical turns every label that is missing from `categories` into NaN,
# and get_dummies then encodes NaN as an all-zero row -- the same encoding as
# the dropped baseline. Fail loudly instead of silently merging such rows into
# the baseline group. Values that are already missing are left untouched.
unexpected_labels = set(df['Age Group'].dropna().unique()) - set(age_group_order)
if unexpected_labels:
    raise ValueError(
        f"Unexpected 'Age Group' labels: {sorted(map(str, unexpected_labels))}"
    )

# Store the column as an ordered categorical so the dummy columns follow the
# bracket order rather than alphabetical order.
df['Age Group'] = pd.Categorical(
    df['Age Group'],
    categories=age_group_order,
    ordered=True,
)

# One indicator column per age group except the baseline.
df_final = pd.get_dummies(df, columns=["Age Group"], drop_first=True)

In [None]:
# Preview the first five rows of the dummy-encoded frame.
df_final.head(n=5)

In [None]:
# Feature columns: every column of df_final whose name contains "Age Group_".
X_cols = list(filter(lambda name: "Age Group_" in name, df_final.columns))

# Design matrix (age-group dummies) and target (gross amount).
X = df_final.loc[:, X_cols]
y = df.loc[:, 'Gross Amount']

In [None]:
# Fit a linear regression of y on X; fit() returns the fitted estimator itself.
model_raw = LinearRegression().fit(X, y)

# In-sample predictions on the same rows the model was fitted on.
y_pred_raw = model_raw.predict(X)


In [None]:
# Fitted intercept of the age-group regression.
print("Intercept (середня витрата для <18):", model_raw.intercept_)

# Label each fitted coefficient with the name of its dummy column.
coef = pd.Series(data=model_raw.coef_, index=X.columns)

# Heading and coefficients, each on its own line.
print("\nКоефіцієнти для кожної вікової групи (відносно <18):", coef, sep="\n")

In [None]:
# In-sample fit quality: mean squared error and coefficient of determination.
mse, r2 = (
    mean_squared_error(y_true=y, y_pred=y_pred_raw),
    r2_score(y_true=y, y_pred=y_pred_raw),
)

print("\nMSE:", mse)
print("R²:", r2)

In [None]:
import statsmodels.api as sm

In [None]:
# Cast the age-group dummy columns to integers before passing them to statsmodels.
X = df_final.loc[:, X_cols].astype(int)

# Add a constant column so that OLS estimates an intercept.
X_sm = sm.add_constant(X)

# Fit ordinary least squares of y on the constant plus the dummies.
model_sm = sm.OLS(endog=y, exog=X_sm).fit()

# Report only the p-values of the fitted coefficients.
print("P-values для вікових груп (відносно <18):", model_sm.pvalues, sep="\n")

In [None]:

# Display X, the integer-cast age-group dummy columns used for the OLS fit.
X

In [None]:
# Check which relationships between the other columns and Gross Amount are statistically significant (i.e. not due to chance)

In [None]:
# Columns excluded from the feature matrix: the target itself plus the
# 'CID', 'TID' and 'Purchase Date' columns.
drop_cols = [
    'Gross Amount',
    'CID',
    'TID',
    'Purchase Date',
]

# Features are everything that remains; the target is cast to float.
X = df_final.drop(drop_cols, axis='columns')
y = df_final.loc[:, 'Gross Amount'].astype(float)

In [None]:
# Columns that are dummy-encoded before the regression.
categorical_cols = [
    'Gender',
    'Product Category',
    'Discount Availed',
    'Discount Name',
    'Purchase Method',
    'Location',
]

In [None]:
# Derive the numeric feature columns directly from df_final so the cell runs on
# a fresh kernel (no earlier cell defines `numeric_cols`). The target column
# 'Gross Amount' is excluded.
# NOTE(review): assumes "numeric columns" means every number-typed column of
# df_final -- confirm this matches the intended selection.
numeric_cols = [
    col
    for col in df_final.select_dtypes(include='number').columns
    if col != 'Gross Amount'
]

# Numeric-only view of the data, previewed below.
df_new = df_final[numeric_cols]
df_new.head()

In [None]:
# Replace each categorical column with indicator columns, dropping the first
# level of every variable.
X = pd.get_dummies(
    data=X,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Names of all boolean-typed columns in the design matrix.
bool_cols = X.select_dtypes(include=['bool']).columns

# Convert those columns from True/False to 1/0.
X[bool_cols] = X[bool_cols].astype(int)

In [None]:
# Add a constant column so that OLS estimates an intercept.
X_sm = sm.add_constant(X)

# Fit ordinary least squares of y on the full design matrix.
model_sm = sm.OLS(endog=y, exog=X_sm).fit()

In [None]:
# Keep only the coefficients whose p-value is below the 0.05 threshold.
# The printed label states the filter, because the output is a subset of the
# p-values rather than the full list.
print("Змінні зі статистично значущими p-values (p < 0.05):")
significant_vars = model_sm.pvalues[model_sm.pvalues < 0.05]
significant_vars