# Predicting Product Sales

# thêm thư viện

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import joblib
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split


# đọc file .csv

In [None]:
# Load the raw sales records; every later cell reads from (and mutates) `data`.
data = pd.read_csv(filepath_or_buffer='product_sales_data_large.csv')

# step 1: chuẩn bị data

Xử lý các giá trị bị thiếu (điền bằng giá trị trung bình) và loại bỏ các dòng có doanh số không dương

In [None]:
# Impute missing numeric values with the mean of their column. Non-numeric
# columns are skipped by numeric_only=True and keep their NaNs.
data = data.fillna(data.mean(numeric_only=True))

# Keep only rows with strictly positive sales. The explicit .copy() detaches
# the result from the unfiltered frame, so the column assignments performed in
# later cells write to an independent DataFrame instead of a slice (which would
# raise SettingWithCopyWarning on pandas versions without copy-on-write).
data = data[data['sales'] > 0].copy()

Chuyển đổi các đặc trưng phân loại thành các giá trị số

In [None]:
# Label-encode the text columns so the models receive numeric inputs.
#
# 'date' is deliberately excluded: it is still an object (string) column at
# this point, and encoding it would replace every date with an arbitrary
# integer code. The later pd.to_datetime(data['date']) calls would then read
# those integers as nanoseconds since the epoch and yield meaningless dates.
categorical_cols = (
    data.select_dtypes(include=['object']).columns.drop('date', errors='ignore')
)

le = LabelEncoder()  # stays defined even when there is nothing to encode
label_encoders = {}  # column name -> fitted encoder, so codes can be inverted
for col in categorical_cols:
    # One encoder per column: a single shared instance would only remember the
    # classes of the last column it was fitted on.
    le = LabelEncoder()
    # Cast to str so missing values (NaN) and mixed-type cells become ordinary
    # categories instead of making the encoder raise on non-uniform input.
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# step 2: Kỹ thuật đặc trưng (feature engineering)

In [None]:
# Derive three extra columns from the existing ones:
#   product_popularity      = number_of_reviews * average_rating
#   customer_lifetime_value = purchase_frequency * price
#   seasonality_factor      = calendar month (1-12) parsed from 'date'
data = data.assign(
    product_popularity=data['number_of_reviews'].mul(data['average_rating']),
    customer_lifetime_value=data['purchase_frequency'].mul(data['price']),
    seasonality_factor=pd.to_datetime(data['date']).dt.month,
)

# step 3: chia tách dữ liệu

In [None]:
# Features are every column except the target ('sales') and the raw 'date';
# the target is the 'sales' column itself.
X, y = data.drop(columns=['sales', 'date']), data['sales']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Chuẩn hóa các đặc trưng

In [None]:
# Learn the per-feature mean/std on the training rows only, then apply that
# same transformation to both splits so no test-set statistics leak into
# training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# step 4 : mô hình

# Random Forest


In [None]:

# Fit a 100-tree random forest on the scaled training data (seeded for
# reproducibility) and predict sales for the held-out rows.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluation for Random Forest


In [None]:
# Score the random-forest predictions against the held-out targets with both
# metrics (squared error first, absolute error second) and report them.
rf_mse, rf_mae = (
    metric(y_test, rf_predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)
print("Random Forest - MSE:", rf_mse, "MAE:", rf_mae)

# XGBoost


In [None]:

# Fit a 100-round gradient-boosted regressor with a squared-error objective
# on the scaled training data and predict sales for the held-out rows.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
).fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# Evaluation for XGBoost


In [None]:
# Score the XGBoost predictions against the held-out targets with both
# metrics (squared error first, absolute error second) and report them.
xgb_mse, xgb_mae = (
    metric(y_test, xgb_predictions)
    for metric in (mean_squared_error, mean_absolute_error)
)
print("XGBoost - MSE:", xgb_mse, "MAE:", xgb_mae)

In [None]:
# Persist the trained XGBoost regressor.
# NOTE(review): the model is saved unconditionally; nothing here compares it
# with the random forest, so "best" is an assumption, not a measured result.
joblib.dump(xgb_model, 'xgb_model.pkl')

# The regressor was trained on standardized features, so the fitted scaler is
# part of the model: without it, inference code cannot reproduce the input
# transformation and the pickled model would receive unscaled values.
joblib.dump(scaler, 'scaler.pkl')

# Step 5: Phân tích chuỗi thời gian


In [None]:
# Parse 'date' into datetimes and promote it to the index so the frame can be
# used as a time series by the forecasting cells below.
data = data.assign(date=pd.to_datetime(data['date'])).set_index('date')

ARIMA


In [None]:

# ARIMA models a single ordered sequence of observations. The frame may hold
# several rows per timestamp (it carries user_id / product_id columns) in
# arbitrary order, so first collapse it to one total per timestamp.
# groupby(level=0) also sorts the result chronologically. When the data already
# has exactly one sorted row per date this step leaves the series unchanged.
sales_series = data['sales'].groupby(level=0).sum()

# ARIMA(p=1, d=1, q=1): one autoregressive term, first-order differencing,
# one moving-average term.
arima_model = ARIMA(sales_series, order=(1, 1, 1))
arima_results = arima_model.fit()

# Forecast the next 12 steps at whatever granularity the date index has.
forecast_arima = arima_results.forecast(steps=12)
print("ARIMA Forecast:\n", forecast_arima)

Prophet


In [None]:

# Prophet expects the timestamp column to be called 'ds' and the target 'y',
# so move the date index back into a column and rename both.
prophet_data = data.reset_index().rename(columns=dict(date='ds', sales='y'))

# Fit with default settings, extend the timeline by 12 month-end periods and
# predict over the whole (historical + future) range.
model_prophet = Prophet().fit(prophet_data)
future_dates = model_prophet.make_future_dataframe(periods=12, freq='M')
forecast_prophet = model_prophet.predict(future_dates)

# Show only the last 12 rows, i.e. the newly forecast periods.
print("Prophet Forecast:\n", forecast_prophet[['ds', 'yhat']].iloc[-12:])

# Step 6: Hệ thống đề xuất

In [None]:
# Build a (user, item, rating) table for the Surprise library.
# NOTE(review): 'average_rating' is used as the rating signal; it looks like a
# per-product average rather than an individual user's rating - confirm that
# this is the intended target for collaborative filtering.
reader = Reader(rating_scale=(1, 5))
df_surprise = data[['user_id', 'product_id', 'average_rating']].rename(
    columns={'average_rating': 'rating'}
)
data_surprise = Dataset.load_from_df(
    df_surprise[['user_id', 'product_id', 'rating']], reader
)

# Seed both the split and the SVD factor initialisation so the reported RMSE is
# reproducible, matching the random_state=42 used by the other models here.
trainset, testset = surprise_train_test_split(
    data_surprise, test_size=0.2, random_state=42
)
svd_model = SVD(random_state=42)
svd_model.fit(trainset)

# Predict a rating for every (user, item) pair in the held-out 20%.
predictions = svd_model.test(testset)

in kết quả

In [None]:
from surprise.accuracy import rmse

# Compute the root-mean-squared error of the SVD predictions. With verbose
# enabled (the library default, spelled out here) rmse() prints the score
# itself, so no separate print call is needed.
rmse(predictions, verbose=True)

# Deployment placeholder
print("Models are ready for deployment.")

source code github: https://github.com/HUyEsona/ML-project_-Predicting-Product-Sales.git