In [1]:
# Impor library
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np

# Ensemble learning random forest
from sklearn.ensemble import RandomForestRegressor
#gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
#xgboost
import xgboost as xgb

In [2]:
# Membaca dataset
retail_data = pd.read_csv('dataset/train.csv')

In [3]:
print(retail_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   object 
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(3), int64(1), object(8)
memory usage: 799.2+ KB
None


In [4]:
print(retail_data.head())

   Item_Identifier   Item_Weight   Item_Fat_Content  Item_Visibility  \
0  FDA15             9.3           Low Fat                  0.016047   
1  DRC01             5.92          Regular                  0.019278   
2  FDN15             17.5          Low Fat                  0.016760   
3  FDX07             19.2          Regular                  0.000000   
4  NCD19             8.93          Low Fat                  0.000000   

    Item_Type              Item_MRP   Outlet_Identifier  \
0   Dairy                  249.8092   OUT049              
1   Soft Drinks             48.2692   OUT018              
2   Meat                   141.6180   OUT049              
3   Fruits and Vegetables  182.0950   OUT010              
4   Household               53.8614   OUT013              

   Outlet_Establishment_Year   Outlet_Size   Outlet_Location_Type  \
0                       1999   Medium        Tier 1                 
1                       2009   Medium        Tier 3                 
2    

In [5]:
# Memilih fitur yang relevan dan target
X = retail_data[['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']]
y = retail_data['Item_Outlet_Sales']

In [7]:
imputer = SimpleImputer(strategy='most_frequent')
X = imputer.fit_transform(X)

In [8]:
# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## LINEAR REGRESSION

In [9]:
# Inisialisasi model regresi linier
linear_model = LinearRegression()

In [10]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)

Shape of X_train: (6818, 4)
Shape of y_train: (6818,)


In [11]:
# Melatih model pada data latih
linear_model.fit(X_train, y_train)

ValueError: could not convert string to float: '            '

In [None]:
# Membuat prediksi menggunakan data uji
linear_predictions = linear_model.predict(X_test)

In [None]:
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_rmse = root_mean_squared_error(y_test, linear_predictions)
linear_r2 = r2_score(y_test, linear_predictions)

print('Linear Regression Mean Absolute Error:', linear_mae)
print('Linear Regression Root Mean Squared Error:', linear_rmse)
print('Linear Regression R-squared:', linear_r2)

## RANDOMFORESTREGRESSOR

In [None]:
#randomforest
random_forest_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
random_forest_model.fit(X_train, y_train)

In [None]:
rf_predictions = random_forest_model.predict(X_test)

In [None]:
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_rmse = root_mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print('Random Forest Mean Absolute Error:', rf_mae)
print('Random Forest Root Mean Squared Error:', rf_rmse)
print('Random Forest R-squared:', rf_r2)

## XGBOOST

In [None]:
#xgboost
xg_model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 3, alpha = 10, n_estimators = 100)

In [None]:
#latih model
xg_model.fit(X_train, y_train)

In [None]:
#predict
xg_predictions = xg_model.predict(X_test)

In [None]:
#performa
xgb_mae = mean_absolute_error(y_test, xg_predictions)
xgb_rmse = root_mean_squared_error(y_test, xg_predictions)
xgb_r2 = r2_score(y_test, xg_predictions)

print('XGBoost Mean Absolute Error:', xgb_mae)
print('XGBoost Root Mean Squared Error:', xgb_rmse)
print('XGBoost R-squared:', xgb_r2)

## GRADIENTBOOST

In [None]:
#gradientboost
gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

In [None]:
gradient_boosting_model.fit(X_train, y_train)

In [None]:
gb_predictions = gradient_boosting_model.predict(X_test)

In [None]:
#performa
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_rmse = root_mean_squared_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

print('Gradient Boosting Mean Absolute Error:', gb_mae)
print('Gradient Boosting Root Mean Squared Error:', gb_rmse)
print('Gradient Boosting R-squared:', gb_r2)

In [None]:
# Evaluasi model
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f'{model_name} Mean Absolute Error:', mae)
    print(f'{model_name} Root Mean Squared Error:', rmse)
    print(f'{model_name} R-squared:', r2)
    return mae, rmse, r2


## MAE, RMSE, R2

In [None]:
linear_mae, linear_rmse, linear_r2 = evaluate_model(y_test, linear_predictions, 'Linear Regression')
print('\n')
rf_mae, rf_rmse, rf_r2 = evaluate_model(y_test, rf_predictions, 'Random Forest')
print('\n')
xgb_mae, xgb_rmse, xgb_r2 = evaluate_model (y_test, xg_predictions, 'XGBoost')
print('\n')
gb_mae, gb_rmse, gb_r2 = evaluate_model(y_test, gb_predictions, 'Gradient Boosting')

## PREDIKSI

In [None]:
# Prediksi penjualan menggunakan model-model yang telah dilatih
linear_sales_prediction = linear_model.predict(X_test)
rf_sales_prediction = random_forest_model.predict(X_test)
gb_sales_prediction = gradient_boosting_model.predict(X_test)

# Output hasil prediksi
print("Prediksi Penjualan Retail:")
print("- Model Random Forest:", rf_sales_prediction)
print("- Model Gradient Boosting:", gb_sales_prediction)
print("- Model XGBoost:", xg_predictions)

# Output nilai aktual
print("\nNilai Aktual Penjualan Retail:")
print(y_test.values)

In [None]:
y_test_actual = y_test.values.tolist()
for prediksi_linear, prediksi_rf, prediksi_gb, prediksi_xg, aktual in zip(linear_predictions, rf_predictions, gb_predictions, xg_predictions, y_test_actual):
    print("Linear Regression Prediksi:", prediksi_linear, "| Random Forest Prediksi:", prediksi_rf, "| Gradient Boosting Prediksi:", prediksi_gb, "| XGBoost Prediksi:", prediksi_xg, "| Aktual:", aktual)


## PENGGABUNGAN ALGORTIMA

In [None]:
ensemble_predictions = (rf_predictions + gb_predictions + xg_predictions) / 3

In [None]:
ensemble_mae = mean_absolute_error(y_test, ensemble_predictions)
ensemble_rmse = np.sqrt(root_mean_squared_error(y_test, ensemble_predictions))
ensemble_r2 = r2_score(y_test, ensemble_predictions)

print('Ensemble Mean Absolute Error:', ensemble_mae)
print('Ensemble Root Mean Squared Error:', ensemble_rmse)
print('Ensemble R-squared:', ensemble_r2)

In [None]:
for prediksi_ensemble, aktual in zip (ensemble_predictions, y_test_actual):
    print("Ensemble : ", prediksi_ensemble, "Aktual : ",aktual)

## AKURASI

In [None]:
# Hitung nilai maksimal dari target
max_target = y_test.max()

# Menghitung akurasi untuk setiap model
linear_accuracy = 100 * (1 - linear_rmse / max_target)
rf_accuracy = 100 * (1 - rf_rmse / max_target)
gb_accuracy = 100 * (1 - gb_rmse / max_target)
xg_accuracy = 100 * (1 - xgb_rmse / max_target)
ensemble_accuracy = 100 * (1 - ensemble_rmse / max_target)

print('Akurasi Linear Regression :', linear_accuracy, '%')
print('Akurasi XGBoost : ', xg_accuracy, '%')
print('Akurasi Random Forest :', rf_accuracy, '%')
print('Akurasi Gradient Boosting : ', gb_accuracy, '%')
print('Akurasi Ensemble: ', ensemble_accuracy, '%')

In [None]:
residuals = y_test - rf_predictions
plt.scatter(y_test, residuals)
plt.xlabel('Nilai Aktual')
plt.ylabel('Residuals')
plt.title('Plot Residu')
plt.show()

## VISUALISASI

In [None]:
# Visualisasi prediksi vs nilai aktual untuk model regresi linear
plt.scatter(y_test, linear_predictions, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Nilai Aktual')
plt.ylabel('Prediksi') 
plt.title('Prediksi vs Nilai Aktual (Linear Regression)')
plt.show()

In [None]:
# Visualisasi prediksi vs nilai aktual untuk model Random Forest
plt.scatter(y_test, rf_predictions, color='green')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Nilai Aktual')
plt.ylabel('Prediksi')
plt.title('Prediksi vs Nilai Aktual (Random Forest)')
plt.show()

In [None]:
#xgboost
plt.scatter(y_test, xg_predictions, color='purple')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Nilai Aktual')
plt.ylabel('Prediksi')
plt.title('Prediksi vs Nilai Aktual (XGBoost)')
plt.show()

In [None]:
# Visualisasi prediksi vs nilai aktual untuk model Gradient Boosting
plt.scatter(y_test, gb_predictions, color='red')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Nilai Aktual')
plt.ylabel('Prediksi')
plt.title('Prediksi vs Nilai Aktual (Gradient Boosting)')
plt.show()

In [None]:
plt.scatter(y_test, ensemble_predictions, color='cyan')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.xlabel('Nilai Aktual')
plt.ylabel('Prediksi')
plt.title('Prediksi vs Nilai Aktual (Ensemble)')
plt.show()