In [None]:
!pip install pandas numpy matplotlib sqlalchemy pymysql pmdarima scikit-learn

In [1]:
import os
import warnings
from math import sqrt
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pmdarima import auto_arima
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sqlalchemy import create_engine

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

print("Starting production prediction model...")

In [None]:
# Data Ingestion
# Connection settings are read from environment variables so real credentials
# never have to be saved in the notebook. The fallbacks are the local
# development values this cell previously hard-coded.
DB_USER = os.environ.get('DB_USER', 'user')
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'password')
DB_HOST = os.environ.get('DB_HOST', '127.0.0.1')  # original note: MySQL service name in docker-compose
DB_NAME = os.environ.get('DB_NAME', 'coal_mining')

# Create the SQLAlchemy connection and load the daily production series
try:
    # Percent-encode the credentials: characters such as '@', ':' or '/' in a
    # user name or password would otherwise be parsed as URL delimiters.
    engine = create_engine(
        f'mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}/{DB_NAME}'
    )
    query = "SELECT cast(date as date) as date, total_tons_mined_daily FROM daily_production_metrics WHERE date<'2025-07-01' ORDER BY date ASC;"
    df = pd.read_sql(query, engine)
    print("Data successfully loaded from MySQL.")
except Exception as e:
    print(f"Error connecting to DB or loading data: {e}")
    # Re-raise instead of calling exit(): exit() ends the interpreter/kernel
    # session and discards the traceback, while raising stops "Run All" at
    # this cell with the full error visible.
    raise

In [None]:
# Data Preprocessing
# Turn the 'date' column into a DatetimeIndex. The guard makes the cell safe to
# re-run: after the first run 'date' is the index and no longer a column, so
# repeating the conversion would raise KeyError.
if df.index.name != 'date':
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')

# Check whether the target column contains null values
if df['total_tons_mined_daily'].isnull().any():
    print("Warning: Missing values found in 'total_tons_mined_daily'. Filling with forward fill.")
    # Assign the filled column back instead of calling fillna(..., inplace=True)
    # on df[...]: the in-place call acts on an intermediate object and may leave
    # df unchanged (pandas Copy-on-Write), and fillna(method=...) is deprecated.
    # A null in the very first row has no previous value and stays null.
    df['total_tons_mined_daily'] = df['total_tons_mined_daily'].ffill()  # fill with previous value


print(f"Data shape after preprocessing: {df.shape}")
df.head()

In [None]:
# EDA: line chart of the daily production series over time
trend_fig, trend_ax = plt.subplots(figsize=(12, 6))
trend_ax.plot(df['total_tons_mined_daily'])
trend_ax.set_title('Daily Production Trends')
trend_ax.set_xlabel('Date')
trend_ax.set_ylabel('Total Tons Mined')
trend_ax.grid(True)
plt.show()

In [None]:
# ARIMA model training
# Positional split in the existing row order: the first 80% of rows are used
# for training and the remaining 20% for testing.
n_rows = len(df)
train_size = int(n_rows * 0.8)
train_data = df.iloc[:train_size]
test_data = df.iloc[train_size:]

print(f"\nTraining data size: {len(train_data)} rows")
print(f"Testing data size: {len(test_data)} rows")

In [None]:
# Options for the auto_arima order search on the training series
arima_search_options = dict(
    start_p=1,
    start_q=1,
    test='adf',              # stationarity test: ADF
    max_p=5,                 # upper bound for p
    max_q=5,                 # upper bound for q
    m=1,                     # seasonal period (m=1 if non-seasonal, m=7 for daily data with a weekly pattern)
    d=None,                  # let auto_arima determine 'd'
    seasonal=False,          # set to True if there is seasonality, then adjust 'm'
    trace=True,              # print the fitting process
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,           # use the stepwise algorithm
)
model = auto_arima(train_data['total_tons_mined_daily'], **arima_search_options)

print("\nBest ARIMA Model Parameters:")
print(model.summary())

In [None]:
# Evaluation
# Forecast as many periods as there are test rows and compare with the actuals
forecast_steps = len(test_data)
predictions = model.predict(n_periods=forecast_steps)
# Strip any index before re-labelling with the test dates. Depending on the
# pmdarima version, predict() may return a pandas Series carrying its own
# index; pd.Series(series, index=...) would then re-align by label and yield
# NaN wherever the labels differ from the test dates.
predictions_series = pd.Series(np.asarray(predictions), index=test_data.index)

# Evaluation metrics (same unit as the target column)
rmse = sqrt(mean_squared_error(test_data['total_tons_mined_daily'], predictions_series))
mae = mean_absolute_error(test_data['total_tons_mined_daily'], predictions_series)

print(f"\nModel Evaluation on Test Data:")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")

In [None]:
# Prediction Visualization: training history, held-out actuals and the forecast on one chart
forecast_fig, forecast_ax = plt.subplots(figsize=(14, 7))
forecast_ax.plot(train_data['total_tons_mined_daily'], label='Training Data')
forecast_ax.plot(test_data['total_tons_mined_daily'], label='Actual Test Data', color='orange')
forecast_ax.plot(predictions_series, label='Predicted Test Data', color='green', linestyle='--')
forecast_ax.set_title('Daily Production Prediction vs Actual (Test Set)')
forecast_ax.set_xlabel('Date')
forecast_ax.set_ylabel('Total Tons Mined')
forecast_ax.legend()
forecast_ax.grid(True)
plt.show()

In [None]:
# Next Day Production
# Run the order search again on the full series (train + test rows), this time
# without printing the search trace.
final_search_options = dict(
    start_p=1,
    start_q=1,
    test='adf',
    max_p=5,
    max_q=5,
    m=1,
    d=None,
    seasonal=False,
    trace=False,
    error_action='ignore',
    suppress_warnings=True,
    stepwise=True,
)
final_model = auto_arima(df['total_tons_mined_daily'], **final_search_options)

In [None]:
# One-step-ahead forecast from the model fitted on the full series
next_day_forecast = final_model.predict(n_periods=1)
# Take the single forecast by position. Depending on the pmdarima version,
# predict() may return a pandas Series whose index does not start at 0, in
# which case next_day_forecast[0] is a label lookup and can fail.
next_day_value = float(np.asarray(next_day_forecast)[0])

# The forecast is labelled as the calendar day after the last date in the data
last_date_in_data = df.index.max()
next_day_date = last_date_in_data + pd.Timedelta(days=1)

print(f"\nPredicted production for {next_day_date.strftime('%Y-%m-%d')}: {next_day_value:.2f} tons")

print("\nProduction prediction model finished.")