# XGBoost Model - Semiconductor Capacity Utilization
**INDENG 142A Project**

## Setup

In [None]:
# install required packages
# %pip (unlike !pip3) installs into the environment of the running kernel,
# so the packages are importable by the cells below
# NOTE(review): --break-system-packages overrides pip's externally-managed-environment
# guard; prefer a virtual environment and drop the flag if possible
%pip install pandas numpy xgboost scikit-learn matplotlib requests -q --break-system-packages

In [None]:
# import libraries for data handling, modeling, and visualization
import pandas as pd
import numpy as np
import requests
import xgboost as xgb
import matplotlib.pyplot as plt
from io import StringIO
from datetime import datetime
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error

## 1. Load Data from FRED

In [None]:
# fetch economic data from federal reserve (fred)
# caputl = semiconductor capacity utilization (target variable)
# ipg = semiconductor production index
# daup = us auto production
series = {'CAPUTLG3344S': 'CAPUTL', 'IPG3344S': 'IPG', 'DAUPSA': 'DAUP'}
data = {}

# compute the end date once so every series is requested over the same window
end_date = datetime.today().strftime('%Y-%m-%d')

for sid, name in series.items():
    url = f"https://fred.stlouisfed.org/graph/fredgraph.csv?id={sid}&cosd=2000-01-01&coed={end_date}"
    # the timeout keeps the notebook from hanging on a stalled connection;
    # raise_for_status fails here on an HTTP error instead of later, when an
    # error page would be parsed as if it were CSV
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    r = pd.read_csv(StringIO(response.text), parse_dates=['observation_date'], index_col='observation_date')
    # '.' entries (presumably the missing-value marker in the download) and any
    # other non-numeric text become NaN
    data[name] = pd.to_numeric(r[sid].replace('.', np.nan), errors='coerce')

# keep only dates on which all three series have a value
df = pd.DataFrame(data).dropna()
print(f"Loaded {len(df)} observations")
df.tail()

## 2. Feature Engineering

In [None]:
# create lag features - use past values to predict future
df['CAPUTL_lag1'] = df['CAPUTL'].shift(1)  # 1 month ago
df['CAPUTL_lag3'] = df['CAPUTL'].shift(3)  # 3 months ago
df['IPG_lag1'] = df['IPG'].shift(1)
df['IPG_lag3'] = df['IPG'].shift(3)
df['DAUP_lag3'] = df['DAUP'].shift(3)

# create rolling averages - smooth out noise
# shift(1) first so each 3-row window ends at the previous row; without it
# CAPUTL_ma3 would include the current row's CAPUTL, i.e. the target itself,
# and leak it into the features
df['CAPUTL_ma3'] = df['CAPUTL'].shift(1).rolling(3).mean()
df['IPG_ma3'] = df['IPG'].shift(1).rolling(3).mean()

# add seasonality features
df['Month'] = df.index.month
df['Quarter'] = df.index.quarter

# interaction term - captures relationship between production and auto demand
# built from the previous row's values so that, like the lags above, it only
# uses information available before the period being predicted
df['IPG_x_DAUP'] = df['IPG'].shift(1) * df['DAUP'].shift(1)

# drop rows with missing values from lagging
# NOTE(review): this cell reassigns df, so re-running it without re-running the
# load cell trims additional rows from the start each time
df = df.dropna()
df.head()

## 3. Train/Test Split

In [None]:
# the 10 predictor columns fed to the model
features = [
    'CAPUTL_lag1', 'CAPUTL_lag3',
    'IPG_lag1', 'IPG_lag3',
    'DAUP_lag3',
    'CAPUTL_ma3', 'IPG_ma3',
    'Month', 'Quarter',
    'IPG_x_DAUP',
]

# chronological split: everything through 2019 is used for training,
# everything from 2020 onwards is held out for testing
# (the hold-out period is meant to probe behaviour during covid-era disruptions)
train_rows = df.loc[:'2019']
test_rows = df.loc['2020':]

X_train = train_rows[features]
y_train = train_rows['CAPUTL']
X_test = test_rows[features]
y_test = test_rows['CAPUTL']

print(f"Train: {len(X_train)} | Test: {len(X_test)}")

## 4. Hyperparameter Tuning

In [None]:
# hyperparameter grid explored by the search
#   learning_rate - step size of each boosting update (lower = more conservative)
#   n_estimators  - number of trees in the ensemble
#   max_depth     - maximum depth of each tree
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100],
    'max_depth': [3, 5]
}

# seeded regressor so repeated runs give the same fits
base_model = xgb.XGBRegressor(random_state=42)

# time-ordered folds: each validation fold comes after its training fold,
# so no future observations are used to fit a model scored on the past
ts_cv = TimeSeriesSplit(n_splits=3)

# exhaustive search over the grid, scored by (negated) mean squared error
grid = GridSearchCV(
    base_model,
    param_grid,
    cv=ts_cv,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)

## 5. Evaluate Model

In [None]:
# predict the held-out period with the tuned model
y_pred = grid.predict(X_test)

# error metrics, both in the units of the target
#   mae  - mean absolute error
#   rmse - root mean squared error; weights large misses more heavily
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Test MAE:  {mae:.4f}")
print(f"Test RMSE: {rmse:.4f}")

## 6. Feature Importance

In [None]:
# importance scores reported by the best estimator found in the grid search,
# paired with the feature names in the order the model was trained on
importance_scores = grid.best_estimator_.feature_importances_
importance_table = pd.DataFrame({'Feature': features, 'Importance': importance_scores})

# rank from most to least important
importance = importance_table.sort_values(by='Importance', ascending=False)

importance

## 7. Visualizations

In [None]:
# create side-by-side plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# left plot: actual vs predicted over time
ax1.plot(y_test.index, y_test, label='Actual', linewidth=2)
ax1.plot(y_test.index, y_pred, '--', label='Predicted', linewidth=2)
ax1.set_title('Actual vs Predicted (Test Set)')  # titles let the saved PNG stand alone
ax1.set_xlabel('Date')
ax1.set_ylabel('Capacity Utilization (%)')
ax1.legend()
ax1.tick_params(axis='x', rotation=45)

# right plot: feature importance ranking
ax2.barh(importance['Feature'], importance['Importance'])
ax2.set_title('Feature Importance')
ax2.set_xlabel('Importance')
# barh draws the first row at the bottom; invert so the top-ranked feature is on top
ax2.invert_yaxis()

# operate on the explicit figure object rather than pyplot's implicit current figure
fig.tight_layout()
fig.savefig('xgboost_results.png', dpi=150)
plt.show()