
# USD Time Series — EDA & Forecast

**Goal:** Explore and model the daily USD exchange rate to predict the next-day value.  
Dataset: `Daily.csv` (1978–2025, daily granularity; multiple currencies; we will focus on **USD**).

**Deliverables for the project:**
- EDA (quality checks, trends, seasonality, correlations)
- Feature engineering (lags, moving averages, calendar features)
- Time-aware train/test split
- Modeling (baseline + tree-based)
- Metrics (MAE, RMSE, MAPE) and forecast vs actuals
- Ideas for monitoring & drift analysis


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from pathlib import Path
import warnings
warnings.filterwarnings('ignore')


## Load & Inspect Data

In [None]:

# Location of the raw CSV; the path is relative, so it is resolved against
# the current working directory of the notebook kernel.
DATA_PATH = Path("../data/Daily.csv")

# Read (load) the data as-is; cleaning and type conversion happen in a later cell.
df_raw = pd.read_csv(DATA_PATH)
# Quick sanity check: (rows, columns) followed by a preview of the first rows.
print(df_raw.shape)
df_raw.head()


## Cleaning & Type Conversions

In [None]:

# Work on a copy so the raw frame stays available for re-running this cell.
df = df_raw.copy()

# Parse dates — dataset examples look like MM/DD/YYYY. Values that cannot be
# parsed become NaT (errors='coerce').
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is the default behaviour.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# A row without a valid date cannot be placed on the time axis, so drop it
# here instead of letting NaT leak into sorting, indexing and date features.
df = df[df['Date'].notna()].copy()

# Identify numeric columns (currencies) excluding Date
num_cols = [c for c in df.columns if c != 'Date']

# Remove thousands separators (',') then convert to float.
# regex=False makes the literal replacement explicit across pandas versions.
for c in num_cols:
    as_text = df[c].astype(str).str.replace(',', '', regex=False)
    df[c] = pd.to_numeric(as_text, errors='coerce')

# Drop duplicate dates (first occurrence in file order wins), then sort
# chronologically so positional operations such as shift() follow time.
df = df.drop_duplicates(subset=['Date']).sort_values('Date').reset_index(drop=True)

# Keep only rows with at least USD present for our target
df = df[~df['USD'].isna()].copy()

print(df.dtypes)
df.describe(include='all').T


## EDA: Missingness & Basic Statistics

In [None]:

# Fraction of missing values per column, worst column first.
na_share = df.isna().mean()
missing_summary = na_share.sort_values(ascending=False)
missing_summary


In [None]:

# Line chart of the full USD history (object-oriented matplotlib API).
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df['Date'], df['USD'])
ax.set_title('USD over time')
ax.set_xlabel('Date')
ax.set_ylabel('USD')
plt.show()


In [None]:

# Derive calendar columns from the date; they are used for grouping here.
for part in ('year', 'month', 'dayofweek'):
    df[part] = getattr(df['Date'].dt, part)

# Average USD per calendar year, as a two-column frame (year, USD).
yearly = df.groupby('year')['USD'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(yearly['year'], yearly['USD'])
ax.set_title('Yearly mean USD')
ax.set_xlabel('Year')
ax.set_ylabel('Mean USD')
plt.show()


## Seasonal Decomposition (optional)

In [None]:

# Optional step: the whole cell is best-effort. Any failure (for example
# statsmodels not being importable) is reported and the notebook continues.
try:
    # Imported lazily so the rest of the notebook does not require statsmodels.
    from statsmodels.tsa.seasonal import seasonal_decompose
    # Ensure regular frequency: reindex to one row per calendar day, which
    # inserts NaN rows for dates absent from the data.
    df_ts = df.set_index('Date').asfreq('D')
    # Forward fill gaps for decomposition (carry the last known USD forward)
    df_ts['USD'] = df_ts['USD'].ffill()
    # Additive decomposition with a 365-step period, i.e. a yearly cycle on
    # the daily grid built above.
    result = seasonal_decompose(df_ts['USD'], model='additive', period=365)
    result.plot()
    plt.show()
except Exception as e:
    print('Decomposition skipped:', e)


## Feature Engineering

In [None]:

# Build the modelling table. The target is the USD value of a row's date, so
# every feature must be computable from information available *before* that
# date. Only Date and USD are carried over: same-day values of the other
# currencies are not known when forecasting, and their NaNs would otherwise
# make the dropna() below discard rows for unrelated reasons.
work = df[['Date', 'USD']].copy()

# Lag features: USD observed `lag` rows earlier.
for lag in [1, 3, 7, 14, 30]:
    work[f'USD_lag_{lag}'] = work['USD'].shift(lag)

# Moving averages over the `win` observations that precede the row.
# shift(1) excludes the row's own USD — without it the target would be part
# of its own feature (leakage) and test metrics would be overly optimistic.
past_usd = work['USD'].shift(1)
for win in [3, 7, 14, 30]:
    work[f'USD_ma_{win}'] = past_usd.rolling(window=win, min_periods=1).mean()

# Calendar features, derived from the date itself (known in advance).
work['year'] = work['Date'].dt.year
work['month'] = work['Date'].dt.month
work['dayofweek'] = work['Date'].dt.dayofweek

# Drop early rows with NaNs after lagging
work = work.dropna().reset_index(drop=True)

# Every remaining column except the target and the date is a feature.
feature_cols = [c for c in work.columns if c not in ['USD', 'Date']]
X = work[feature_cols]
y = work['USD']

X.shape, y.shape


## Train/Test Split (Time Aware)

In [None]:

# Hold out roughly the final three years (3 * 365 days) as the test window.
last_date = work['Date'].max()
test_start = last_date - pd.Timedelta(days=365*3)

# Boolean masks over `work`; everything strictly before the cutoff is training.
train_idx = work['Date'] < test_start
test_idx  = work['Date'] >= test_start

X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

# Report the date span and row count of each partition.
train_dates = work.loc[train_idx, 'Date']
test_dates = work.loc[test_idx, 'Date']
print('Train range:', train_dates.min(), '→', train_dates.max(), len(X_train))
print('Test  range:', test_dates.min(),  '→', test_dates.max(),  len(X_test))


## Modeling: Baselines (Linear Regression, Random Forest)

In [None]:

# Ordinary least squares on the engineered features (fit() returns the estimator).
lin = LinearRegression().fit(X_train, y_train)

# Random forest with a fixed seed for reproducibility, using all CPU cores.
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Registry of fitted models keyed by display name.
models = {
    'LinearRegression': lin,
    'RandomForest': rf,
}

# Evaluate
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same length as ``y_true``.

    Returns:
        ``mean(|y_true - y_pred| / max(|y_true|, eps)) * 100`` as a float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    eps = 1e-9
    # Floor the magnitude of the denominator instead of adding eps to the
    # signed value: `y_true + eps` is exactly zero for y_true == -eps, which
    # would produce an infinite error.
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100

# Score every fitted model on the held-out test window.
results = []
for name, model in models.items():
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in later releases, so taking
    # the root explicitly works on both old and new versions.
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
    mp = mape(y_test, pred)
    results.append({'model': name, 'MAE': mae, 'RMSE': rmse, 'MAPE_%': mp})

# Best model (lowest RMSE) first.
res_df = pd.DataFrame(results).sort_values('RMSE')
res_df


## Predictions vs Actuals (Test Period)

In [None]:

# The results table is sorted by RMSE, so its first row names the best model.
best_model_name = res_df['model'].iloc[0]
best_model = models[best_model_name]
pred = best_model.predict(X_test)

# Actual values of the test window with the predictions alongside.
plot_df = work.loc[test_idx, ['Date', 'USD']].copy()
plot_df['Pred'] = pred

fig, ax = plt.subplots(figsize=(12, 4))
for column, label in (('USD', 'Actual'), ('Pred', 'Predicted')):
    ax.plot(plot_df['Date'], plot_df[column], label=label)
ax.set_title(f'USD — Actual vs Predicted ({best_model_name})')
ax.set_xlabel('Date')
ax.set_ylabel('USD')
ax.legend()
plt.show()


## One-step Ahead Forecast (Naive Demo)

In [None]:

# Take the last available row as a template for the next day's feature vector.
last_row = work.iloc[-1:].copy()
last_obs_date = last_row['Date'].iloc[0]
next_date = last_obs_date + pd.Timedelta(days=1)

# The features stored in `last_row` describe the days *before* the last
# observation, so predicting from them unchanged would re-predict the last
# observed day rather than the next one. Rebuild them for `next_date`.
# USD history up to and including the last modelled date; `df` is used because
# the lag features were computed by row position before NaN rows were dropped.
usd_history = df.loc[df['Date'] <= last_obs_date, 'USD'].to_numpy()

next_features = last_row.drop(columns=['USD', 'Date']).copy()
for col in next_features.columns:
    prefix, _, suffix = col.rpartition('_')
    if not suffix.isdigit():
        continue
    steps = int(suffix)
    if prefix == 'USD_lag':
        # Lag k of the next day is the k-th most recent observed value.
        next_features[col] = usd_history[-steps]
    elif prefix == 'USD_ma':
        # Moving average over the most recent `steps` observed values.
        next_features[col] = usd_history[-steps:].mean()

# Calendar features follow the forecast date, not the last observed date.
calendar_values = {
    'year': next_date.year,
    'month': next_date.month,
    'dayofweek': next_date.dayofweek,
}
for col, value in calendar_values.items():
    if col in next_features.columns:
        next_features[col] = value
# Any other feature column keeps its last known value (carried forward).

next_pred = best_model.predict(next_features)[0]
print('Last date:', last_obs_date.date())
print('Next date (forecast):', next_date.date())
print('Forecasted USD:', float(next_pred))


## Data Drift & Stability (Optional — Evidently)

In [None]:

# Optional step: the whole cell is best-effort. Any failure (Evidently not
# importable, report generation or saving failing) is reported and skipped.
try:
    from evidently.report import Report
    from evidently.metric_preset import DataDriftPreset

    # Reference = everything before the last ~3 years, current = the last
    # ~3 years. The cutoff is computed once so both slices use the same value.
    drift_cutoff = work['Date'].max() - pd.Timedelta(days=365*3)
    ref = work[work['Date'] < drift_cutoff].copy()
    cur = work[work['Date'] >= drift_cutoff].copy()

    # Use features only for drift; avoid target leakage
    ref_X = ref.drop(columns=['USD', 'Date'])
    cur_X = cur.drop(columns=['USD', 'Date'])

    report = Report(metrics=[DataDriftPreset()])
    report.run(reference_data=ref_X, current_data=cur_X)
    report_path = '/mnt/data/evidently_drift_report.html'
    # Create the target directory if needed so saving does not fail merely
    # because the folder is missing on this machine.
    Path(report_path).parent.mkdir(parents=True, exist_ok=True)
    report.save_html(report_path)
    print('Drift report saved to:', report_path)
except Exception as e:
    print('Evidently not installed or failed:', e)



## Conclusions & Next Steps
- We explored long-term trends, seasonality and calendar effects for USD.
- Baseline models (Linear Regression, Random Forest) provide a benchmark.
- Best model (by RMSE) is plotted against the test period.
- Next steps:
  - Hyperparameter tuning (RandomizedSearchCV/GridSearchCV)
  - Try gradient boosting (XGBoost/CatBoost) or Prophet/LSTM
  - Add exogenous features (other currencies, macro indicators)
  - Integrate Evidently into a monitoring pipeline
