In [1]:
# =========================
# ONE-CELL FULL PIPELINE
# load -> preprocess -> train -> evaluate -> save -> predict
# =========================

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

# -------- 1) PATHS (EDIT THIS) --------
# Directory that holds the dataset (the full CSV path is built from it below).
FOLDER = r"C:\Users\Tharindu\Desktop\New folder (6)"

# Name of the dataset file inside FOLDER.
# Alternative dataset: "sri_lanka_economy_2001_2023_4features.csv"
DATASET_FILE = "sri_lanka_economy_2003_2019_engineered.csv"

file_path = os.path.join(FOLDER, DATASET_FILE)
print("Reading:", file_path)

# -------- 2) LOAD --------
# Read the raw CSV and report its dimensions and column names.
df = pd.read_csv(file_path)
for _label, _value in (
    ("Loaded shape:", df.shape),
    ("Columns:", df.columns.tolist()),
):
    print(_label, _value)

# -------- 3) QUICK EDA SUMMARY --------
# Count fully duplicated rows and missing cells per column on the raw frame.
print("\n--- EDA ---")
for _label, _value in (
    ("Duplicates:", df.duplicated().sum()),
    ("Missing values:\n", df.isnull().sum()),
):
    print(_label, _value)

# -------- 4) PREPROCESS --------
# Work on a de-duplicated copy so the raw frame `df` stays untouched.
data = df.copy()
data = data.drop_duplicates()

# numeric conversion (everything except "Year"); unparseable cells become NaN.
# This runs BEFORE the single-value check so that the check sees the coerced
# values: a column made only of non-numeric text turns into all-NaN here and
# must be dropped, otherwise its median is NaN and the NaNs survive imputation.
for c in data.columns:
    if c != "Year":
        data[c] = pd.to_numeric(data[c], errors="coerce")

# drop single-value columns (nunique(dropna=True) is 0 for all-NaN columns,
# so those are removed as well)
single_value_cols = [c for c in data.columns if data[c].nunique(dropna=True) <= 1]
if single_value_cols:
    print("Dropping single-value columns:", single_value_cols)
    data = data.drop(columns=single_value_cols)

# impute numeric missing with median
# NOTE(review): the medians are computed over every row and every numeric
# column, so the target column is imputed too and rows that later end up in
# the test split contribute to the fill values — confirm this is acceptable.
num_cols = data.select_dtypes(include=[np.number]).columns.tolist()
data[num_cols] = data[num_cols].fillna(data[num_cols].median())

print("\nMissing after preprocessing:\n", data.isnull().sum())

# -------- 5) TARGET + FEATURES --------
# The column to predict; every remaining column except "Year" is a feature.
target = "GDP_Growth"
_available = list(data.columns)
if target not in _available:
    raise ValueError(f"Target '{target}' not found. Available columns: {_available}")

_excluded = ("Year", target)
features = [col for col in _available if col not in _excluded]
if not features:
    raise ValueError("No feature columns found!")

print("\nTarget:", target)
print("Features:", features)

# Design matrix and target vector used by the split below.
X, y = data[features], data[target]

# -------- 6) TRAIN/TEST SPLIT --------
# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
# NOTE(review): train_test_split shuffles rows by default. If the rows are
# consecutive years with lagged features, confirm a random hold-out is intended.
_split_kwargs = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_kwargs)
for _label, _X_part, _y_part in (
    ("\nTrain shape:", X_train, y_train),
    ("Test shape :", X_test, y_test),
):
    print(_label, _X_part.shape, _y_part.shape)

# -------- 7) SCALING (for LR) --------
# The scaler is fitted on the training rows only and then reused on the test rows.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

# -------- 8) TRAIN MODELS --------
# Linear regression is trained on the scaled features, the forest on the raw ones.
lr_model = LinearRegression().fit(X_train_scaled, y_train)
rf_model = RandomForestRegressor(random_state=42, n_estimators=400).fit(X_train, y_train)

print("\n✅ Training completed")

# -------- 9) EVALUATION --------
def evaluate(y_true, y_pred, name):
    """Print regression metrics for one model and return them.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, in the same order as ``y_true``.
        name: Label shown in the printed section header.

    Returns:
        dict: The computed values under the keys ``"MAE"``, ``"MSE"``,
        ``"RMSE"`` and ``"R2"``, so callers can compare models in code
        instead of reading the printed report. Existing callers that ignore
        the return value are unaffected.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # RMSE is derived from MSE so both values always agree.
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {name} ---")
    print("MAE :", mae)
    print("MSE :", mse)
    print("RMSE:", rmse)
    print("R²  :", r2)
    return {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

# Predict on the held-out rows (scaled inputs for the linear model, raw inputs
# for the forest) and report each model's metrics in turn.
_test_predictions = {
    "Linear Regression": lr_model.predict(X_test_scaled),
    "Random Forest": rf_model.predict(X_test),
}
y_pred_lr = _test_predictions["Linear Regression"]
y_pred_rf = _test_predictions["Random Forest"]

for _model_label, _predicted in _test_predictions.items():
    evaluate(y_test, _predicted, _model_label)

# -------- 10) SAVE (PKL) --------
# Destination of each artifact inside FOLDER.
lr_path = os.path.join(FOLDER, "linear_regression_model.pkl")
rf_path = os.path.join(FOLDER, "random_forest_model.pkl")
scaler_path = os.path.join(FOLDER, "scaler.pkl")
features_path = os.path.join(FOLDER, "trained_features.pkl")

# Persist both models, the fitted scaler and the ordered feature list.
_artifacts = (
    (lr_model, lr_path),
    (rf_model, rf_path),
    (scaler, scaler_path),
    (features, features_path),
)
for _obj, _destination in _artifacts:
    joblib.dump(_obj, _destination)

print("\n✅ Saved files to:", FOLDER)
for _obj, _destination in _artifacts:
    print(" -", _destination)

# -------- 11) SAMPLE NEW PREDICTION (Scenario) --------
# You can edit these assumptions any time

# Name fragments that mark a feature as derived from another column (a delta
# or percentage change). They are underscore-anchored on purpose: the bare
# word "change" is a substring of "exchange" and would match level columns.
_DERIVED_MARKERS = ("_change", "_diff", "_pct")

# Most recent observed target value, used for lag features. Prefer the row
# with the greatest Year; fall back to the last row in file order when Year
# is absent or not fully numeric.
_years = pd.to_numeric(data["Year"], errors="coerce") if "Year" in data.columns else None
if _years is not None and _years.notna().all():
    _last_target = data.loc[_years.idxmax(), target]
else:
    _last_target = data[target].iloc[-1]

new_input = {}
for f in features:
    f_low = f.lower()
    if any(marker in f_low for marker in _DERIVED_MARKERS):
        # Derived features must not receive a level assumption (e.g. 350 is a
        # plausible exchange-rate level but not a plausible percentage
        # change), so use the historical median as a neutral value.
        new_input[f] = X[f].median()
    elif "inflation" in f_low:
        new_input[f] = 6.5
    elif "exchange" in f_low:
        new_input[f] = 350
    elif "unemployment" in f_low:
        new_input[f] = 5.0
    elif "lending" in f_low or "interest" in f_low:
        new_input[f] = 7.5
    elif "lag" in f_low:
        new_input[f] = _last_target
    else:
        # No explicit assumption for this feature: use its historical median.
        new_input[f] = X[f].median()

# Single-row frame with columns in the exact order used during training.
new_df = pd.DataFrame([new_input])[features]

# The linear model expects scaled inputs; the forest was trained on raw values.
pred_lr = lr_model.predict(scaler.transform(new_df))[0]
pred_rf = rf_model.predict(new_df)[0]

print("\n--- Example Scenario Prediction ---")
print("Input used:\n", new_df)
print("Predicted GDP Growth (Linear Regression):", round(pred_lr, 2), "%")
print("Predicted GDP Growth (Random Forest):", round(pred_rf, 2), "%")


Reading: C:\Users\Tharindu\Desktop\New folder (6)\sri_lanka_economy_2003_2019_engineered.csv
Loaded shape: (17, 11)
Columns: ['Year', 'GDP_Growth', 'Inflation_Deflator', 'ExchangeRate', 'Unemployment', 'LendingRate', 'ExchangeRate_pct_change', 'Inflation_change', 'Unemployment_change', 'Inflation_ma3', 'GDP_Growth_lag1']

--- EDA ---
Duplicates: 0
Missing values:
 Year                       0
GDP_Growth                 0
Inflation_Deflator         0
ExchangeRate               0
Unemployment               0
LendingRate                0
ExchangeRate_pct_change    0
Inflation_change           0
Unemployment_change        0
Inflation_ma3              0
GDP_Growth_lag1            0
dtype: int64

Missing after preprocessing:
 Year                       0
GDP_Growth                 0
Inflation_Deflator         0
ExchangeRate               0
Unemployment               0
LendingRate                0
ExchangeRate_pct_change    0
Inflation_change           0
Unemployment_change        0
Inflation