# Reference Notebook: Linear Regression (OLS, Ridge, Lasso)

This notebook provides a clean, structured implementation of linear models for shipping cost prediction.


In [None]:
## 1. Imports and Setup
import pandas as pd
import numpy as np
import glob, os, itertools
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

def adjusted_r2(r2, n, p):
    """Return R^2 adjusted for the number of predictors.

    Args:
        r2: Unadjusted R^2 value.
        n: Number of observations.
        p: Number of predictors.

    Returns:
        ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.
    """
    unexplained = 1 - r2
    total_dof = n - 1
    residual_dof = n - p - 1
    # Multiply before dividing, exactly as the closed-form expression reads.
    return 1 - unexplained * total_dof / residual_dof

In [None]:
## 2. Load Data Files

# Collect the train and test CSVs that sit in the current working directory.
# Both lists are sorted so files are always processed in the same order.
train_files, test_files = (
    sorted(glob.glob(pattern)) for pattern in ('*_train.csv', '*_test.csv')
)

def get_test_file(train_fp, candidates=None):
    """Return the test CSV that pairs with the train CSV *train_fp*.

    The ship method is the train file's base name with its ``_train.csv``
    suffix removed. A candidate matches only when its base name is exactly
    ``<ship>_test.csv``; a plain substring test would let ``air`` pair with
    ``air_express_test.csv``.

    Args:
        train_fp: Path to a ``<ship>_train.csv`` file.
        candidates: Optional iterable of test-file paths to search. When
            omitted, the module-level ``test_files`` list is used.

    Returns:
        The first matching path from the candidates, or ``None`` when
        *train_fp* does not end in ``_train.csv`` or has no counterpart.
    """
    suffix = '_train.csv'
    train_name = os.path.basename(train_fp)
    if not train_name.endswith(suffix):
        return None

    # Strip only the trailing suffix; str.replace would also remove the
    # marker if it happened to occur earlier in the file name.
    ship = train_name[:-len(suffix)]
    wanted = ship + '_test.csv'

    if candidates is None:
        candidates = test_files
    return next((f for f in candidates if os.path.basename(f) == wanted), None)

In [None]:
## 3. Baseline OLS Model

# Use only `log_weight` as the single predictor for a simple baseline.

# Fit a single-feature OLS baseline per ship method and score it on the
# paired test file. One summary row per ship method is appended to `results`.
results = []
for train_fp in train_files:
    test_fp = get_test_file(train_fp)
    if not test_fp:
        # No paired test file for this train file: nothing to score against.
        continue

    # Take the ship method from the full prefix before '_train.csv', which is
    # how get_test_file identifies it. Splitting on the first '_' would
    # truncate names that contain underscores and merge distinct methods.
    ship = os.path.basename(train_fp).replace('_train.csv', '')
    df_tr = pd.read_csv(train_fp)
    df_te = pd.read_csv(test_fp)

    X_tr = df_tr[['log_weight']]
    y_tr = df_tr['log_cost']
    X_te = df_te[['log_weight']]
    y_te = df_te['log_cost']

    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler().fit(X_tr)
    X_tr_s, X_te_s = scaler.transform(X_tr), scaler.transform(X_te)

    model = LinearRegression().fit(X_tr_s, y_tr)
    y_pred = model.predict(X_te_s)
    mse = mean_squared_error(y_te, y_pred)
    r2 = r2_score(y_te, y_pred)

    # The metrics are computed on the *_test.csv file; the 'val_' key names
    # are kept because they become the column names of the results table.
    results.append({
        'ship_method': ship,
        'model': 'OLS',
        'features': 'log_weight',
        'val_MSE': round(mse, 4),
        'val_R2': round(r2, 4),
    })

# Notebook display: the summary table ordered by ship method.
pd.DataFrame(results).sort_values(['ship_method'])

In [None]:
## 4. Ridge & Lasso with GridSearchCV

# Regularisation strengths searched for both penalised models.
param_grid = {'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# The scaler is a pipeline step, so the grid search fits it together with the
# model rather than on the full data up front.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', Ridge()),
])
gs_ridge = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Demonstrate the search on the first ship method's training file only.
df_example = pd.read_csv(train_files[0])
X = df_example[['log_weight', 'log_volume']]
y = df_example['log_cost']
gs_ridge.fit(X, y)
# best_score_ is a negated MSE, so flip the sign before reporting it.
print("Best Ridge alpha:", gs_ridge.best_params_, "| CV MSE:", -gs_ridge.best_score_)

# Same grid and data with a Lasso model; max_iter is raised to 10000.
gs_lasso = GridSearchCV(
    estimator=Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('model', Lasso(max_iter=10000)),
    ]),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
gs_lasso.fit(X, y)
print("Best Lasso alpha:", gs_lasso.best_params_, "| CV MSE:", -gs_lasso.best_score_)


In [None]:
## 5. Feature Combination Evaluation

# Evaluate every subset of the continuous features that contains `log_weight`:

# Score an OLS fit for every feature set made of `log_weight` plus any subset
# of the remaining continuous (log_*) columns of the first training file.
df = pd.read_csv(train_files[0])
cont_feats = [c for c in df.columns if c.startswith('log_') and c != 'log_cost']
base = ['log_weight']

# Select the optional features by name. Slicing with cont_feats[1:] is only
# correct when `log_weight` is the first log_* column; with any other column
# order it would duplicate `log_weight` and drop a different feature.
optional_feats = [c for c in cont_feats if c not in base]
combos = [
    base + list(combo)
    for k in range(len(optional_feats) + 1)
    for combo in itertools.combinations(optional_feats, k)
]

# NOTE: each model is fitted and scored on the same rows, so these are
# in-sample metrics and cannot get worse as features are added.
out = []
for feats in combos:
    X = df[feats]
    y = df['log_cost']
    scaler = StandardScaler().fit(X)
    X_s = scaler.transform(X)
    model = LinearRegression().fit(X_s, y)
    pred = model.predict(X_s)
    out.append({
        'features': feats,
        'MSE': mean_squared_error(y, pred),
        'R2': r2_score(y, pred),
    })

# Notebook display: the five feature sets with the lowest MSE.
pd.DataFrame(out).sort_values('MSE').head()


In [None]:
## 6. Incorporate Categorical Encoding

# Column roles and settings used by the categorical encoders:
#   ohe_cols  -> one-hot encoded columns
#   freq_cols -> columns replaced by their category counts
#   te_col    -> column that gets a smoothed mean-of-log_cost encoding
#   k_smooth  -> weight given to the global mean in that smoothing
te_col = 'vendor_name'
k_smooth = 10
freq_cols = ['from_state', 'to_state']
ohe_cols = ['across_state']

# Build encoders
def build_encoders(df):
    """Fit the categorical encoders on a training frame.

    Args:
        df: DataFrame containing the columns named in ``ohe_cols``,
            ``freq_cols`` and ``te_col``, plus the ``log_cost`` target.

    Returns:
        A 4-tuple ``(ohe, freq_maps, te_map, global_mean)`` where

        * ``ohe`` is a ``OneHotEncoder`` (first level dropped, dense output)
          fitted on ``df[ohe_cols]``;
        * ``freq_maps`` maps each column in ``freq_cols`` to a
          ``{category: count}`` dict;
        * ``te_map`` maps each ``te_col`` category to its smoothed mean
          ``log_cost``:
          ``(count * mean + k_smooth * global_mean) / (count + k_smooth)``;
        * ``global_mean`` is the mean of ``df['log_cost']``.
    """
    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old keyword. Try the current name first, then fall back so older
    # installations keep working.
    try:
        ohe = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(drop='first', sparse=False)
    ohe.fit(df[ohe_cols])

    freq_maps = {col: df[col].value_counts().to_dict() for col in freq_cols}

    global_mean = df['log_cost'].mean()
    agg = df.groupby(te_col)['log_cost'].agg(['mean', 'count'])
    # Shrink each category mean toward the global mean; categories with few
    # rows are pulled harder because k_smooth dominates their denominator.
    smoothed = (
        (agg['count'] * agg['mean'] + k_smooth * global_mean)
        / (agg['count'] + k_smooth)
    )
    te_map = smoothed.to_dict()
    return ohe, freq_maps, te_map, global_mean

# Apply encoding
def encode_df(df, ohe, freq_maps, te_map, global_mean):
    """Append the encoded categorical features to *df*.

    Args:
        df: DataFrame containing the columns named in ``ohe_cols``,
            ``freq_cols`` and ``te_col``.
        ohe: Fitted encoder exposing ``transform`` and
            ``get_feature_names_out``.
        freq_maps: ``{column: {category: count}}`` for each column in
            ``freq_cols``.
        te_map: ``{category: encoded value}`` for ``te_col``.
        global_mean: Fallback value for ``te_col`` categories missing from
            ``te_map``.

    Returns:
        A new DataFrame: the original columns, then the one-hot columns, one
        ``<col>_FE`` column per frequency-encoded column, and ``vendor_TE``.
        The input frame is not modified.
    """
    ohe_df = pd.DataFrame(
        ohe.transform(df[ohe_cols]),
        columns=ohe.get_feature_names_out(ohe_cols),
        index=df.index,
    )

    # A category absent from the frequency map has a fitted count of zero.
    # Without the fill it would become NaN, which the linear models downstream
    # cannot accept.
    fe_df = pd.DataFrame(
        {f + '_FE': df[f].map(freq_maps[f]).fillna(0) for f in freq_cols},
        index=df.index,
    )

    # Categories missing from te_map fall back to the global mean.
    te_df = df[te_col].map(lambda x: te_map.get(x, global_mean)).rename('vendor_TE')
    return pd.concat([df, ohe_df, fe_df, te_df], axis=1)


In [None]:
## 7. Out-of-Time Validation

# Use the `after` dataset for a time-based stability check.

# Read the `after` extract from the working directory.
df_after = pd.read_csv(filepath_or_buffer='Data_v8_after.csv')
# Placeholder: apply the chosen model and its fitted encoders to df_after.
# ... (implementation as needed)


In [None]:
## 8. Save Final Results

# Write the accumulated `results` rows to a CSV file, without the index column.
final_df = pd.DataFrame(data=results)
final_df.to_csv(path_or_buf='linear_models_reference_results.csv', index=False)
print("Results saved to linear_models_reference_results.csv")