In [2]:
!pip -q install pandas numpy scikit-learn

In [3]:
import pandas as pd, numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

In [4]:
# Raw CSV of the car fuel-efficiency dataset; this is the path handed to pd.read_csv when the data is loaded.
URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"

In [12]:
# Columns used in this notebook: four predictors followed by the mpg target.
cols = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]

# Read the full CSV, then keep an independent copy restricted to `cols`
# (in the order listed above).
df = pd.read_csv(URL).loc[:, cols].copy()

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values; forwarded together with ``y_true`` to
        ``sklearn.metrics.mean_squared_error``.

    Returns
    -------
    numpy float
        ``np.sqrt`` of the value returned by ``mean_squared_error``.
    """
    # mean_squared_error is already imported in the notebook's import cell, so
    # it is not re-imported here on every call.
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [6]:
# Q1: which column has missing values?
# Count NaNs per column, keep only the columns that have any, and report the
# one with the most gaps (None when nothing is missing).
na_counts = df.isna().sum()
cols_with_na = na_counts[na_counts > 0]
if cols_with_na.empty:
    missing_col = None
else:
    missing_col = cols_with_na.idxmax()
print("Q1 (missing column):", missing_col)

Q1 (missing column): horsepower


In [7]:
# Q2: median horsepower
# Series.median() skips NaN by default, so the missing horsepower rows are ignored.
median_hp = df['horsepower'].median()
print("Q2 (median horsepower):", median_hp)

Q2 (median horsepower): 149.0


In [8]:
# Shuffle-and-split helper (60/20/20 train/val/test by default)
def split_df(base_df, seed=42, train_frac=0.6, val_frac=0.2):
    """Shuffle ``base_df`` and cut it into train / validation / test frames.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Frame to split; it is not modified.
    seed : int, default 42
        ``random_state`` passed to ``DataFrame.sample`` for the shuffle.
    train_frac : float, default 0.6
        Fraction of rows assigned to the training part.
    val_frac : float, default 0.2
        Fraction of rows assigned to the validation part. Whatever remains
        after train and validation becomes the test part.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` copies, sliced consecutively from the shuffled
        frame whose index was reset to ``0..n-1``.

    Raises
    ------
    ValueError
        If a fraction lies outside ``[0, 1]`` or the two fractions sum to
        more than 1.
    """
    if not (0 <= train_frac <= 1 and 0 <= val_frac <= 1):
        raise ValueError("train_frac and val_frac must each be within [0, 1]")
    # Small tolerance so that sums like 0.7 + 0.3 are not rejected by float noise.
    if train_frac + val_frac > 1 + 1e-9:
        raise ValueError("train_frac + val_frac must not exceed 1")

    shuffled = base_df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    n_rows = len(shuffled)
    # Sizes are truncated, so rounding leftovers end up in the test part.
    n_train = int(train_frac * n_rows)
    n_val = int(val_frac * n_rows)
    val_end = n_train + n_val

    return (
        shuffled.iloc[:n_train].copy(),
        shuffled.iloc[n_train:val_end].copy(),
        shuffled.iloc[val_end:].copy(),
    )

# Predictor columns, in the order they are handed to the regressors.
features = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]

# Column the models are trained to predict.
target = 'fuel_efficiency_mpg'

In [9]:
# Q3: fill 0 vs fill mean (mean from TRAIN only), seed=42
# One shuffle/split with seed 42; the train and validation parts are used below.
dtr, dvl, dte = split_df(df, seed=42)

# Feature matrices are copies, so imputing them later never touches dtr / dvl.
Xtr = dtr[features].copy()
Xvl = dvl[features].copy()

# Targets pulled out with .values (array form rather than Series).
ytr = dtr[target].values
yvl = dvl[target].values


In [13]:
# --- Strategy A: missing values become 0 ---
Xtr0 = Xtr.fillna(0)
Xvl0 = Xvl.fillna(0)
m0 = LinearRegression().fit(Xtr0, ytr)
rmse0 = round(rmse(yvl, m0.predict(Xvl0)), 2)

# --- Strategy B: missing values become the column mean, computed on TRAIN only ---
# Only columns with gaps in the training frame get a fill value; the same
# values are then reused for validation, so nothing leaks from validation.
train_means = {
    col: Xtr[col].mean()
    for col in Xtr.columns
    if Xtr[col].isna().any()
}
Xtrm = Xtr.fillna(train_means)
Xvlm = Xvl.fillna(train_means)
mm = LinearRegression().fit(Xtrm, ytr)
rmsem = round(rmse(yvl, mm.predict(Xvlm)), 2)

# The comparison is made on the scores rounded to 2 decimals, so a tie is possible.
if rmse0 < rmsem:
    better = "With 0"
elif rmsem < rmse0:
    better = "With mean"
else:
    better = "Both are equally good"
print(f"Q3 RMSE → fill-0: {rmse0} | fill-mean: {rmsem} | Answer: {better}")

Q3 RMSE → fill-0: 0.52 | fill-mean: 0.46 | Answer: With mean


In [14]:
# Q4: ridge sweep r in [0,0.01,0.1,1,5,10,100], fill 0
r_list = [0, 0.01, 0.1, 1, 5, 10, 100]

# Re-create the seed-42 split here instead of reusing whatever dtr / dvl
# currently hold: other cells rebind those names with different seeds, so
# re-running this cell could otherwise silently sweep over the wrong split.
dtr, dvl, dte = split_df(df, seed=42)
Xtr0, Xvl0 = dtr[features].fillna(0), dvl[features].fillna(0)
ytr, yvl = dtr[target].values, dvl[target].values

scores = []
for r in r_list:
    # r == 0 means no regularization, so plain least squares is used.
    model = LinearRegression() if r == 0 else Ridge(alpha=r, random_state=42)
    model.fit(Xtr0, ytr)
    # Scores are compared after rounding to 2 decimals.
    s = round(rmse(yvl, model.predict(Xvl0)), 2)
    scores.append(s)
    print(f"r={r:<6} RMSE={s}")

# Lowest rounded RMSE wins; ties go to the smallest r regardless of how
# r_list happens to be ordered.
best_rmse, best_r = min(zip(scores, r_list))
print("Q4 best r:", best_r, "with RMSE:", best_rmse)

r=0      RMSE=0.52
r=0.01   RMSE=0.52
r=0.1    RMSE=0.52
r=1      RMSE=0.52
r=5      RMSE=0.52
r=10     RMSE=0.52
r=100    RMSE=0.52
Q4 best r: 0 with RMSE: 0.52


In [15]:
# Q5: seeds 0–9, split 60/20/20, fill 0, no regularization, std of val RMSE
def _val_rmse_for_seed(seed):
    """Validation RMSE of a fill-0 LinearRegression fitted on the split for ``seed``.

    Everything is kept in local names so the notebook-wide dtr / dvl / Xtr /
    Xvl / ytr / yvl set by earlier cells are not overwritten by this sweep.
    """
    train_part, val_part, _ = split_df(df, seed=seed)
    X_train = train_part[features].fillna(0)
    X_val = val_part[features].fillna(0)
    y_train = train_part[target].values
    y_val = val_part[target].values
    fitted = LinearRegression().fit(X_train, y_train)
    return rmse(y_val, fitted.predict(X_val))


vals = [_val_rmse_for_seed(seed) for seed in range(10)]
# np.std uses its default ddof=0 (population standard deviation).
std = round(np.std(vals), 3)
print("Q5 std across seeds 0–9:", std)


Q5 std across seeds 0–9: 0.007


In [16]:
# Q6: seed=9, train on train+val, fill 0, r=0.001, test RMSE
dtr, dvl, dte = split_df(df, seed=9)

# Fitting set: train rows followed by validation rows, missing values -> 0.
feature_parts = [dtr[features], dvl[features]]
target_parts = [dtr[target].values, dvl[target].values]
Xtr_full = pd.concat(feature_parts).fillna(0)
ytr_full = np.concatenate(target_parts)

# Held-out test part, imputed the same way.
Xte = dte[features].fillna(0)
yte = dte[target].values

ridge = Ridge(alpha=0.001, random_state=9)
ridge.fit(Xtr_full, ytr_full)

rmse_test = rmse(yte, ridge.predict(Xte))
print("Q6 test RMSE:", rmse_test, "→ rounded:", round(rmse_test, 3))

Q6 test RMSE: 0.5154915324831211 → rounded: 0.515


In [18]:
# --------------- Summary ----------------------
# Collect every answer computed above and emit them as one block of text.
summary_lines = [
    "\n FINAL ANSWERS SUMMARY",
    f"Q1: Missing column → {missing_col}",
    f"Q2: Median horsepower → {df['horsepower'].median()}",
    f"Q3: Better option → {better}",
    f"Q4: Best r → {best_r}",
    f"Q5: RMSE std → {std}",
    f"Q6: Test RMSE → {round(rmse_test, 3)}",
]
print("\n".join(summary_lines))



 FINAL ANSWERS SUMMARY
Q1: Missing column → horsepower
Q2: Median horsepower → 149.0
Q3: Better option → With mean
Q4: Best r → 0
Q5: RMSE std → 0.007
Q6: Test RMSE → 0.515
