In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.exceptions import ConvergenceWarning
import warnings

# === Silence convergence warnings ===
# Installed up front so the filter is active before any estimator is fitted.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# === Load Data ===
# Both CSVs are read relative to the notebook's working directory,
# training file first, then the test file.
train_df, test_df = (pd.read_csv(path) for path in ("train-xy.csv", "test-x.csv"))

# Column 0 of the training frame is used as the target; every remaining
# column is a feature. The test frame is used whole (all columns as features).
y_train = train_df.iloc[:, 0].values
X_train = train_df.iloc[:, 1:].values
X_test = test_df.values

# === Standard pipeline: impute → scale → model ===
def make_pipeline(model):
    """Wrap ``model`` in a three-step preprocessing pipeline.

    Steps, in order:
      1. ``'imputer'`` - ``SimpleImputer(strategy='mean')`` to fill NaNs.
      2. ``'scaler'``  - ``StandardScaler()``.
      3. ``'model'``   - the estimator passed in, used as the final step.

    Parameters
    ----------
    model : estimator
        The estimator to place at the end of the pipeline.

    Returns
    -------
    Pipeline
        A newly constructed pipeline containing the three steps above.
    """
    steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('model', model),
    ]
    return Pipeline(steps)

# === Regression models ===
# Each candidate tunes its own regularisation strength with an internal
# 5-fold search (cv=5) over a log-spaced alpha grid.
# NOTE(review): the ElasticNet grid includes l1_ratio=1, which per the
# scikit-learn docs is a pure L1 penalty - so it can land on the same fit as
# the Lasso candidate. Confirm whether that overlap is intended.
models = {
    "Ridge": RidgeCV(
        alphas=np.logspace(-3, 3, num=10),
        cv=5,
    ),
    "Lasso": LassoCV(
        alphas=np.logspace(-3, 1, num=30),
        cv=5,
        max_iter=50000,
        n_jobs=-1,
    ),
    "ElasticNet": ElasticNetCV(
        alphas=np.logspace(-3, 1, num=30),
        l1_ratio=[0.1, 0.5, 0.9, 0.95, 1],
        cv=5,
        max_iter=50000,
        n_jobs=-1,
    ),
}

# === Fit and evaluate ===
# Trackers for the best candidate seen so far. All three are reset on every
# run of this cell, so a re-run in which every model fails cannot leave a
# stale `best_model_name` behind in the kernel.
best_score = -np.inf
best_model = None
best_model_name = None

for name, model in models.items():
    try:
        pipeline = make_pipeline(model)
        # Outer 5-fold CV scored with R²; each candidate also runs its own
        # inner alpha search (see the `models` definitions).
        scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
        mean_score = scores.mean()
        print(f"{name} Mean CV R²: {mean_score:.4f}")

        # Strict '>' keeps the earlier entry on an exact tie, and a NaN mean
        # never compares greater, so it can never be selected.
        if mean_score > best_score:
            best_score = mean_score
            best_model = pipeline
            best_model_name = name
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the
        # next candidate instead of aborting the whole comparison.
        print(f"{name} failed: {e}")

# === Final model prediction ===
# Compare against None explicitly. Plain truthiness on a scikit-learn Pipeline
# goes through its __len__ (the number of steps), which only happens to be
# truthy here and is not the question being asked.
if best_model is not None:
    print(f"\nTraining final model: {best_model_name}")
    # Fit the selected pipeline on the full training set, then predict.
    best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)

    # Save predictions as a single-column CSV named after the student id.
    student_id = "A0000000A"  # Change to your real student number
    output = pd.DataFrame({"Y": predictions})
    output.to_csv(f"{student_id}.csv", index=False)
    print(f"Predictions saved to {student_id}.csv")
else:
    # No candidate produced a usable score in the evaluation loop.
    print("All models failed again. Let’s troubleshoot further.")

Ridge Mean CV R²: 0.3763
Lasso Mean CV R²: 0.4976
ElasticNet Mean CV R²: 0.4976

Training final model: Lasso
Predictions saved to A0000000A.csv


In [14]:
import pandas as pd

# Reload the submission from disk so the checks run against the written file.
pred_df = pd.read_csv("A0000000A.csv")  # Replace with your actual file name

# Expected layout: 10,000 rows, one column named "Y", no missing entries.
assert (len(pred_df.index), len(pred_df.columns)) == (10000, 1), "Prediction file should have 10,000 rows and 1 column"
assert pred_df.columns[0] == "Y", "Column name must be 'Y'"
assert pred_df.notna().to_numpy().all(), "There should be no missing values"

print("✅ Your file format is correct!")

✅ Your file format is correct!
