In [1]:
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Separate the regression target ("math score") from the predictor columns.
y = df["math score"]
X = df.drop(columns=["math score"])

# Group the predictor columns by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
numerical_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X.select_dtypes(include=["object"]).columns)

In [None]:
# Hold out 20% of the rows as a test set; random_state pins the shuffle so the
# split is reproducible. The split must start from the full X / y: X_train and
# y_train are only created by this statement, so they cannot be its inputs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline regressors evaluated with their default hyperparameters.
# The tree ensembles get a fixed random_state so repeated runs give the same fit.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(),
}

In [None]:
# Baseline evaluation: every model in `models` is fitted with its default
# hyperparameters behind the shared preprocessor and scored on the test split.
preds_no_tuning = {}    # model name -> test-set predictions
results_no_tuning = []  # one metrics record per model

for name, model in models.items():
    pipe = Pipeline(steps=[("pre", preprocessor), ("model", model)])
    preds = pipe.fit(X_train, y_train).predict(X_test)
    preds_no_tuning[name] = preds
    results_no_tuning.append(
        dict(
            Model=name,
            MAE=mean_absolute_error(y_test, preds),
            RMSE=np.sqrt(mean_squared_error(y_test, preds)),
            R2=r2_score(y_test, preds),
        )
    )

# Best model (highest R2) first; later cells read row 0 as the winner.
df_no_tuning = pd.DataFrame(results_no_tuning).sort_values(by="R2", ascending=False)

print("Evaluation WITHOUT Tuning", df_no_tuning, sep="\n")

In [None]:
# Search spaces for hyperparameter tuning. Each entry pairs a fresh estimator
# with a parameter grid; the "model__" prefix addresses the pipeline step
# named "model".
tuned_models = {
    "Ridge": dict(
        model=Ridge(),
        params={"model__alpha": [0.1, 1.0, 10.0]},
    ),
    "Lasso": dict(
        model=Lasso(),
        params={"model__alpha": [0.001, 0.01, 0.1, 1.0]},
    ),
    "SVR": dict(
        model=SVR(),
        params={
            "model__C": [0.1, 1, 10],
            "model__gamma": ["scale", "auto"],
        },
    ),
    "RandomForest": dict(
        model=RandomForestRegressor(random_state=42),
        params={
            "model__n_estimators": [100, 200],
            "model__max_depth": [None, 10, 20],
        },
    ),
    "GradientBoosting": dict(
        model=GradientBoostingRegressor(random_state=42),
        params={
            "model__n_estimators": [100, 200],
            "model__learning_rate": [0.05, 0.1],
            "model__max_depth": [3, 5],
        },
    ),
}

In [None]:
# Tuning (or "fine-tuning") is the process of adjusting a model's
# hyperparameters so that it performs as well as possible on a given dataset.
best_estimators = {}  # model name -> best pipeline found by the grid search
results_tuned = []    # one metrics record per tuned model

for name, mp in tuned_models.items():
    pipe = Pipeline(steps=[("pre", preprocessor), ("model", mp["model"])])
    # 5-fold cross-validated grid search over the model's grid, ranked by R2.
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=mp["params"],
        scoring="r2",
        cv=5,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    best_estimators[name] = grid.best_estimator_

    preds = grid.predict(X_test)
    results_tuned.append(
        dict(
            Model=name,
            MAE=mean_absolute_error(y_test, preds),
            RMSE=np.sqrt(mean_squared_error(y_test, preds)),
            R2=r2_score(y_test, preds),
            **{"Best Params": grid.best_params_},
        )
    )

# Best tuned model (highest test R2) first.
df_tuned = pd.DataFrame(results_tuned).sort_values(by="R2", ascending=False)

print("\nEvaluation WITH Tuning", df_tuned, sep="\n")

In [None]:
# Side-by-side metrics per model: columns from the untuned run get the
# "_NoTuning" suffix, columns from the tuned run get "_Tuned".
# The join is on "Model" with the default inner join, so a model that appears
# in only one of the two tables is left out of the comparison.
tuned_metrics = df_tuned.drop(columns=["Best Params"])
comparison_df = (
    df_no_tuning
    .merge(tuned_metrics, on="Model", suffixes=("_NoTuning", "_Tuned"))
    .sort_values(by="R2_Tuned", ascending=False)
)

print("Comparison of Models: No Tuning vs With Tuning\n", comparison_df, sep="\n")

In [None]:
# Both result tables are sorted by R2 descending, so the first row of each
# names the best-scoring model.
best_model_no_tuning_name = df_no_tuning["Model"].iloc[0]
best_model_tuned_name = df_tuned["Model"].iloc[0]

# Rebuild and refit the best untuned pipeline; the best tuned pipeline is
# taken as already fitted by the grid search.
best_model_no_tuning = Pipeline(
    steps=[("pre", preprocessor), ("model", models[best_model_no_tuning_name])]
).fit(X_train, y_train)
best_model_tuned = best_estimators[best_model_tuned_name]

preds_no = best_model_no_tuning.predict(X_test)
preds_tuned = best_model_tuned.predict(X_test)

# Actual test targets next to both sets of predictions (rounded to 2 decimals).
compare_preds = pd.DataFrame(
    {
        "Actual": y_test.values,
        f"Predicted_{best_model_no_tuning_name}_NoTuning": preds_no.round(2),
        f"Predicted_{best_model_tuned_name}_Tuned": preds_tuned.round(2),
    }
)
print("FOR MATH SCORE...", compare_preds.head(15), sep="\n")

In [None]:
# Take the preprocessing step from a pipeline that was explicitly fitted on
# X_train, instead of from `pipe`, a leftover loop variable whose pipeline was
# never fitted itself (GridSearchCV fits clones of it).
fitted_pre = best_model_no_tuning.named_steps["pre"]

# Apply the fitted preprocessing to both splits.
X_train_transformed = fitted_pre.transform(X_train)
X_test_transformed = fitted_pre.transform(X_test)

# Names of the columns produced by the "cat" transformer for the categorical inputs.
encoder = fitted_pre.named_transformers_["cat"]
cat_feature_names = encoder.get_feature_names_out(categorical_cols)


In [None]:
# Categorical inputs are the object-dtype columns of the training frame.
cat_f = [col for col, dtype in X_train.dtypes.items() if dtype == "object"]

# Read the fitted "cat" transformer from a pipeline that was explicitly fitted
# on X_train, rather than from the leftover loop variable `pipe`.
encoder = best_model_no_tuning.named_steps["pre"].named_transformers_["cat"]
cat_feature_names = encoder.get_feature_names_out(cat_f)

# Column labels for the transformed matrix: encoded categorical names first,
# then the two numeric score columns.
# NOTE(review): assumes the preprocessor emits the "cat" block before the
# numeric block and that exactly these two numeric columns exist — confirm
# against the preprocessor definition.
cols = list(cat_feature_names)
cols.extend(["reading_score", "writing_score"])

In [None]:
# Attach the feature names to BOTH transformed matrices. A model fitted on the
# named training frame then receives a test frame with matching column names
# at predict time, instead of an unnamed array.
X_train_transformed = pd.DataFrame(X_train_transformed, columns=cols)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=cols)

In [None]:
# Fit a Ridge regressor on the transformed training features; this is the
# model that SHAP explains below.
model = Ridge(alpha=1).fit(X_train_transformed, y_train)

# Build the explainer from the model's prediction function, using the
# training matrix as background data, then compute SHAP values for the
# training rows.
explainer = shap.Explainer(model.predict, X_train_transformed)
shap_values = explainer(X_train_transformed)

# Summary plot: shows which columns have the highest impact on the prediction.
# Colour encodes the feature's value (red = high, blue = low); the horizontal
# position is the SHAP value, so points to the right of zero raise the
# predicted math score and points to the left lower it.
shap.summary_plot(shap_values)