In [8]:
# Regression Project – Diabetes Progression Prediction (stand-in for House Price Prediction)
# Author: Koussay Kraiem
#
# In this project, I'm training a simple regression model.
# Since Kaggle removed access to some old datasets, I am using
# the "diabetes" dataset from scikit-learn, which is fully built-in.
# This dataset is still perfect for demonstrating a regression model.

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

# 1. Load the diabetes dataset that ships with scikit-learn (no download needed).
# The target is a quantitative measure of disease progression; the features
# are the medical measurements listed in `data.feature_names`.
data = load_diabetes()

# Wrap the raw arrays in labelled pandas objects so they are easier to inspect.
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.Series(data["target"]).rename("Target")

# Quick look at the size and the first rows of the feature table.
print("Dataset shape:", X.shape)
print("\nHere are the first few rows of the dataset:", X.head(), sep="\n")

# 2. Split into training and testing sets
# Tunable settings are named here instead of being buried in the call below.
TEST_SIZE = 0.2      # fraction of rows held out for evaluation
RANDOM_STATE = 42    # fixed seed so the split is reproducible across runs

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Sanity check: the two splits together must account for every row.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# 3. Standardize features
# The scaler is fitted on the training split only and then re-used on the
# test split, so no test-set statistics leak into the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Fit an ordinary least-squares model on the standardized training data.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train_scaled, y_train)

# 5. Score the model on the held-out test split.
y_pred = model.predict(X_test_scaled)

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # back in the units of the target
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One print call; `sep="\n"` puts each part on its own line.
print(
    "\nModel performance:",
    f"RMSE: {rmse:.3f}",
    f"R² score: {r2:.3f}",
    sep="\n",
)

# 6. Run a single-row prediction end to end.
# `head(1)` keeps the row as a one-row DataFrame (2-D), which is the shape the
# scaler and the model expect.
example = X_test.head(1)
example_scaled = scaler.transform(example)

# Exactly one row went in, so exactly one prediction comes out.
(predicted_value,) = model.predict(example_scaled)

print("\nExample input features:", example, sep="\n")

print(f"\nPredicted value for this example: {predicted_value:.3f}")

Dataset shape: (442, 10)

Here are the first few rows of the dataset:
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  

Model performance:
RMSE: 53.853
R² score: 0.453

Example input features:
          age       sex       bmi        bp        s1        s2        s3  \
287  0.045341 -0.044642 -0.006206 -0.015999  0.125019  0.125198  0.019187   

           s4        s5       s6  
287  0.0343