<a href="https://colab.research.google.com/github/IshaanKaul210104/Regression-Model-Recommender/blob/main/Regression_Model_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Install Dependencies
!pip install -q scikit-learn xgboost pandas

# Import Libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing, load_diabetes, make_friedman1
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import skew
from google.colab import files

# File upload cell
def upload_csv():
    """Open the Colab upload dialog and return the name of the first uploaded file.

    Every uploaded file name is echoed to stdout; only the first one is
    returned.

    Returns:
        The first key of the mapping returned by ``files.upload()``.

    Raises:
        ValueError: If the dialog was closed without uploading any file.
    """
    uploaded = files.upload()
    # Cancelling the dialog yields an empty mapping; fail with a clear message
    # rather than an opaque IndexError from indexing an empty list.
    if not uploaded:
        raise ValueError("No file was uploaded; a CSV file is required to continue.")
    for fname in uploaded:
        print(f"✅ Uploaded file: {fname}")
    return next(iter(uploaded))

# Dataset preparation functions
def load_california_housing():
    """Load the California housing data as a features/target pair.

    Returns:
        ``(X, y)`` where ``X`` is the dataset frame without the
        ``'MedHouseVal'`` column and ``y`` is that column.
    """
    target_name = 'MedHouseVal'
    frame = fetch_california_housing(as_frame=True).frame
    features = frame.drop(columns=[target_name])
    return features, frame[target_name]

def load_diabetes_dataset():
    """Load the diabetes data as a features/target pair.

    Returns:
        ``(X, y)`` where ``X`` is the dataset frame without the ``'target'``
        column and ``y`` is that column.
    """
    target_name = 'target'
    frame = load_diabetes(as_frame=True).frame
    features = frame.drop(columns=[target_name])
    return features, frame[target_name]

def generate_friedman_dataset():
    """Generate the synthetic Friedman #1 dataset and save it as a CSV.

    Draws 1000 samples with 10 features (noise 1.0, seed 42), writes the
    features together with a ``'target'`` column to ``friedman1.csv`` in the
    current directory, and prints a confirmation.

    Returns:
        ``(X, y)``: the feature frame (columns ``feature_0`` .. ``feature_9``)
        and the ``'target'`` series.
    """
    raw_X, raw_y = make_friedman1(
        n_samples=1000, n_features=10, noise=1.0, random_state=42
    )
    column_names = [f'feature_{idx}' for idx in range(raw_X.shape[1])]
    features = pd.DataFrame(raw_X, columns=column_names)
    target = pd.Series(raw_y, name='target')

    # The CSV holds the features followed by the target as the last column.
    features.assign(target=target).to_csv('friedman1.csv', index=False)
    print("✅ Generated Friedman1 dataset")
    return features, target

# Dataset Analysis
def analyze_dataset(X, y):
    """Compute and print the summary statistics that drive model selection.

    Args:
        X: Feature DataFrame with numeric columns.
        y: Target Series aligned with ``X``.

    Returns:
        A tuple ``(avg_skew, avg_corr, multicollinearity_score)``:

        * ``avg_skew`` - mean absolute skewness across the feature columns.
        * ``avg_corr`` - mean absolute correlation of each feature with ``y``.
        * ``multicollinearity_score`` - condition number of the feature
          correlation matrix, or NaN when no usable matrix exists.
    """
    skews = X.apply(skew)
    avg_skew = skews.abs().mean()

    correlations = X.corrwith(y).abs()
    avg_corr = correlations.mean()

    corr_matrix = X.corr().to_numpy()
    # A zero-variance column has an undefined correlation with every column,
    # itself included, so its whole row/column is NaN. Leaving it in makes
    # np.linalg.cond fail or return NaN, so the condition number is computed
    # over the columns whose correlations are defined.
    defined = ~np.isnan(corr_matrix).all(axis=0)
    corr_matrix = corr_matrix[np.ix_(defined, defined)]
    if corr_matrix.size == 0 or np.isnan(corr_matrix).any():
        # Nothing meaningful to measure; report NaN instead of raising.
        multicollinearity_score = float('nan')
    else:
        multicollinearity_score = np.linalg.cond(corr_matrix)

    print(f"\n📈 Avg skewness: {avg_skew}")
    print(f"📈 Avg correlation with target: {avg_corr}")
    print(f"📈 Multicollinearity score: {multicollinearity_score}")

    return avg_skew, avg_corr, multicollinearity_score

# Model Recommender
def recommend_model(avg_skew, avg_corr, multicol_score):
    """Pick a regressor name from the dataset summary statistics.

    Rules are checked in order and the first match wins:

    1. Condition number above 1000 or average skew above 1 -> ``'RandomForest'``.
    2. Average correlation above 0.5 with average skew below 0.5 -> ``'Linear'``.
    3. Average correlation in ``(0.3, 0.5]`` -> ``'Ridge'``.
    4. Anything else -> ``'RandomForest'``.

    Returns:
        One of ``'RandomForest'``, ``'Linear'`` or ``'Ridge'``.
    """
    forest = 'RandomForest'

    # Heavy multicollinearity or skew: prefer the tree ensemble outright.
    if multicol_score > 1000 or avg_skew > 1:
        return forest

    # Strong linear signal on roughly symmetric features.
    if avg_corr > 0.5 and avg_skew < 0.5:
        return 'Linear'

    # Moderate linear signal gets the regularised model; otherwise fall back.
    return 'Ridge' if 0.3 < avg_corr <= 0.5 else forest

# Model Trainer
def train_and_evaluate(model_name, X_train, X_test, y_train, y_test):
    """Fit the named regressor and report its R² and RMSE on the test split.

    Args:
        model_name: One of ``'Linear'``, ``'Ridge'``, ``'RandomForest'`` or
            ``'XGBoost'``. Any other value falls back to the random forest.
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test targets.

    Returns:
        A tuple ``(r2, rmse)`` with the scores that are also printed.
    """
    # Factories rather than instances, so only the selected estimator is
    # constructed on each call.
    model_factories = {
        'Linear': LinearRegression,
        'Ridge': Ridge,
        'RandomForest': lambda: RandomForestRegressor(n_estimators=100, random_state=42),
        'XGBoost': lambda: XGBRegressor(n_estimators=100, random_state=42, verbosity=0),
    }

    # The fallback reuses the seeded random-forest configuration so that an
    # unknown name still produces reproducible results.
    build_model = model_factories.get(model_name, model_factories['RandomForest'])
    model = build_model()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"\n✅ Recommended model: {model_name}")
    print(f"\n📊 Model Evaluation:")
    print(f"R² Score: {r2}")
    print(f"RMSE: {rmse}")

    return r2, rmse

# Main function to run
def run_model_on_dataset(X, y):
    """Analyze a dataset, choose a model for it, then train and evaluate it.

    The data is split 80/20 into train and test sets with a fixed seed (42)
    after the recommendation has been made on the full dataset.

    Args:
        X: Feature DataFrame.
        y: Target Series.
    """
    dataset_stats = analyze_dataset(X, y)
    chosen_model = recommend_model(*dataset_stats)

    # train_test_split yields X_train, X_test, y_train, y_test in that order,
    # which is the positional order train_and_evaluate expects.
    split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
    train_and_evaluate(chosen_model, *split_parts)

# Choose Dataset
print("Choose dataset to test:")
print("1. California Housing")
print("2. Diabetes")
print("3. Friedman1 Synthetic")
print("4. Upload CSV")

# Tolerate stray whitespace around the typed menu number.
choice = input("Enter choice (1/2/3/4): ").strip()

if choice == '1':
    X, y = load_california_housing()
elif choice == '2':
    X, y = load_diabetes_dataset()
elif choice == '3':
    X, y = generate_friedman_dataset()
elif choice == '4':
    filename = upload_csv()
    df = pd.read_csv(filename)
    target_col = input("Enter target column name: ")
    # Validate before use so a typo gives a readable message, not a KeyError.
    if target_col not in df.columns:
        print(f"❌ Column '{target_col}' not found. Available columns: {list(df.columns)}")
        raise SystemExit
    X = df.drop(columns=[target_col])
    y = df[target_col]
else:
    print("❌ Invalid choice")
    # Raise SystemExit directly: the exit() helper is injected by site/IPython,
    # is not guaranteed in plain scripts, and in a notebook it asks the kernel
    # to shut down instead of just stopping this cell.
    raise SystemExit

# Run Model
print(f"\nDataset loaded with shape: {X.shape}")
run_model_on_dataset(X, y)


Choose dataset to test:
1. California Housing
2. Diabetes
3. Friedman1 Synthetic
4. Upload CSV
Enter choice (1/2/3/4): 2

Dataset loaded with shape: (442, 10)

📈 Avg skewness: 0.408265651725193
📈 Avg correlation with target: 0.34185669222980936
📈 Multicollinearity score: 470.0779993588374

✅ Recommended model: Ridge

📊 Model Evaluation:
R² Score: 0.41915292635986556
RMSE: 55.47446204180109
