<a href="https://colab.research.google.com/github/IshaanKaul210104/ML-Model-Recommender/blob/main/ML_Model_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Cell 1: Import Libraries**

In [20]:
# ---------------------- Imports -----------------------
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing, load_diabetes, make_friedman1
from sklearn.datasets import load_iris, load_wine, make_classification, make_blobs
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import NotFittedError
from scipy.stats import skew
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides convergence and
# deprecation warnings raised by the libraries imported above — consider
# narrowing it if results look suspicious.
warnings.filterwarnings('ignore')

**Cell 2: Meta-Feature Calculator**

In [21]:
# ---------------------- Meta-Feature Calculation -----------------------
def _finite_mean(values):
    """Return the mean of the finite entries of *values*, or 0.0 if none exist."""
    arr = np.asarray(values, dtype=float).ravel()
    finite = arr[np.isfinite(arr)]
    return float(finite.mean()) if finite.size else 0.0


def calculate_meta_features(X, y=None, task='regression'):
    """Compute the dataset meta-features consumed by the model recommender.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table (one column per feature).
    y : array-like, optional
        Target values, positionally aligned with the rows of ``X``.
    task : str
        The target correlation is only computed for ``'regression'`` and
        ``'classification'``; for any other task it is reported as 0.

    Returns
    -------
    dict
        ``'avg_skewness'``: mean absolute skewness over the features.
        ``'avg_correlation_with_target'``: mean absolute Pearson correlation
        between each feature and ``y`` (0 when not applicable).
        ``'multicollinearity_score'``: trace of the pseudo-inverse of the
        feature correlation matrix; for an invertible matrix its diagonal
        holds the variance inflation factors, so this is their sum.
    """
    meta = {}

    # Skewness can come back as NaN for a constant column; average only the
    # finite values so one degenerate feature does not poison the result.
    meta['avg_skewness'] = _finite_mean(np.abs(np.atleast_1d(skew(X))))

    if task in ['regression', 'classification'] and y is not None:
        correlations = []
        for col in X.columns:
            try:
                with np.errstate(invalid='ignore', divide='ignore'):
                    correlations.append(np.corrcoef(X[col], y)[0, 1])
            except (TypeError, ValueError):
                # Best effort: a non-numeric target or a shape mismatch counts
                # as "no measurable linear relationship" for this feature.
                correlations.append(0)
        # NaN entries (constant feature or constant target) are skipped.
        meta['avg_correlation_with_target'] = _finite_mean(np.abs(correlations))
    else:
        meta['avg_correlation_with_target'] = 0

    with np.errstate(invalid='ignore', divide='ignore'):
        # atleast_2d: with a single feature np.corrcoef returns a 0-d value,
        # which np.linalg.pinv cannot handle.
        corr = np.atleast_2d(np.corrcoef(np.asarray(X, dtype=float), rowvar=False))

    # A constant feature has undefined correlations (NaN). Treat it as
    # uncorrelated with everything (identity row/column) so the
    # pseudo-inverse stays well defined.
    undefined = np.isnan(corr)
    if undefined.any():
        corr = np.where(undefined, np.eye(corr.shape[0]), corr)

    # Pseudo-inverse rather than inverse: perfectly collinear features make
    # the correlation matrix singular.
    vif_matrix = np.linalg.pinv(corr)
    meta['multicollinearity_score'] = float(np.trace(vif_matrix))

    return meta

**Cell 3: Model Recommender**

In [22]:
# ---------------------- Model Recommender -----------------------
def recommend_model(meta, task='regression'):
    """Choose an unfitted scikit-learn estimator from dataset meta-features.

    Parameters
    ----------
    meta : dict
        Meta-features with the keys ``'avg_skewness'``,
        ``'avg_correlation_with_target'`` and ``'multicollinearity_score'``
        (only the keys relevant to ``task`` are read).
    task : str
        One of ``'regression'``, ``'classification'`` or ``'clustering'``.

    Returns
    -------
    estimator
        A new, unfitted estimator instance with default hyper-parameters
        (KMeans uses ``n_clusters=3, random_state=42``).

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported task names. Previously this
        case fell through and silently returned ``None``.
    """
    if task == 'regression':
        # Heavy skew or strong multicollinearity: prefer the tree ensemble.
        if meta['avg_skewness'] > 2 or meta['multicollinearity_score'] > 50:
            return RandomForestRegressor()
        if meta['avg_correlation_with_target'] > 0.5:
            return Ridge()
        return LinearRegression()

    if task == 'classification':
        if meta['multicollinearity_score'] > 50:
            return RandomForestClassifier()
        if meta['avg_correlation_with_target'] > 0.4:
            return LogisticRegression()
        # Weak linear signal: fall back to the tree ensemble.
        return RandomForestClassifier()

    if task == 'clustering':
        if meta['avg_skewness'] > 2:
            return DBSCAN()
        return KMeans(n_clusters=3, random_state=42)

    raise ValueError(
        f"Unsupported task {task!r}; expected 'regression', "
        "'classification' or 'clustering'."
    )

**Cell 4: Model Trainer and Evaluator**

In [23]:
# ---------------------- Training & Evaluation -----------------------
def train_and_evaluate(model, X, y=None, task='regression'):
    """Fit ``model`` and print evaluation metrics for the given task.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator.
    X : array-like
        Feature matrix.
    y : array-like, optional
        Target values; required for ``'regression'`` and ``'classification'``.
    task : str
        ``'regression'`` prints R² and MSE on a 20% hold-out split,
        ``'classification'`` prints accuracy on a stratified 20% hold-out
        split, ``'clustering'`` fits on all of ``X`` and prints the
        silhouette score.

    Raises
    ------
    ValueError
        If ``task`` is not supported, or ``y`` is missing for a supervised
        task.
    """
    if task not in ('regression', 'classification', 'clustering'):
        raise ValueError(
            f"Unsupported task {task!r}; expected 'regression', "
            "'classification' or 'clustering'."
        )

    if task == 'clustering':
        model.fit(X)
        labels = model.labels_ if hasattr(model, 'labels_') else model.predict(X)
        print("\n📊 Clustering Evaluation:")
        # silhouette_score is only defined for 2 .. n_samples - 1 distinct
        # labels; DBSCAN in particular can put every point in one group.
        n_groups = len(set(labels))
        if 2 <= n_groups < len(labels):
            print("Silhouette Score:", silhouette_score(X, labels))
        else:
            print(
                f"Silhouette Score: undefined ({n_groups} distinct label(s) "
                f"for {len(labels)} samples)"
            )
        return

    if y is None:
        raise ValueError(f"y is required for task {task!r}.")

    if task == 'regression':
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("\n📊 Model Evaluation:")
        print("R² Score:", r2_score(y_test, y_pred))
        print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    else:
        # Stratify so the class proportions are preserved in both splits.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print("\n📊 Model Evaluation:")
        print("Accuracy:", accuracy_score(y_test, y_pred))

**Cell 5: Dataset Loader**

In [24]:
# ---------------------- Built-in Dataset Loader -----------------------
def _frame_from_arrays(features, target=None):
    """Wrap a feature array (and optional target) in a DataFrame with X0..Xn columns."""
    frame = pd.DataFrame(features, columns=[f"X{i}" for i in range(features.shape[1])])
    if target is not None:
        frame['target'] = target
    return frame


def load_builtin_dataset(task_type):
    """Interactively pick one of the bundled datasets for ``task_type``.

    Prints the available choices, reads the selection from standard input and
    returns the chosen dataset as a DataFrame. Supervised datasets include a
    ``'target'`` column; the clustering dataset contains features only.

    Parameters
    ----------
    task_type : str
        ``'regression'``, ``'classification'`` or ``'clustering'``.

    Raises
    ------
    ValueError
        If ``task_type`` is not supported (checked before any prompt is
        shown) or the menu selection is invalid.
    """
    if task_type == "regression":
        print("Available datasets: 1. California Housing  2. Diabetes  3. Friedman1")
        choice = input("Select (1/2/3): ").strip()
        if choice == "1":
            data = fetch_california_housing(as_frame=True).frame
        elif choice == "2":
            data = load_diabetes(as_frame=True).frame
        elif choice == "3":
            X, y = make_friedman1(n_samples=1000, n_features=10, noise=1.0, random_state=42)
            data = _frame_from_arrays(X, y)
        else:
            raise ValueError("Invalid selection.")

    elif task_type == "classification":
        print("Available datasets: 1. Iris  2. Wine  3. Make Classification")
        choice = input("Select (1/2/3): ").strip()
        if choice == "1":
            data = load_iris(as_frame=True).frame
        elif choice == "2":
            data = load_wine(as_frame=True).frame
        elif choice == "3":
            X, y = make_classification(n_samples=1000, n_features=10, n_classes=3, random_state=42)
            data = _frame_from_arrays(X, y)
        else:
            raise ValueError("Invalid selection.")

    elif task_type == "clustering":
        print("Available datasets: 1. Blobs")
        choice = input("Select (1): ").strip()
        if choice == "1":
            # The generated cluster labels are discarded: clustering is unsupervised.
            X, _ = make_blobs(n_samples=500, n_features=5, centers=3, random_state=42)
            data = _frame_from_arrays(X)
        else:
            raise ValueError("Invalid selection.")

    else:
        # Without this branch `data` would be unbound at the return below.
        raise ValueError(
            f"Unsupported task type {task_type!r}; expected 'regression', "
            "'classification' or 'clustering'."
        )

    return data

**Cell 6: Optional File Upload for Google Colab**

In [25]:
# ---------------------- Upload Dataset (Optional) -----------------------
def upload_dataset():
    """Prompt for a file upload in Google Colab and load it as a DataFrame.

    Only the first uploaded file is read; ``.csv`` and ``.xlsx`` files are
    supported (the extension check is case-insensitive).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If nothing was uploaded or the first file has an unsupported
        extension.
    """
    # Imported lazily so the rest of the notebook still works where the
    # google.colab package is not installed.
    from google.colab import files

    uploaded = files.upload()
    if not uploaded:
        # Previously an empty upload fell out of the loop and returned None.
        raise ValueError("No file was uploaded.")

    fname = next(iter(uploaded))
    lowered = fname.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(fname)
    if lowered.endswith('.xlsx'):
        return pd.read_excel(fname)
    raise ValueError("Unsupported file format.")

**Cell 7: Main Pipeline**

In [26]:
# ---------------------- Main Execution -----------------------
task_type = input("What type of task? (regression/classification/clustering): ").strip().lower()
# Fail fast on a typo instead of erroring later, deep inside the pipeline.
if task_type not in ('regression', 'classification', 'clustering'):
    raise ValueError(f"Unsupported task type: {task_type!r}")

print("\nChoose dataset type:\n1. Use built-in dataset\n2. Upload your own dataset")
dataset_choice = input("Enter 1 or 2: ").strip()

if dataset_choice == "1":
    data = load_builtin_dataset(task_type)
elif dataset_choice == "2":
    data = upload_dataset()
else:
    raise ValueError("Invalid dataset choice.")

print(f"\nDataset loaded with shape: {data.shape}\n")
print("Columns:", list(data.columns))

if task_type in ['regression', 'classification']:
    target_col = input("Enter the name of the target column: ").strip()
    if target_col not in data.columns:
        raise ValueError(f"Target column {target_col!r} not found in dataset.")
    y = data[target_col]
    X = data.drop(columns=[target_col])
else:
    X = data
    y = None

# Keep only numerical columns and drop missing values
X = X.select_dtypes(include=[np.number]).dropna()
if y is not None:
    y = y.loc[X.index]
    # Rows without a target value cannot be used for supervised training.
    has_target = y.notna()
    X, y = X.loc[has_target], y.loc[has_target]

if X.empty:
    raise ValueError("No usable numeric feature data left after cleaning.")

# Standardize the features (keep the row index so X_scaled and y stay aligned)
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

# Calculate meta-features
meta_features = calculate_meta_features(X_scaled, y, task=task_type)
print("Avg skewness:", meta_features['avg_skewness'])
print("Avg correlation with target:", meta_features['avg_correlation_with_target'])
print("Multicollinearity score:", meta_features['multicollinearity_score'])

# Recommend and evaluate model
model = recommend_model(meta_features, task=task_type)
print("\n✅ Recommended model:", model.__class__.__name__)
train_and_evaluate(model, X_scaled, y, task=task_type)

What type of task? (regression/classification/clustering): regression

Choose dataset type:
1. Use built-in dataset
2. Upload your own dataset
Enter 1 or 2: 1
Available datasets: 1. California Housing  2. Diabetes  3. Friedman1
Select (1/2/3): 2

Dataset loaded with shape: (442, 11)

Columns: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6', 'target']
Enter the name of the target column: target
Avg skewness: 0.408265651725193
Avg correlation with target: 0.34185669222980936
Multicollinearity score: 139.7138548950911

✅ Recommended model: RandomForestRegressor

📊 Model Evaluation:
R² Score: 0.4265944023756
Mean Squared Error: 3037.9898696629216
