# Regression

In [4]:
# Task 2: Regression
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Loading Data, Data Pre-processing, EDA
data = pd.read_csv("Dataset2/healthcare-dataset-stroke-data.csv")

# Handle missing values
imputer = SimpleImputer(strategy='mean')
data['bmi'] = imputer.fit_transform(data[['bmi']])

# Step 2: Feature Engineering, Creating Train, and Test Datasets
# For simplicity, we'll drop 'id' as it's irrelevant for regression
data.drop('id', axis=1, inplace=True)

# Encoding categorical variables
data = pd.get_dummies(data)

# Splitting the data into features and target variable
X = data.drop(['stroke'], axis=1)
y = data['stroke']

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Apply at least 2 algorithms for regression (Training and Testing)
# Linear Regression
lr_regressor = LinearRegression()
lr_regressor.fit(X_train, y_train)
lr_pred = lr_regressor.predict(X_test)

# Ridge Regression
ridge_regressor = Ridge(alpha=1.0)
ridge_regressor.fit(X_train, y_train)
ridge_pred = ridge_regressor.predict(X_test)

# Step 4: Generate at least 2 Evaluation Metrics on each algorithm.
def evaluate_regression(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, r2

lr_metrics = evaluate_regression(y_test, lr_pred)
ridge_metrics = evaluate_regression(y_test, ridge_pred)

# Step 5: Comparing the results.
print("Linear Regression Metrics: MSE={}, R2={}".format(*lr_metrics))
print("Ridge Regression Metrics: MSE={}, R2={}".format(*ridge_metrics))

# Step 6: Fine Tune the best algorithm (if needed).
# For fine-tuning, you can use techniques like GridSearchCV or RandomizedSearchCV.


Linear Regression Metrics: MSE=0.05155617495817212, R2=0.09526881980827506
Ridge Regression Metrics: MSE=0.051556789329261055, R2=0.09525803854529702


In [5]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define the parameter distributions to sample from
param_distributions = {
    'alpha': uniform(0.001, 100)
}

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(Ridge(), param_distributions, n_iter=100, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Fit the random search to the data
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_ridge_model = random_search.best_estimator_

# Evaluate the best model on the test set
best_ridge_pred = best_ridge_model.predict(X_test)
best_ridge_mse = mean_squared_error(y_test, best_ridge_pred)
best_ridge_r2 = r2_score(y_test, best_ridge_pred)

print("Metrics of Fine-Tuned Ridge Regression Model: MSE={}, R2={}".format(best_ridge_mse, best_ridge_r2))


Best Parameters: {'alpha': 35.67633266935893}
Metrics of Fine-Tuned Ridge Regression Model: MSE=0.05158118324690286, R2=0.0948299629963355
