<a href="https://colab.research.google.com/github/KenDaupsey/Multivariate-Regression-Analysis/blob/main/Multivariate_Regression_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

****Multivariate Regression Analysis****

*** Multiple Linear Regression ***

In [2]:
### Import Necessary Libraries
import pandas as pd
import numpy as np

In [3]:
# Load the dataset from GitHub URL
url = "https://raw.githubusercontent.com/KenDaupsey/Simple-Linear--Regression-Analysis-Using-Python/main/hsb2%7Edata.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,id,female,race,ses,schtyp,prog,read,write,math,science,socst
0,70,male,white,low,public,general,57,52,41,47,57
1,121,female,white,middle,public,vocation,68,59,53,63,61
2,86,male,white,high,public,general,44,33,54,58,31
3,141,male,white,high,public,vocation,63,44,47,53,56
4,172,male,white,middle,public,academic,47,52,57,53,61


In [4]:
## Converting Categorical Gender Data (female) for Multiple Linear Regression Analysis
import pandas as pd
import statsmodels.api as sm

# Assuming df is your DataFrame
# Convert 'female' column to binary numeric format
df['female'] = df['female'].apply(lambda x: 1 if x == 'female' else 0)

# Multiple Linear Regression Procedure
# Define independent and dependent variables
X = df[['female', 'read', 'math', 'science', 'socst']]
X = sm.add_constant(X)  # Add a constant term for the intercept
y = df['write']

# Fit the multiple linear regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  write   R-squared:                       0.602
Model:                            OLS   Adj. R-squared:                  0.591
Method:                 Least Squares   F-statistic:                     58.60
Date:                Mon, 04 Mar 2024   Prob (F-statistic):           5.80e-37
Time:                        04:52:52   Log-Likelihood:                -641.05
No. Observations:                 200   AIC:                             1294.
Df Residuals:                     194   BIC:                             1314.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.1388      2.808      2.186      0.0

*** Comparing Linear, Lasso, and Ridge Regressions: Single vs. Multiple Variables***

In [5]:
# Import necessary libraries
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Assuming df is already defined with the dataset

# Identify and one-hot encode categorical variables
categorical_cols = ['female', 'race', 'schtyp', 'prog', 'ses']  # Add other categorical columns as needed
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Separate dependent and independent variables
X = df_encoded.drop(['id', 'write'], axis=1)  # Independent variables
y = df_encoded['write']  # Dependent variable

# Add a constant term for statsmodels
X_with_const = sm.add_constant(X)

# Function to fit and print the summary
def fit_and_print_summary(model, X, y, name):
    result = model.fit()
    print(f"{name} Summary:")
    print(result.summary().tables[1])
    print()

# 1. Simple Linear Regression
linear_model = LinearRegression()
linear_model.fit(X[['read']], y)
fit_and_print_summary(sm.OLS(y, sm.add_constant(X[['read']])), X[['read']], y, "1. Simple Linear Regression")

# 2. Multiple Linear Regression
multiple_linear_model = LinearRegression()
multiple_linear_model.fit(X_with_const, y)
fit_and_print_summary(sm.OLS(y, X_with_const), X_with_const, y, "2. Multiple Linear Regression")

# 3. Simple Lasso Regression
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X[['read']], y)
fit_and_print_summary(sm.OLS(y, sm.add_constant(X[['read']])), X[['read']], y, "3. Simple Lasso Regression")

# 4. Multiple regression with Lasso
lasso_model_multiple = Lasso(alpha=1.0)
lasso_model_multiple.fit(X_with_const, y)
fit_and_print_summary(sm.OLS(y, X_with_const), X_with_const, y, "4. Multiple Regression with Lasso")

# 5. Simple Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X[['read']], y)
fit_and_print_summary(sm.OLS(y, sm.add_constant(X[['read']])), X[['read']], y, "5. Simple Ridge Regression")

# 6. Multiple regression with Ridge
ridge_model_multiple = Ridge(alpha=1.0)
ridge_model_multiple.fit(X_with_const, y)
fit_and_print_summary(sm.OLS(y, X_with_const), X_with_const, y, "6. Multiple Regression with Ridge")

# Display accuracy scores
y_pred_linear = linear_model.predict(X[['read']])
y_pred_multiple_linear = multiple_linear_model.predict(X_with_const)
y_pred_lasso = lasso_model.predict(X[['read']])
y_pred_lasso_multiple = lasso_model_multiple.predict(X_with_const)
y_pred_ridge = ridge_model.predict(X[['read']])
y_pred_ridge_multiple = ridge_model_multiple.predict(X_with_const)

# Calculate accuracy metrics for each model
mae_linear = mean_absolute_error(y, y_pred_linear)
mae_multiple_linear = mean_absolute_error(y, y_pred_multiple_linear)
mae_lasso = mean_absolute_error(y, y_pred_lasso)
mae_lasso_multiple = mean_absolute_error(y, y_pred_lasso_multiple)
mae_ridge = mean_absolute_error(y, y_pred_ridge)
mae_ridge_multiple = mean_absolute_error(y, y_pred_ridge_multiple)

mse_linear = mean_squared_error(y, y_pred_linear)
mse_multiple_linear = mean_squared_error(y, y_pred_multiple_linear)
mse_lasso = mean_squared_error(y, y_pred_lasso)
mse_lasso_multiple = mean_squared_error(y, y_pred_lasso_multiple)
mse_ridge = mean_squared_error(y, y_pred_ridge)
mse_ridge_multiple = mean_squared_error(y, y_pred_ridge_multiple)

r2_linear = r2_score(y, y_pred_linear)
r2_multiple_linear = r2_score(y, y_pred_multiple_linear)
r2_lasso = r2_score(y, y_pred_lasso)
r2_lasso_multiple = r2_score(y, y_pred_lasso_multiple)
r2_ridge = r2_score(y, y_pred_ridge)
r2_ridge_multiple = r2_score(y, y_pred_ridge_multiple)

print(f"MAE for Simple Linear Regression: {mae_linear:.4f}")
print(f"MAE for Multiple Linear Regression: {mae_multiple_linear:.4f}")
print(f"MAE for Simple Lasso Regression: {mae_lasso:.4f}")
print(f"MAE for Multiple Lasso Regression: {mae_lasso_multiple:.4f}")
print(f"MAE for Simple Ridge Regression: {mae_ridge:.4f}")
print(f"MAE for Multiple Ridge Regression: {mae_ridge_multiple:.4f}")
print()
print(f"MSE for Simple Linear Regression: {mse_linear:.4f}")
print(f"MSE for Multiple Linear Regression: {mse_multiple_linear:.4f}")
print(f"MSE for Simple Lasso Regression: {mse_lasso:.4f}")
print(f"MSE for Multiple Lasso Regression: {mse_lasso_multiple:.4f}")
print(f"MSE for Simple Ridge Regression: {mse_ridge:.4f}")
print(f"MSE for Multiple Ridge Regression: {mse_ridge_multiple:.4f}")
print()
print(f"R-squared for Simple Linear Regression: {r2_linear:.4f}")
print(f"R-squared for Multiple Linear Regression: {r2_multiple_linear:.4f}")
print(f"R-squared for Simple Lasso Regression: {r2_lasso:.4f}")
print(f"R-squared for Multiple Lasso Regression: {r2_lasso_multiple:.4f}")
print(f"R-squared for Simple Ridge Regression: {r2_ridge:.4f}")
print(f"R-squared for Multiple Ridge Regression: {r2_ridge_multiple:.4f}")

1. Simple Linear Regression Summary:
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         23.9594      2.806      8.539      0.000      18.426      29.492
read           0.5517      0.053     10.465      0.000       0.448       0.656

2. Multiple Linear Regression Summary:
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            10.3224      3.968      2.601      0.010       2.494      18.151
read              0.1162      0.065      1.778      0.077      -0.013       0.245
math              0.1745      0.071      2.447      0.015       0.034       0.315
science           0.2556      0.065      3.943      0.000       0.128       0.383
socst             0.2292      0.056      4.125      0.000       0.120       0.339
female_1          5.0987      0.8

*** Regression and Variance Inflation Factor (VIF)***

In [6]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load your dataset
# Assuming df is already defined with the dataset

# Identify and one-hot encode categorical variables
categorical_cols = ['race', 'ses', 'schtyp', 'prog', 'female']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define the dependent variable (y) and independent variables (X)
# Assuming 'math' is the dependent variable, adjust as necessary
y = df['math']
X = df_encoded.drop(['math'], axis=1)  # Drop the dependent variable from X

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the multiple regression model
model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

# Calculate the VIF for each independent variable
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)

                            OLS Regression Results                            
Dep. Variable:                   math   R-squared:                       0.608
Model:                            OLS   Adj. R-squared:                  0.578
Method:                 Least Squares   F-statistic:                     20.48
Date:                Mon, 04 Mar 2024   Prob (F-statistic):           1.41e-30
Time:                        04:56:20   Log-Likelihood:                -637.14
No. Observations:                 200   AIC:                             1304.
Df Residuals:                     185   BIC:                             1354.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            13.1342      4.347      3.022