In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [2]:
# Build a reproducible toy dataset: three uniform predictors and a linear
# response with standard-normal noise. The draws must happen in the order
# X1, X2, X3, noise so the seeded random stream yields the same values.
np.random.seed(42)
sample_shape = (100, 1)  # 100 observations, each stored as a column vector

X1 = np.random.random_sample(sample_shape) * 2  # uniform on [0, 2)
X2 = np.random.random_sample(sample_shape) * 3  # uniform on [0, 3)
X3 = np.random.random_sample(sample_shape) * 4  # uniform on [0, 4)
noise = np.random.standard_normal(sample_shape)

# Intercept 5 with slopes 2, 3 and 4 on X1, X2 and X3 respectively
y = 5 + X1 * 2 + X2 * 3 + X3 * 4 + noise

In [3]:
# Stack the three predictor columns side by side into one design matrix
X = np.hstack((X1, X2, X3))

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Ordinary least-squares regression via scikit-learn, fitted on the
# training split only; this estimator is the one used for prediction
model_sklearn = LinearRegression()
model_sklearn.fit(X=X_train, y=y_train)

In [5]:
# Report the slope estimates and the intercept learned by the scikit-learn fit
for label, fitted_value in (
    ("Scikit-learn Coefficients:", model_sklearn.coef_),
    ("Scikit-learn Intercept:", model_sklearn.intercept_),
):
    print(label, fitted_value)

Scikit-learn Coefficients: [[2.33201968 3.14025674 4.10630486]]
Scikit-learn Intercept: [4.51441893]


In [6]:
# statsmodels supplies the inferential statistics (standard errors, t-values,
# p-values) for the same regression.
# sm.OLS is given an explicit constant column here so that an intercept
# is estimated alongside the three slopes.
X_train_with_const = sm.add_constant(X_train)

# Specify the regression, then estimate it on the training data
ols_spec = sm.OLS(endog=y_train, exog=X_train_with_const)
model_sm = ols_spec.fit()

# The summary table lists the coefficient estimates together with their p-values
print(model_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.973
Model:                            OLS   Adj. R-squared:                  0.972
Method:                 Least Squares   F-statistic:                     908.3
Date:                Wed, 06 Dec 2023   Prob (F-statistic):           2.06e-59
Time:                        21:08:49   Log-Likelihood:                -98.654
No. Observations:                  80   AIC:                             205.3
Df Residuals:                      76   BIC:                             214.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.5144      0.346     13.033      0.0