In [None]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Creating a sample dataset
data = {
    'bedrooms': [3, 4, 2, 3, 4, 3, 2, 4, 3, 2],
    'bathrooms': [2, 3, 1, 2, 3, 2, 1, 3, 2, 1],
    'sqft': [2000, 2500, 1500, 1800, 2100, 1900, 1600, 2200, 2000, 1700],
    'age': [10, 5, 8, 12, 6, 9, 11, 4, 7, 13],
    'price': [300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000]
}

df = pd.DataFrame(data)

# Performing multivariate linear regression
X = df[['bedrooms', 'bathrooms', 'sqft', 'age']]
y = df['price']

X = sm.add_constant(X)  # adding a constant term to the predictor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = sm.OLS(y_train, X_train).fit()
predictions = model.predict(X_test)

# Calculate performance evaluation metrics
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

print(model.summary())

Mean Squared Error: 299170939.6322265
R-squared: 0.5213264965884375
Mean Absolute Error: 16778.874056882545
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.940
Model:                            OLS   Adj. R-squared:                  0.894
Method:                 Least Squares   F-statistic:                     20.78
Date:                Thu, 02 Jan 2025   Prob (F-statistic):            0.00668
Time:                        04:32:50   Log-Likelihood:                -88.270
No. Observations:                   8   AIC:                             184.5
Df Residuals:                       4   BIC:                             184.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------

  res = hypotest_fun_out(*samples, **kwds)


In [None]:
import numpy as np

class MultipleLinearRegression:
    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        X = np.c_[np.ones(n_samples), X]  # Add a column of ones for the bias term

        # Calculate the coefficients using the OLS method
        self.weights = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

    def predict(self, X):
        n_samples = X.shape[0]
        X = np.c_[np.ones(n_samples), X]  # Add a column of ones for the bias term
        return X.dot(self.weights)

# Sample dataset
X = np.array([[3, 2, 2000, 10],
              [4, 3, 2500, 5],
              [2, 1, 1500, 8],
              [3, 2, 1800, 12],
              [4, 3, 2100, 6],
              [3, 2, 1900, 9],
              [2, 1, 1600, 11],
              [4, 3, 2200, 4],
              [3, 2, 2000, 7],
              [2, 1, 1700, 13]])

y = np.array([300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000])

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model = MultipleLinearRegression()
model.fit(X_train, y_train)

# Making predictions on the test set
predictions = model.predict(X_test)

# Evaluation metrics
mse = np.mean((y_test - predictions) ** 2)
r2 = 1 - (np.sum((y_test - predictions) ** 2) / np.sum((y_test - np.mean(y_test)) ** 2))
mae = np.mean(np.abs(y_test - predictions))

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Mean Squared Error: 34913145236.64707
R-squared: -54.861032378635315
Mean Absolute Error: 174002.22479623315


In [None]:
#Here's an updated version of your code that includes regularization using Ridge regression to prevent overfitting and improve the model's generalization
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

class RidgeRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        model = Ridge(alpha=self.alpha)
        model.fit(X, y)
        self.weights = model.coef_
        self.bias = model.intercept_

    def predict(self, X):
        return X.dot(self.weights) + self.bias

# Sample dataset
X = np.array([[3, 2, 2000, 10],
              [4, 3, 2500, 5],
              [2, 1, 1500, 8],
              [3, 2, 1800, 12],
              [4, 3, 2100, 6],
              [3, 2, 1900, 9],
              [2, 1, 1600, 11],
              [4, 3, 2200, 4],
              [3, 2, 2000, 7],
              [2, 1, 1700, 13]])

y = np.array([300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)

# Training the model with Ridge regularization
model = RidgeRegression(alpha=1.0)  # You can adjust the alpha value for regularization
model.fit(X_train, y_train)

# Making predictions on the test set
predictions = model.predict(X_test)

# Evaluation metrics
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Mean Squared Error: 269464682.7406521
R-squared: 0.8669310208688138
Mean Absolute Error: 16268.75959990255


In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV

class RidgeRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        model = Ridge(alpha=self.alpha)
        model.fit(X, y)
        self.weights = model.coef_
        self.bias = model.intercept_

    def predict(self, X):
        return X.dot(self.weights) + self.bias

    def select_alpha(self, X, y, alphas):
        param_grid = {'alpha': alphas}
        grid_search = GridSearchCV(Ridge(), param_grid, cv=5)
        grid_search.fit(X, y)
        self.alpha = grid_search.best_params_['alpha']
        return self.alpha

# Sample dataset
X = np.array([[3, 2, 2000, 10], [4, 3, 2500, 5], [2, 1, 1500, 8], [3, 2, 1800, 12],
              [4, 3, 2100, 6], [3, 2, 1900, 9], [2, 1, 1600, 11], [4, 3, 2200, 4],
              [3, 2, 2000, 7], [2, 1, 1700, 13]])

y = np.array([300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)

# Initialize the RidgeRegression model
model = RidgeRegression()

# Grid search for alpha selection
alphas = [0.1, 1.0, 10.0]  # Define a list of alpha values to search over
best_alpha = model.select_alpha(X_train, y_train, alphas)

print("Best Alpha:", best_alpha)

# Fit the model with the selected alpha
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluation metrics
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Best Alpha: 0.1
Mean Squared Error: 321540655.56954205
R-squared: 0.8412144910767694
Mean Absolute Error: 17915.37831219926




In [None]:
#To implement L1 regularization (Lasso) in Python using the provided data, you can use the Lasso regression model from scikit-learn.
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

class LassoRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        model = Lasso(alpha=self.alpha)
        model.fit(X, y)
        self.weights = model.coef_
        self.bias = model.intercept_

    def predict(self, X):
        return X.dot(self.weights) + self.bias

# Sample dataset
X = np.array([[3, 2, 2000, 10],
              [4, 3, 2500, 5],
              [2, 1, 1500, 8],
              [3, 2, 1800, 12],
              [4, 3, 2100, 6],
              [3, 2, 1900, 9],
              [2, 1, 1600, 11],
              [4, 3, 2200, 4],
              [3, 2, 2000, 7],
              [2, 1, 1700, 13]])

y = np.array([300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000])

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model with Lasso regularization
model = LassoRegression(alpha=1.0)  # You can adjust the alpha value for regularization
model.fit(X_train, y_train)

# Making predictions on the test set
predictions = model.predict(X_test)

# Evaluation metrics
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Mean Squared Error: 299227851.8496748
R-squared: 0.5212354370405203
Mean Absolute Error: 16783.644586673705


In [None]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

class RidgeRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        model = Ridge(alpha=self.alpha)
        model.fit(X, y)
        self.weights = model.coef_
        self.bias = model.intercept_

    def predict(self, X):
        return X.dot(self.weights) + self.bias

# Sample dataset
X = np.array([[3, 2, 2000, 10],
              [4, 3, 2500, 5],
              [2, 1, 1500, 8],
              [3, 2, 1800, 12],
              [4, 3, 2100, 6],
              [3, 2, 1900, 9],
              [2, 1, 1600, 11],
              [4, 3, 2200, 4],
              [3, 2, 2000, 7],
              [2, 1, 1700, 13]])

y = np.array([300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000])

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)

# Training the model with Ridge regularization
model = RidgeRegression(alpha=1.0)  # You can adjust the alpha value for regularization
model.fit(X_train, y_train)

# Making predictions on the test set
predictions = model.predict(X_test)

# Evaluation metrics
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Mean Squared Error: 269464682.7406521
R-squared: 0.8669310208688138
Mean Absolute Error: 16268.75959990255


In [None]:
#we need to select the good features. So used backward elimination is needed to select good feature

import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Creating a sample dataset
data = {
    'X1': [3, 4, 2, 3, 4, 3, 2, 4, 3, 2],
    'X2': [2, 3, 1, 2, 3, 2, 1, 3, 2, 1],
    'X3': [2000, 2500, 1500, 1800, 2100, 1900, 1600, 2200, 2000, 1700],
    'X4': [10, 5, 8, 12, 6, 9, 11, 4, 7, 13],
    'y': [300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000]
}

df = pd.DataFrame(data)

# Performing multiple linear regression with backward elimination
X = df[['X1', 'X2', 'X3', 'X4']]
y = df['y']

# Adding a constant term to the predictor
X = sm.add_constant(X)

# Backward Elimination
cols = list(X.columns)
while len(cols) > 0:
    X_opt = X[cols]
    model = sm.OLS(y, X_opt).fit()
    p_values = model.pvalues
    max_p_value = p_values.drop('const').max()
    if max_p_value > 0.05:
        cols.remove(p_values.idxmax())
    else:
        break

selected_features = cols
print("Selected Features after Backward Elimination:", selected_features)

# Fit the final model
final_model = sm.OLS(y, X[selected_features]).fit()

# Make predictions
predictions = final_model.predict(X[selected_features])

# Calculate evaluation metrics
mse = mean_squared_error(y, predictions)
r2 = r2_score(y, predictions)
mae = mean_absolute_error(y, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

print(final_model.summary())

Selected Features after Backward Elimination: ['const', 'X1']
Mean Squared Error: 362333333.3333334
R-squared: 0.9068312333933316
Mean Absolute Error: 15200.000000000004
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.907
Model:                            OLS   Adj. R-squared:                  0.895
Method:                 Least Squares   F-statistic:                     77.87
Date:                Wed, 01 Jan 2025   Prob (F-statistic):           2.14e-05
Time:                        18:19:24   Log-Likelihood:                -112.73
No. Observations:                  10   AIC:                             229.5
Df Residuals:                       8   BIC:                             230.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err        

  res = hypotest_fun_out(*samples, **kwds)


In [14]:
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Sample dataset
X = np.array([[3, 2, 2000, 10],
              [4, 3, 2500, 5],
              [2, 1, 1500, 8],
              [3, 2, 1800, 12],
              [4, 3, 2100, 6],
              [3, 2, 1900, 9],
              [2, 1, 1600, 11],
              [4, 3, 2200, 4],
              [3, 2, 2000, 7],
              [2, 1, 1700, 13]])

y = np.array([300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)

# Fit the Lasso regression model with L1 regularization
alpha = 0.1  # Regularization parameter (use 0.1 then show the result)
lasso = Lasso(alpha=alpha)
lasso.fit(X_train, y_train)

# Make predictions on the test set
predictions = lasso.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print("Mean Squared Error:", mse)
print("Lasso Coefficients:", lasso.coef_)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Mean Squared Error: 333035476.44719684
Lasso Coefficients: [ 5.02783783e+04  0.00000000e+00  4.05430630e+01 -5.68871832e+03]
R-squared: 0.8355380363223719
Mean Absolute Error: 18205.73418637592


In [None]:
#grid search with L1 regularization
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV

class LassoRegression:
    def __init__(self, alpha=1.0):
        self.alpha = alpha
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        model = Lasso(alpha=self.alpha)
        model.fit(X, y)
        self.weights = model.coef_
        self.bias = model.intercept_

    def predict(self, X):
        return X.dot(self.weights) + self.bias

    def select_alpha(self, X, y, alphas):
        param_grid = {'alpha': alphas}
        grid_search = GridSearchCV(Lasso(), param_grid, cv=5)
        grid_search.fit(X, y)
        self.alpha = grid_search.best_params_['alpha']
        return self.alpha

# Sample dataset
X = np.array([[3, 2, 2000, 10], [4, 3, 2500, 5], [2, 1, 1500, 8], [3, 2, 1800, 12],
              [4, 3, 2100, 6], [3, 2, 1900, 9], [2, 1, 1600, 11], [4, 3, 2200, 4],
              [3, 2, 2000, 7], [2, 1, 1700, 13]])

y = np.array([300000, 400000, 250000, 280000, 410000, 320000, 230000, 390000, 350000, 260000])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)

# Initialize the LassoRegression model
model = LassoRegression()

# Grid search for alpha selection
alphas = [0.1, 1.0, 10.0]  # Define a list of alpha values to search over
best_alpha = model.select_alpha(X_train, y_train, alphas)

print("Best Alpha:", best_alpha)

# Fit the model with the selected alpha
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Evaluation metrics
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print("Mean Squared Error:", mse)
print("R-squared:", r2)
print("Mean Absolute Error:", mae)

Best Alpha: 0.1
Mean Squared Error: 333035476.44719684
R-squared: 0.8355380363223719
Mean Absolute Error: 18205.73418637592


