<a href="https://colab.research.google.com/github/Geetanshi-jain/DSAssignmentByGeetanshijain/blob/main/regressionModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Toy dataset for demonstration.
# Each feature row is [constant, credit card usage, days since purchase].
X = np.array([
    [1, 0, 300],
    [1, 1, 250],
    [1, 0, 400],
    [1, 1, 150],
])
y = np.array([500, 450, 600, 300])  # target: sales per visit

# Hold out half of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Fit an ordinary linear regression on the training half.
model02 = LinearRegression()
model02.fit(X_train, y_train)

# Score a single hypothetical customer: constant = 1, CC = 0, Days = 333.
cust1 = np.array([[1, 0, 333]])
predicted_sales = model02.predict(cust1)
print(f"Predicted sales for customer 1: ${predicted_sales[0]:.2f}")

# Evaluate on the held-out half with the mean absolute error (MAE).
ypred = model02.predict(X_test)
mae = mean_absolute_error(y_test, ypred)

print(f"The MAE value is ${mae:.2f}, representing the average error between predicted and actual values.")


Predicted sales for customer 1: $533.00
The MAE value is $25.00, representing the average error between predicted and actual values.


In [5]:

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error

def stepwise_selection(X, y,
                       initial_list=None,
                       threshold_in=0.05,
                       threshold_out=0.05,
                       verbose=True):
    """Select features by forward-backward stepwise OLS on p-values.

    Every pass of the loop does two things:

    1. Forward step: each column not yet included is fitted together with
       the included columns; the candidate with the smallest p-value is
       added if that p-value is below ``threshold_in``.
    2. Backward step: the model is refitted on the included columns and the
       column with the largest p-value is removed if it exceeds
       ``threshold_out``.

    The loop ends after a pass that neither adds nor removes a column.

    Args:
        X: pandas DataFrame holding the candidate feature columns.
        y: Target values, passed to ``sm.OLS`` as the dependent variable.
        initial_list: Column names to start from. ``None`` (the default) or
            an empty list starts with no features. The argument is copied,
            never mutated.
        threshold_in: A candidate is added when its p-value is strictly
            below this value.
        threshold_out: An included column is removed when its p-value is
            strictly above this value.
        verbose: When true, print one line per added or removed column.

    Returns:
        list: Names of the selected columns, in the order they were added.

    Note:
        Keep ``threshold_in <= threshold_out``. With a larger
        ``threshold_in``, a column whose p-value lies between the two
        thresholds is added and removed on every pass and the loop never
        terminates.
    """
    included = [] if initial_list is None else list(initial_list)

    def fit_ols(columns):
        # OLS of y on the given columns; add_constant supplies the intercept.
        return sm.OLS(y, sm.add_constant(pd.DataFrame(X[columns]))).fit()

    while True:
        changed = False

        # Forward Step
        # Walk X.columns in order (instead of a set difference) so the
        # candidate order, and therefore tie-breaking, is deterministic.
        excluded = [col for col in X.columns if col not in included]
        # Explicit float dtype: an index-only Series would otherwise be
        # created with object dtype.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = fit_ols(included + [new_column])
            new_pval[new_column] = model.pvalues[new_column]
        if excluded:
            best_pval = new_pval.min()
            if best_pval < threshold_in:
                best_feature = new_pval.idxmin()
                included.append(best_feature)
                changed = True
                if verbose:
                    print(f'Add  {best_feature} with p-value {best_pval}')

        # Backward Step
        # Nothing can be removed from an empty selection, so skip the fit.
        if included:
            model = fit_ols(included)
            # Pick the p-values by label rather than by position:
            # add_constant does not prepend an intercept when the data
            # already contains a constant column, so slicing off the first
            # entry could silently drop a real feature.
            pvalues = model.pvalues.loc[included]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                worst_feature = pvalues.idxmax()
                included.remove(worst_feature)
                changed = True
                if verbose:
                    print(f'Remove {worst_feature} with p-value {worst_pval}')

        if not changed:
            break

    return included



In [7]:
# Small illustrative dataset: one target column and three candidate predictors.
data = pd.DataFrame(
    {
        'Sales': [500, 450, 600, 300],
        'Days': [300, 250, 400, 150],
        'CreditCard': [0, 1, 0, 1],
        'Web': [1, 0, 1, 0],
    }
)

# Candidate predictors and the response variable.
X = data[['Days', 'CreditCard', 'Web']]
y = data['Sales']

# Let the stepwise procedure choose which predictors to keep.
selected_features = stepwise_selection(X, y)

# Refit OLS (with an intercept) on the chosen predictors and report it.
final_design = sm.add_constant(X[selected_features])
model = sm.OLS(y, final_design).fit()
print(model.summary())


Add  Days with p-value 0.0072054464084597755
                            OLS Regression Results                            
Dep. Variable:                  Sales   R-squared:                       0.986
Model:                            OLS   Adj. R-squared:                  0.978
Method:                 Least Squares   F-statistic:                     137.3
Date:                Tue, 05 Nov 2024   Prob (F-statistic):            0.00721
Time:                        17:15:29   Log-Likelihood:                -15.927
No. Observations:                   4   AIC:                             35.85
Df Residuals:                       2   BIC:                             34.63
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const  

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [9]:
# Example of comparing models
#
# To judge whether the regression is useful, its Mean Absolute Error (MAE)
# is compared with that of a trivial baseline which predicts the mean of y
# for every observation.
mean_of_y = y.mean()
baseline_pred = [mean_of_y] * len(y)
baseline_mae = mean_absolute_error(y, baseline_pred)

# Predictions of the regression model on the selected features (plus intercept).
design_matrix = sm.add_constant(X[selected_features])
y_pred = model.predict(design_matrix)
model_mae = mean_absolute_error(y, y_pred)

print(f"Baseline MAE: ${baseline_mae:.2f}")
print(f"Regression Model MAE: ${model_mae:.2f}")

# The regression wins only with a strictly lower MAE; a tie goes to the baseline.
print(
    "The regression model performs better than the baseline."
    if model_mae < baseline_mae
    else "The baseline model performs better than the regression model."
)


Baseline MAE: $87.50
Regression Model MAE: $12.50
The regression model performs better than the baseline.


In [10]:
# Baseline model: predict the mean of y for every observation.
baseline_pred = len(y) * [y.mean()]
baseline_mae = mean_absolute_error(y, baseline_pred)

# Regression model: predict from the selected features plus an intercept.
regression_inputs = sm.add_constant(X[selected_features])
y_pred = model.predict(regression_inputs)
model_mae = mean_absolute_error(y, y_pred)

for label, value in (("Baseline", baseline_mae), ("Regression Model", model_mae)):
    print(f"{label} MAE: ${value:.2f}")

# Comparison: the regression must have a strictly lower MAE to be declared better.
if not (model_mae < baseline_mae):
    print("The baseline model performs better than the regression model.")
else:
    print("The regression model performs better than the baseline.")


Baseline MAE: $87.50
Regression Model MAE: $12.50
The regression model performs better than the baseline.
