In [3]:
import pandas as pd 
# Read the CSV at ./data/df_clean.csv into the frame used by the cells below.
df = pd.read_csv(filepath_or_buffer='./data/df_clean.csv')

In [None]:
import numpy as np

def metric(preds, actuals):
    """Root mean squared percentage error (RMSPE), expressed in percent.

    Computes ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))`` over the
    observations whose actual value is non-zero. Zero actuals are skipped
    because the relative error is undefined there (division by zero would
    otherwise turn the whole score into inf/NaN).

    Parameters
    ----------
    preds : array-like
        Predicted values (intended for 1-D input).
    actuals : array-like
        Ground-truth values, same shape as ``preds``.

    Returns
    -------
    float
        RMSPE in percent, or NaN when no observation has a non-zero actual
        (including empty input).

    Raises
    ------
    AssertionError
        If ``preds`` and ``actuals`` differ in shape.
    """
    # Coerce to plain float arrays: accepts lists, and guarantees positional
    # (not index-aligned) arithmetic when pandas Series are passed in.
    preds = np.asarray(preds, dtype=float)
    actuals = np.asarray(actuals, dtype=float)
    assert preds.shape == actuals.shape

    scored = actuals != 0
    n_scored = int(np.count_nonzero(scored))
    if n_scored == 0:
        # Nothing to score; mirrors the 0/0 result of the unmasked formula.
        return float('nan')

    relative_error = (actuals[scored] - preds[scored]) / actuals[scored]
    return 100 * np.linalg.norm(relative_error) / np.sqrt(n_scored)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder


# NOTE(review): this cell used to rebuild `df` with `cleaner(train, store)`, but
# `cleaner`, `train` and `store` are not defined or imported in the visible
# notebook, so it raised NameError on a fresh kernel and clobbered the frame read
# from './data/df_clean.csv' in the first cell. That loaded frame is used here
# instead -- presumably the CSV is the saved output of the cleaning step; confirm
# it contains 'Sales' and every feature column listed below.
X = df.drop(columns=['Sales'])
y = df['Sales']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns routed to each preprocessing branch. Columns not listed here are
# dropped by the ColumnTransformer (its default `remainder='drop'`).
# NOTE(review): 'Customers' is presumably only known once the day is over --
# confirm it is available at prediction time, otherwise it leaks the target.
numeric_features = ['Competition_Since_X_months', 'weeks_since_promo2', 'Customers', 'CompetitionDistance']
one_hot_features = ['Promo', 'Promo2','DayOfWeek',  'StoreType', 'Assortment', 'PromoInterval']


# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown='ignore' encodes a category that was absent from the fitting
# data as all zeros instead of raising, which keeps scoring on the test split
# and on cross-validation folds from failing on a rare level.
one_hot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Shared preprocessing step reused as the first stage of every model pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('ohe', one_hot_transformer, one_hot_features)
])

Linear Regression - R-squared on test data: 0.832537777889222
Random Forest - R-squared on test data: 0.9672011136520342
Lasso Regression - R-squared on test data: 0.8325369215765772


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso

# Baseline regressors compared on the same preprocessing and train/test split.
# The forest is seeded so its score is reproducible across reruns.
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Lasso Regression', Lasso(alpha=0.1))
]

for model_name, model in models:
    # Construct the full pipeline: shared preprocessing followed by the model.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)  # Swap the regression model here
    ])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Evaluate the model on the held-out data (R-squared for regressors)
    score = pipeline.score(X_test, y_test)
    print(f"{model_name} - R-squared on test data:", score)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Needed for the base learners of the stacking/voting ensembles below.
from sklearn.linear_model import LinearRegression

# StackingRegressor and VotingRegressor cannot be built without an `estimators`
# list of (name, estimator) pairs -- calling them with no arguments raises
# TypeError before any model is trained. Each ensemble gets its own fresh pair
# of base learners (a linear model and a single tree). Stochastic estimators
# are seeded so the printed scores are reproducible.
models = [
    ('DecisionTree Regression', DecisionTreeRegressor(random_state=42)),
    ('AdaBoost Regression', AdaBoostRegressor(random_state=42)),
    ('Stacking Regression', StackingRegressor(estimators=[
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(random_state=42)),
    ])),
    ('Voting Regression', VotingRegressor(estimators=[
        ('lr', LinearRegression()),
        ('dt', DecisionTreeRegressor(random_state=42)),
    ])),
    ('ExtraTrees Regression', ExtraTreesRegressor(random_state=42)),
]

for model_name, model in models:
    # Construct the full pipeline: shared preprocessing followed by the model.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)  # Swap the regression model here
    ])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Evaluate the model on the held-out data (R-squared for regressors)
    score = pipeline.score(X_test, y_test)
    print(f"{model_name} - R-squared on test data:", score)

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor

# Three different base learners combined by the voting ensemble.
r1 = LinearRegression()
r2 = RandomForestRegressor(n_estimators=10, random_state=1)
r3 = KNeighborsRegressor()
vote = VotingRegressor(estimators=[('lr', r1), ('rf', r2), ('r3', r3)])

# Shared preprocessing first, the ensemble as the final step.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', vote),
    ]
)
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
score = pipeline.score(X_test, y_test)
print("VotingRegressor - R-squared on test data:", score)

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor

# Level-0 learners; a small random forest is the final estimator on top of them.
estimators = [
    ('lr', RidgeCV()),
    ('svr', LinearSVR(dual="auto", random_state=42)),
]
stack = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
)

# Shared preprocessing first, the stacked ensemble as the final step.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', stack),
    ]
)
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split.
score = pipeline.score(X_test, y_test)
print("StackingRegressor - R-squared on test data:", score)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter grid; the 'rf__' prefix addresses the 'rf' step of the pipeline.
param_grid = {
    'rf__n_estimators': [50, 100, 150],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10, 25],
}

# Seeded random forest placed behind the shared preprocessing step.
random_forrest = RandomForestRegressor(criterion='squared_error', random_state=42)
rf_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('rf', random_forrest),
    ]
)

# 5-fold grid search over the random forest pipeline, scored by negated MSE
# and run on all available cores.
grid_search = GridSearchCV(
    estimator=rf_pipe,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the best parameter combination and its score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score (neg_mean_squared_error):", best_score)