### data/preprocessing.py

### utils/logger.py

In [2]:
import logging

def setup_logging():
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO
    )


### data/preprocessing.py

In [3]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

class FillMissingValues(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.median_competition_distance = X['competition_distance'].median()
        self.mode_competition_open_since_year = X['competition_open_since_year'].mode()[0]
        self.mode_competition_open_since_month = X['competition_open_since_month'].mode()[0]
        return self

    def transform(self, X, y=None):
        X = X.copy()
        X['competition_distance'].fillna(self.median_competition_distance, inplace=True)
        X['competition_open_since_year'].fillna(self.mode_competition_open_since_year, inplace=True)
        X['competition_open_since_month'].fillna(self.mode_competition_open_since_month, inplace=True)
        X['promo2_since_week'].fillna(0, inplace=True)
        X['promo2_since_year'].fillna(0, inplace=True)
        X['promo_interval'].fillna(0, inplace=True)
        return X

class EncodeScaleData(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.assortment_dict = {'a': 1, 'b': 2, 'c': 3}
        self.minmax_scaler = MinMaxScaler()
        self.standard_scaler = StandardScaler()
        self.ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
        self.column_transformer = None

    def fit(self, X, y=None):
        X = X.copy()
        # Fit OneHotEncoder and LabelEncoder
        self.ohe.fit(X[['state_holiday']])
        self.label_encoder.fit(X['store_type'])
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Apply OneHotEncoder to state_holiday
        state_holiday_encoded = self.ohe.transform(X[['state_holiday']])
        state_holiday_encoded_df = pd.DataFrame(state_holiday_encoded, columns=self.ohe.get_feature_names_out(['state_holiday']))

        # Apply LabelEncoder to store_type
        X['store_type'] = self.label_encoder.transform(X['store_type'])
        X['assortment'] = X['assortment'].map(self.assortment_dict)

        # Scaling numerical features
        X[['year', 'promo2_since']] = self.minmax_scaler.fit_transform(X[['year', 'promo2_since']])
        X[['competition_distance', 'competition_open']] = self.minmax_scaler.fit_transform(X[['competition_distance', 'competition_open']])
        if 'customers' in X.columns:
            X[['customers']] = self.standard_scaler.fit_transform(X[['customers']]

        # Drop original state_holiday column and concatenate encoded dataframe
        X = X.drop(columns=['state_holiday'])
        X = pd.concat([X.reset_index(drop=True), state_holiday_encoded_df.reset_index(drop=True)], axis=1)

        return X


### feature_engineering/feature_engineering.py

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class FeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        months = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                  7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
        X['is_promo'] = X.apply(lambda x: 0 if x['promo_interval'] == 0 else 1 if months[x['date'].month] in x['promo_interval'].split(",") else 0, axis=1)
        X['day'] = X['date'].dt.day
        X['month'] = X['date'].dt.month  # Ensure month column is created
        X['year'] = X['date'].dt.year
        X['week_of_year'] = X['date'].dt.isocalendar().week
        X['competition_open'] = (X['year'] - X['competition_open_since_year']) * 12 - X['competition_open_since_month'] + X['month']
        X['competition_open'] = X['competition_open'].apply(lambda x: 0 if x < 0 else x)
        promo2_conditional_difference = np.where(X['promo2_since_year'] != 0, X['year'] - X['promo2_since_year'], 0)
        X['promo2_since'] = (promo2_conditional_difference) * 52 + X['week_of_year'] - X['promo2_since_week']
        X['promo2_since'] = X['promo2_since'].apply(lambda x: max(x, 0))
        X['state_holiday'] = X['state_holiday'].astype(str)
        X['store_type'] = X['store_type'].astype(str)
        X['assortment'] = X['assortment'].astype(str)

        return X

    def get_feature_names_out(self, input_features=None):
        return self.columns_.tolist()


class CyclicalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, columns, max_values):
        self.columns = columns
        self.max_values = max_values

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        for column, max_value in zip(self.columns, self.max_values):
            X[column + '_sin'] = np.sin(2 * np.pi * X[column] / max_value)
            X[column + '_cos'] = np.cos(2 * np.pi * X[column] / max_value)
        return X.drop(columns=self.columns)


### feature_engineering/drop_columns.py

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        return X.drop(columns=self.columns)


### feature_engineering/transformations.py

In [6]:
# feature_engineering/transformations.py
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class ApplyTransformations(BaseEstimator, TransformerMixin):
    def __init__(self, small_constant=1):
        self.small_constant = small_constant

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        
        #apply sqrt transformation on customers    
        if 'customers' in X.columns:
            X['customers'] = np.sqrt(X['customers'])

        # Apply log1p transformation to 'competition_distance'
        if 'competition_distance' in X.columns:
            X['competition_distance'] = np.log1p(X['competition_distance'])

        # Apply log transformation with a small constant to handle zeros in 'competition_open' and 'promo2_since'
        if 'competition_open' in X.columns:
            X['competition_open'] = np.log1p(X['competition_open'] + self.small_constant)
        if 'promo2_since' in X.columns:
            X['promo2_since'] = np.log1p(X['promo2_since'] + self.small_constant)

        return X


### model/target_transformer.py     NOT REQUIRED

In [7]:
# model/target_transformer.py
# from sklearn.base import BaseEstimator, TransformerMixin
# import numpy as np

# class TargetTransformer(BaseEstimator, TransformerMixin):
#     def fit(self, y):
#         return self

#     def transform(self, y):
#         return np.sqrt(y)

#     def inverse_transform(self, y):
#         return np.square(y)


### model/vif_selector.py

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np

class VIFSelector(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=5):
        self.threshold = threshold
        self.columns_ = None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        
        print("Initial number of columns:", X.shape[1])  # Debug print
        
        high_vif = True
        while high_vif:
            vif_values = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
            max_vif = max(vif_values)
            print("Current max VIF:", max_vif)  # Debug print
            if max_vif > self.threshold:
                max_vif_index = vif_values.index(max_vif)
                column_to_drop = X.columns[max_vif_index]
                print("Dropping column:", column_to_drop)  # Debug print
                X = X.drop(columns=[column_to_drop])
            else:
                high_vif = False
        self.columns_ = X.columns
        print("Final number of columns:", X.shape[1])  # Debug print
        return X

    def get_feature_names_out(self, input_features=None):
        return self.columns_.tolist()


### model/select_k_best.py

In [9]:
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.feature_selection import SelectKBest, mutual_info_regression
# import pandas as pd

# class SelectKBestFeatures(BaseEstimator, TransformerMixin):
#     def __init__(self, k='all'):
#         self.k = k
#         self.selector = SelectKBest(score_func=mutual_info_regression, k=k)

#     def fit(self, X, y):
#         self.selector.fit(X, y)
#         self.columns_ = X.columns[self.selector.get_support()]
#         return self

#     def transform(self, X, y=None):
#         return X[self.columns_]
    
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import mutual_info_regression, SelectKBest
import pandas as pd

class SelectKBestFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, k='all', threshold=0.005):
        self.k = k
        self.threshold = threshold
        self.feature_scores_ = None  # Use trailing underscore to follow sklearn convention

    def fit(self, X, y):
        self.selector = SelectKBest(score_func=mutual_info_regression, k='all')
        self.selector.fit(X, y)
        
        # Get the scores and identify features with low mutual information
        scores = self.selector.scores_
        self.feature_scores_ = pd.DataFrame({'feature': X.columns, 'score': scores})
        self.columns_ = self.feature_scores_[self.feature_scores_['score'] > self.threshold]['feature'].values
        print("Select K best features done")
        return self

    def transform(self, X, y=None):
        return X[self.columns_]

    def get_feature_names_out(self, input_features=None):
        return self.columns_

    def get_feature_scores(self):
        return self.feature_scores_


### app/pipeline.py

In [14]:
# app/pipeline.py
from sklearn.pipeline import Pipeline
# from data.preprocessing import FillMissingValues, EncodeScaleData
# from feature_engineering.feature_engineering import FeatureEngineering, CyclicalFeatures
# from feature_engineering.transformations import ApplyTransformations  # Import the new transformer
# from feature_engineering.drop_columns import DropColumns
# from model.vif_selector import VIFSelector
# from model.select_k_best import SelectKBestFeatures
# from model.inverse_transformations import InverseSalesTransformation  # Import the inverse transformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import TransformedTargetRegressor

def sqrt_transform_with_constant(y, constant=10):
    return np.sqrt(y + constant)

def inverse_sqrt_transform_with_constant(y, constant=10):
    return np.square(y) - constant

class ConvertToFloat64(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.astype('float64')

def create_pipeline():
    preprocessing_pipeline = Pipeline(steps=[
        ('fill_missing', FillMissingValues()),
        ('feature_engineering', FeatureEngineering()),
        ('cyclical_features', CyclicalFeatures(columns=['month', 'day_of_week', 'day', 'week_of_year'], max_values=[12, 7, 31, 52])),
        ('apply_transformations', ApplyTransformations()),  # Add the new transformer
        ('encode_scale', EncodeScaleData())
    ])
    
    vif_pipeline = Pipeline(steps=[
        ('drop_columns', DropColumns(columns=['date', 'competition_open_since_month', 'competition_open_since_year', 'promo2_since_week', 
                                              'promo2_since_year', 'promo_interval'])),
        ('convert_to_float64', ConvertToFloat64()),
        ('vif', VIFSelector(threshold=5))
    ])
    
    model_pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing_pipeline),
        ('vif', vif_pipeline),
        ('select_k_best', SelectKBestFeatures(k='all', threshold=0.005)),
        ('model',  RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
    ])
    
    final_pipeline = TransformedTargetRegressor(
        regressor=model_pipeline,
        func=sqrt_transform_with_constant,
        inverse_func=inverse_sqrt_transform_with_constant
    )
    
    return final_pipeline

### model/train.py

In [15]:
# import os
# import pandas as pd
# import pickle
# from datetime import datetime
# # from app.pipeline import create_pipeline

# # Function to get the CSV file path from the input directory
# def get_csv_path(input_dir='input'):
#     for file_name in os.listdir(input_dir):
#         if file_name.endswith('sales_data.csv'):
#             return os.path.join(input_dir, file_name)
#     raise FileNotFoundError("No CSV file found in the input directory.")

# # Custom date parser that tries multiple formats
# # def custom_date_parser(x):
# #     for fmt in ('%d-%m-%Y', '%Y-%m-%d'):
# #         try:
# #             return datetime.strptime(x, fmt)
# #         except ValueError:
# #             pass
# #     raise ValueError(f"no valid date format found for {x}")

# # # Function to apply the custom date parser
# # def parse_dates(date_series):
# #     return date_series.apply(custom_date_parser)

# # Specify dtypes to avoid DtypeWarning
# dtype_spec = {
#     'store': int,
#     'day_of_week': int,
#     'sales': float,
#     'customers': int,
#     'open': int,
#     'promo': int,
#     'state_holiday': str,
#     'school_holiday': int,
#     'store_type': str,
#     'assortment': str,
#     'competition_distance': float,
#     'competition_open_since_month': float,
#     'competition_open_since_year': float,
#     'promo2': int,
#     'promo2_since_week': float,
#     'promo2_since_year': float,
#     'promo_interval': str
# }

# # Train and save the pipeline
# csv_path = get_csv_path()  # Dynamically get the CSV path
# df = pd.read_csv(csv_path, dtype=dtype_spec, low_memory=False, parse_dates=['date'])

# # # Convert the date column using the custom date parser
# # df['date'] = parse_dates(df['date'])

# X = df.drop(columns=['sales'])
# y = df['sales']

### utils/utils.py

In [16]:
import pandas as pd
import numpy as np
from datetime import datetime

def get_csv_path(input_dir='input', filename='sales_data.csv'):
    import os
    for file_name in os.listdir(input_dir):
        if file_name.endswith(filename):
            return os.path.join(input_dir, file_name)
    raise FileNotFoundError(f"No CSV file found in the input directory: {input_dir}")

def custom_date_parser(x):
    for fmt in ('%d-%m-%Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(x, fmt)
        except ValueError:
            pass
    raise ValueError(f"No valid date format found for {x}")

def parse_dates(date_series):
    return date_series.apply(custom_date_parser)

def save_model(pipeline, filename='model_pipeline.pkl'):
    import pickle
    with open(filename, 'wb') as f:
        pickle.dump(pipeline, f)

def load_model(filename='model_pipeline.pkl'):
    import pickle
    with open(filename, 'rb') as f:
        return pickle.load(f)

### scripts/run_training.py

In [17]:
import pandas as pd
from utils.utils import get_csv_path, save_model
from app.pipeline import create_pipeline
from utils.logger import setup_logging
import logging
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def run_training():
    setup_logging()
    logging.info("Starting training process")


        # Load the data
    csv_path = get_csv_path()
    dtype_spec = {
        'store': int,
        'day_of_week': int,
        'sales': float,
        'customers': int,
        'open': int,
        'promo': int,
        'state_holiday': str,
        'school_holiday': int,
        'store_type': str,
        'assortment': str,
        'competition_distance': float,
        'competition_open_since_month': float,
        'competition_open_since_year': float,
        'promo2': int,
        'promo2_since_week': float,
        'promo2_since_year': float,
        'promo_interval': str
    }
    df = pd.read_csv(csv_path, dtype=dtype_spec, low_memory=False, parse_dates=['date'])

    X = df.drop(columns=['sales'])
    y = df['sales']

    logging.info("Creating pipeline")
    pipeline = create_pipeline()

    logging.info("Fitting pipeline")
    pipeline.fit(X, y)
    logging.info("Pipeline fitted successfully")

    logging.info("Saving the trained model")
    save_model(pipeline)
    logging.info("Model saved successfully")

2024-07-11 00:46:23,244 - root - INFO - Starting training process
2024-07-11 00:46:25,732 - root - INFO - Creating pipeline
2024-07-11 00:46:25,733 - root - INFO - Fitting pipeline


Initial number of columns: 25
Current max VIF: 45.34305812996274
Dropping column: week_of_year_cos
Current max VIF: 42.69405553988109
Dropping column: state_holiday_0
Current max VIF: 40.23264243624885
Dropping column: week_of_year_sin
Current max VIF: 17.62879154609724
Dropping column: open
Current max VIF: 7.360765662392768
Dropping column: competition_distance
Current max VIF: 6.211694904522429
Dropping column: promo2_since
Current max VIF: 3.717989604629969
Final number of columns: 19
Select K best features done


2024-07-11 01:07:10,087 - root - INFO - Pipeline fitted successfully
2024-07-11 01:07:10,136 - root - INFO - Saving the trained model
2024-07-11 01:10:59,992 - root - INFO - Model saved successfully


In [29]:
logging.info("Making predictions")
y_pred = pipeline.predict(X)

# Post-prediction adjustment to ensure sales are never less than zero
y_pred_adjusted = np.maximum(y_pred, 0)

# Include both X and y_pred in the same DataFrame
predictions_df = X.copy()
predictions_df['predicted_sales'] = y_pred_adjusted
predictions_df['actual_sales'] = y

#defining adjusted r2
def calculate_adjusted_r2(r2, n, k):
    return 1 - (1 - r2) * ((n - 1) / (n - k - 1))

# Calculate metrics
logging.info("Calculating performance metrics")
r2 = r2_score(y, y_pred_adjusted)
adjusted_r2 = calculate_adjusted_r2(r2, X.shape[0], X.shape[1])
mae = mean_absolute_error(y, y_pred_adjusted)
rmse = np.sqrt(mean_squared_error(y, y_pred_adjusted))

metrics = {
    'R^2': r2,
    'Adjusted R^2': adjusted_r2,
    'MAE': mae,
    'RMSE': rmse
}

logging.info("Model performance metrics calculated")
for metric, value in metrics.items():
    logging.info(f"{metric}: {value}")

# Optionally, save metrics to a file
logging.info("Saving performance metrics to file")
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv('model_performance_metrics.csv', index=False)
logging.info("Performance metrics saved successfully")

#     return predictions_df

# if __name__ == "__main__":
#     predictions_df = run_training()
#     print(predictions_df.head())


2024-07-11 09:54:30,802 - root - INFO - Making predictions


Initial number of columns: 25
Current max VIF: 45.34305812996274
Dropping column: week_of_year_cos
Current max VIF: 42.69405553988109
Dropping column: state_holiday_0
Current max VIF: 40.23264243624885
Dropping column: week_of_year_sin
Current max VIF: 17.62879154609724
Dropping column: open
Current max VIF: 7.360765662392768
Dropping column: competition_distance
Current max VIF: 6.211694904522429
Dropping column: promo2_since
Current max VIF: 3.717989604629969
Final number of columns: 19


2024-07-11 10:05:12,123 - root - INFO - Calculating performance metrics
2024-07-11 10:05:12,281 - root - INFO - Model performance metrics calculated
2024-07-11 10:05:12,286 - root - INFO - R^2: 0.9975497156681291
2024-07-11 10:05:12,288 - root - INFO - Adjusted R^2: 0.9975496747172815
2024-07-11 10:05:12,289 - root - INFO - MAE: 110.7221579619695
2024-07-11 10:05:12,291 - root - INFO - RMSE: 190.5725870978322
2024-07-11 10:05:12,292 - root - INFO - Saving performance metrics to file
2024-07-11 10:05:12,417 - root - INFO - Performance metrics saved successfully


In [31]:
predictions_df['y_pred'] = y_pred

In [35]:
predictions_df[(predictions_df['predicted_sales'] < 0)]

Unnamed: 0,store,day_of_week,date,customers,open,promo,state_holiday,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval,predicted_sales,actual_sales,y_pred


In [21]:
pipeline.named_steps

AttributeError: 'TransformedTargetRegressor' object has no attribute 'named_steps'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the correlation matrix
correlation_matrix = predictions_df.corr()

# Plot the heatmap
plt.figure(figsize=(15, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, linecolor='black')
plt.title('Correlation Heatmap of Selected Features')
plt.show()

In [39]:
test = pd.read_csv("E://Learnbay Project/salesprediction/input/test.csv")
store = pd.read_csv("E://Learnbay Project/salesprediction/input/store.csv")
test_data = pd.merge(left= test, right= store, how= "left", on= "Store")
test_data.head(5)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,2,3,4,2015-09-17,1.0,1,0,0,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
2,3,7,4,2015-09-17,1.0,1,0,0,a,c,24000.0,4.0,2013.0,0,,,
3,4,8,4,2015-09-17,1.0,1,0,0,a,a,7520.0,10.0,2014.0,0,,,
4,5,9,4,2015-09-17,1.0,1,0,0,a,c,2030.0,8.0,2000.0,0,,,


In [40]:
import inflection

snakecase = lambda x : inflection.underscore(x)
new_columns = list(map(snakecase, test_data.columns))
print("The snake_case columns after renaming will be:  \n \n", new_columns)

#Renaming the columns in snake case
test_data.columns = new_columns

The snake_case columns after renaming will be:  
 
 ['id', 'store', 'day_of_week', 'date', 'open', 'promo', 'state_holiday', 'school_holiday', 'store_type', 'assortment', 'competition_distance', 'competition_open_since_month', 'competition_open_since_year', 'promo2', 'promo2_since_week', 'promo2_since_year', 'promo_interval']


In [44]:
test_data.drop(['id'],axis =1, inplace=True)

In [45]:
test_data.head(5)

Unnamed: 0,store,day_of_week,date,open,promo,state_holiday,school_holiday,store_type,assortment,competition_distance,competition_open_since_month,competition_open_since_year,promo2,promo2_since_week,promo2_since_year,promo_interval
0,1,4,2015-09-17,1.0,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,3,4,2015-09-17,1.0,1,0,0,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
2,7,4,2015-09-17,1.0,1,0,0,a,c,24000.0,4.0,2013.0,0,,,
3,8,4,2015-09-17,1.0,1,0,0,a,a,7520.0,10.0,2014.0,0,,,
4,9,4,2015-09-17,1.0,1,0,0,a,c,2030.0,8.0,2000.0,0,,,


In [46]:
test_data.to_csv("E://Learnbay Project/salesprediction/input/test_data.csv", index = False)