## Import Libraries and Configure Logging

In [1]:
import logging
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Put the parent of the current working directory on sys.path so that the
# `scripts` package imported below can be resolved from there.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
if parent_dir not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(parent_dir)

# Ignore warnings: this silences every warning category for the whole session.
import warnings
warnings.filterwarnings("ignore")

# Set up logging exactly once. A second basicConfig() call is a no-op while the
# root logger already has a handler, so the duplicated call was removed.
logging.basicConfig(
    filename='preprocessing.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

from scripts.Data_preprocessing_pipeline import DataPreprocessingPipeline

# NOTE(review): this second import rebinds DataPreprocessingPipeline to the
# class from `data_pipeline`, shadowing the `scripts` import above. It is kept
# to preserve the existing behaviour; confirm which module is the intended
# source and drop the other import.
from data_pipeline import DataPreprocessingPipeline

## Load Datasets

In [2]:
# Read the three raw CSV inputs (train, test, store) in a single pass.
train_df, test_df, store_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv', '../data/store.csv')
)


## Initialize the Pipeline

In [3]:
# Build the preprocessing pipeline from the three frames loaded above.
pipeline = DataPreprocessingPipeline(
    train_df,
    test_df,
    store_df,
)

## Run the Pipeline

In [4]:
# Run the preprocessing stages one at a time. Each stage returns a
# (train, test) pair, and every stage after the first receives the pair
# produced by the stage before it, so all intermediates stay inspectable.

# Stage 1: merge store data. No frames are passed in, so this presumably works
# on the frames handed to the pipeline constructor (verify in the class).
train_merged, test_merged = pipeline.merge_store_data()

# Stage 2: add date features to the merged pair.
train_with_dates, test_with_dates = pipeline.add_date_features(train_merged, test_merged)

# Stage 3: handle missing data. No strategy arguments are passed here, so
# whatever defaults the method defines apply.
train_cleaned, test_cleaned = pipeline.handle_missing_data(train_with_dates, test_with_dates)

# Stage 4: encode categorical values.
train_encoded, test_encoded = pipeline.encode_categorical_values(train_cleaned, test_cleaned)

# Stage 5: detect and handle outliers. The result is bound to
# train_final / test_final, which a later cell rebinds via run_pipeline().
train_final, test_final = pipeline.detect_outliers(train_encoded, test_encoded)


In [7]:
# Column names present in both the training frame and the store frame.
common_columns = set(train_df.columns) & set(store_df.columns)
print("Common Columns:", common_columns)


Common Columns: {'Store'}


In [8]:
# Store-level columns to strip before the manual merge below; names that are
# absent from the frame are skipped rather than raising (errors='ignore').
store_columns_to_drop = [
    'CompetitionDistance',
    'Assortment',
    'StoreType',
    'Promo2',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
]
store_df = store_df.drop(columns=store_columns_to_drop, errors='ignore')


In [9]:
# Prefix store-side columns that also exist in the training frame so they
# cannot collide when the two frames are merged.
# Fixes two problems in the original line:
#   * the result was assigned to the misspelled name `sstore_df`, so the
#     rename was silently discarded;
#   * the join key 'Store' is itself a common column, and renaming it would
#     make the subsequent `on='Store'` merge fail, so it is excluded here.
store_df = store_df.rename(
    columns=lambda col: f"store_{col}" if col in common_columns and col != 'Store' else col
)


In [10]:
# Left-join the store attributes onto the training rows by store id.
# Note: this rebinds train_df, so re-running the cell merges again.
train_df = train_df.merge(
    store_df,
    how='left',
    on='Store',
    suffixes=('_train', '_store'),
)


In [None]:
# Run the whole pipeline in one call with explicit strategies; this rebinds
# train_final / test_final.
pipeline_options = {
    'missing_num_strategy': 'mean',
    'missing_cat_strategy': 'mode',
    'outlier_method': 'zscore',
}
train_final, test_final = pipeline.run_pipeline(**pipeline_options)


In [12]:
# Save the processed data
train_final.to_csv('train_preprocessed.csv', index=False)
test_final.to_csv('test_preprocessed.csv', index=False)


In [13]:
# Read and print the log file
with open('preprocessing.log', 'r') as log_file:
    print(log_file.read())


2025-01-03 21:15:19,139 - INFO - Merging store data with train and test datasets.
2025-01-03 21:15:19,358 - INFO - Adding date features.
2025-01-03 21:15:20,013 - INFO - Handling missing data
2025-01-03 21:15:20,232 - INFO - Missing values per column:
Store                        0.000000
DayOfWeek                    0.000000
Date                         0.000000
Sales                        0.000000
Customers                    0.000000
Open                         0.000000
Promo                        0.000000
SchoolHoliday                0.000000
StoreType                    0.000000
Assortment                   0.000000
CompetitionDistance          0.002597
CompetitionOpenSinceMonth    0.317878
CompetitionOpenSinceYear     0.317878
Promo2                       0.000000
Promo2SinceWeek              0.499436
Promo2SinceYear              0.499436
PromoInterval                0.499436
Weekday                      0.000000
IsWeekend                    0.000000
Day                       