In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import logging


## Set up Logging

In [2]:
# Set up the basic configuration for logging
logging.basicConfig(
    level=logging.INFO,  # Set the threshold for the logger to the INFO level
    format='%(asctime)s - %(levelname)s - %(message)s'  # Define the format of the log messages
)
logger = logging.getLogger(__name__)  # Create a logger object with the name of the current module

### Load your data

In [3]:
logger.info("Loading train_data...")
train_data = pd.read_csv('C:\\Users\\King Betse\\Downloads\\rossmann-store-sales\\train.csv')
logger.info("Loaded train_data successfully.")
        
logger.info("Loading test_data...")
test_data = pd.read_csv('C:\\Users\\King Betse\\Downloads\\rossmann-store-sales\\test.csv')
logger.info("Loaded test_data successfully.")
        
logger.info("Loading sample_submission...")
sample_submission = pd.read_csv('C:\\Users\\King Betse\\Downloads\\rossmann-store-sales\\sample_submission.csv')
logger.info("Loaded sample_submission successfully.")
        
logger.info("Loading store...")
store = pd.read_csv('C:\\Users\\King Betse\\Downloads\\rossmann-store-sales\\store.csv')
logger.info("Loaded store successfully.")


2024-09-22 23:44:36,418 - INFO - Loading train_data...
  train_data = pd.read_csv('C:\\Users\\King Betse\\Downloads\\rossmann-store-sales\\train.csv')
2024-09-22 23:44:37,011 - INFO - Loaded train_data successfully.
2024-09-22 23:44:37,012 - INFO - Loading test_data...
2024-09-22 23:44:37,060 - INFO - Loaded test_data successfully.
2024-09-22 23:44:37,062 - INFO - Loading sample_submission...
2024-09-22 23:44:37,090 - INFO - Loaded sample_submission successfully.
2024-09-22 23:44:37,091 - INFO - Loading store...
2024-09-22 23:44:37,106 - INFO - Loaded store successfully.


## Preprocessing

### Handle Missing Values

In [4]:
train_data.fillna(method='ffill', inplace=True)  # Forward fill as an example

  train_data.fillna(method='ffill', inplace=True)  # Forward fill as an example


### Convert Date Column

In [7]:
train_data['Date'] = pd.to_datetime(train_data['Date'])
train_data.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


###  Feature Extraction

In [12]:
train_data['Weekday'] = train_data['Date'].dt.weekday  # 0-6 (Monday-Sunday)
train_data['Weekend'] = (train_data['Weekday'] >= 5).astype(int)  # 1 if Weekend, else 0

# Define known holidays 
holiday_dates = pd.to_datetime(['2013-01-01', '2014-01-01', '2015-01-01',
                                 '2013-12-25', '2014-12-25', '2015-12-25'])

# Function to calculate days to next holiday
def days_to_next_holiday(date):
    next_holiday = holiday_dates[holiday_dates > date]
    return (next_holiday.min() - date).days if not next_holiday.empty else float('inf')

# Function to calculate days since last holiday
def days_since_last_holiday(date):
    last_holiday = holiday_dates[holiday_dates < date]
    return (date - last_holiday.max()).days if not last_holiday.empty else float('inf')

# Calculate Days To Holiday and Days After Holiday
train_data['DaysToHoliday'] = train_data['Date'].apply(days_to_next_holiday)
train_data['DaysAfterHoliday'] = train_data['Date'].apply(days_since_last_holiday)
train_data['BeginningOfMonth'] = (train_data['Date'].dt.day == 1).astype(int)
train_data['MidMonth'] = ((train_data['Date'].dt.day > 10) & (train_data['Date'].dt.day <= 20)).astype(int)
train_data['EndOfMonth'] = (train_data['Date'].dt.day == train_data['Date'].dt.days_in_month).astype(int)

# Additional Feature Extraction
train_data['Month'] = train_data['Date'].dt.month  # Month as a number (1-12)

# Define seasons
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

train_data['Season'] = train_data['Month'].apply(get_season)  # Season as a categorical variable

train_data['IsHoliday'] = train_data['Date'].isin(holiday_dates).astype(int)  # Binary indicator for holidays






### Encode Categorical Data

In [13]:
train_data['Promo'] = train_data['Promo'].astype(int)  # Ensure Promo is numeric
train_data['StateHoliday'] = train_data['StateHoliday'].map({'a': 1, 'b': 1, 'c': 1, '0': 0})  # Map holidays to binary
train_data['SchoolHoliday'] = train_data['SchoolHoliday'].astype(int)  # Ensure SchoolHoliday is numeric

### Scale the Features

In [14]:
from sklearn.preprocessing import StandardScaler

# Scale the Features
scaler = StandardScaler()
features_to_scale = ['Sales', 'Customers', 'Open', 'Weekday', 'Weekend', 
                     'DaysToHoliday', 'DaysAfterHoliday', 
                     'BeginningOfMonth', 'MidMonth', 'EndOfMonth', 
                     'Promo', 'StateHoliday', 'SchoolHoliday', 
                     'Month']  # Include Month in scaling if needed
train_data[features_to_scale] = scaler.fit_transform(train_data[features_to_scale])

# Preview the processed training data
print(train_data.head())

KeyError: "['BeginningOfMonth'] not in index"