#IMPORTING THE HEADERS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

#LOAD THE TRAINING.CSV FILE

In [2]:
df_file = pd.read_csv("training.csv")

print("=== Training Data Preview ===")
print(df_file.head(), "\n")
print("=== Training Data Info ===")
print(df_file.info(), "\n")
print("=== Summary Statistics ===")
print(df_file.describe(), "\n")

df_file.dtypes


=== Training Data Preview ===
   Unnamed: 0  store_ID  day_of_week        date  nb_customers_on_day  open  \
0      425390       366            4  2013-04-18                  517     1   
1      291687       394            6  2015-04-11                  694     1   
2      411278       807            4  2013-08-29                  970     1   
3      664714       802            2  2013-05-28                  473     1   
4      540835       726            4  2013-10-10                 1068     1   

   promotion state_holiday  school_holiday  sales  
0          0             0               0   4422  
1          0             0               0   8297  
2          1             0               0   9729  
3          1             0               0   6513  
4          1             0               0  10882   

=== Training Data Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640840 entries, 0 to 640839
Data columns (total 10 columns):
 #   Column               Non-Null Count   

Unnamed: 0              int64
store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object

#FINDING THE NULLS AND MISSING VALUES

In [3]:
df_file.isna().sum()

Unnamed: 0             0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64

In [4]:
df_file.isnull().sum()

Unnamed: 0             0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64

In [5]:
#Clean and normalize data: 

# 1) Drop obvious junk columns (like "Unnamed: 0")
df_file = df_file.loc[:, ~df_file.columns.str.contains(r'^Unnamed')]

# 2) Parse dates
df_file['date'] = pd.to_datetime(df_file['date'], errors='coerce')

# Normalize text, handle weird values, then encode
sh = (df_file['state_holiday']
        .astype(str).str.strip().str.lower()
        .replace({'false':'0', 'none':'0', 'nan':'0'}))

# Option A: keep as a tidy categorical with known levels
df_file['state_holiday'] = pd.Categorical(sh, categories=['0','a','b','c'])

print(df_file.describe())



            store_ID    day_of_week                           date  \
count  640840.000000  640840.000000                         640840   
mean      558.211348       4.000189  2014-04-11 02:05:13.957931008   
min         1.000000       1.000000            2013-01-01 00:00:00   
25%       280.000000       2.000000            2013-08-17 00:00:00   
50%       558.000000       4.000000            2014-04-02 00:00:00   
75%       837.000000       6.000000            2014-12-12 00:00:00   
max      1115.000000       7.000000            2015-07-31 00:00:00   
std       321.878521       1.996478                            NaN   

       nb_customers_on_day           open      promotion  school_holiday  \
count        640840.000000  640840.000000  640840.000000   640840.000000   
mean            633.398577       0.830185       0.381718        0.178472   
min               0.000000       0.000000       0.000000        0.000000   
25%             405.000000       1.000000       0.000000        0

In [6]:

# Example: drop irrelevant columns if present

df_file.dropna(subset=["date"], inplace=True)
df_file.isna().sum()
#df_file.drop(columns=["store_ID"], inplace=True, errors='ignore')
#df_file.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')



store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64

# 2. Separate features and target

In [7]:
# Assuming the last column is the target variable 'sales'

X = df_file.drop("sales", axis=1)  # All columns except last
y = df_file["sales"]     # Last column as target



# Feature Scaling

In [8]:

scaler = StandardScaler()

# Select only numeric columns (excluding 'date')
X_numeric = X.drop(columns=['date', 'state_holiday'])

# Fit and transform the numeric data
X_scaled = scaler.fit_transform(X_numeric)

# create a DataFrame with the same column names
X_scaled_df = pd.DataFrame(X_scaled, columns=X_numeric.columns)

# If you still want to keep the date column, you can reattach it:
X_scaled_df = pd.concat([X[['date', 'state_holiday']].reset_index(drop=True), X_scaled_df], axis=1)


# Train/Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

#LinearRegression model creation

In [10]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


In [11]:
coefficients = model.coef_
intercept = model.intercept_

print("Intercept:", intercept)
print("\nCoefficients:", coefficients)

Intercept: 5778.2247951062445

Coefficients: [ -46.57158123 -111.95467981 2823.08862825  627.34964717  620.87969028
    6.77278655]


#Evaluation

In [12]:
from sklearn.metrics import mean_absolute_error
# Evaluate
print("Train/Test Split Evaluation:")
print("MSE:", mean_squared_error(y_test, y_pred))

print("MAE:", mean_absolute_error(y_test, y_pred))

print("R2:", r2_score(y_test, y_pred))

Train/Test Split Evaluation:
MSE: 2216130.38246623
MAE: 992.2926713242832
R2: 0.8500756894871744


'K-FOLD CROSS VALIDATION MODULE CHECK

In [13]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
mse_scores = -cross_val_score(model, X_scaled, y, cv=kf, scoring='neg_mean_squared_error')
r2_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='r2')

print("\nK-Fold Cross Validation (5 folds):")
print("Average MSE:", mse_scores.mean())
print("Average R2:", r2_scores.mean())

  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_



K-Fold Cross Validation (5 folds):
Average MSE: 2191760.3374021566
Average R2: 0.8522335896131737


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


#Validating with the REAL_DATA

In [34]:
df_real_file = pd.read_csv("REAL_DATA.csv")

df_real_file.dropna(subset=["date"], inplace=True)
df_real_file.isna().sum()

#Clean and normalize data: 

# 1) Drop obvious junk columns (like "Unnamed: 0")
df_real_file = df_real_file.loc[:, ~df_real_file.columns.str.contains(r'^Unnamed')]

# 2) Parse dates

print(df_real_file.isna().sum())

df_real_file['date'] = pd.to_datetime(df_real_file['date'], errors='coerce')
df_real_file.dropna(subset=["date"], inplace=True)
print(df_real_file.isna().sum())

# Normalize text, handle weird values, then encode
sh = (df_real_file['state_holiday']
        .astype(str).str.strip().str.lower()
        .replace({'false':'0', 'none':'0', 'nan':'0'}))

# Option A: keep as a tidy categorical with known levels
df_real_file['state_holiday'] = pd.Categorical(sh, categories=['0','a','b','c'])

print(df_real_file.describe())



index                  0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
dtype: int64
index                  0
store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
dtype: int64
               index      store_ID   day_of_week  \
count   28387.000000  28387.000000  28387.000000   
mean   355927.439145    559.510234      3.992990   
min        56.000000      1.000000      1.000000   
25%    177538.000000    282.000000      2.000000   
50%    355264.000000    558.000000      4.000000   
75%    535662.500000    839.500000      6.000000   
max    711965.000000   1115.000000      7.000000   
std    206016.278028    321.677601      1.999679   

                                date  nb_customers_on_d

In [44]:
X_real = df_real_file

In [48]:

scaler = StandardScaler()

# Select only numeric columns (excluding 'date')
X_real_numeric = X_real.drop(columns=['date', 'state_holiday'])

# Fit and transform the numeric data
X_real_scaled = scaler.fit_transform(X_real_numeric)

# create a DataFrame with the same column names
X_real_scaled_df = pd.DataFrame(X_real_scaled, columns=X_real_numeric.columns)

# If you still want to keep the date column, you can reattach it:
X_real_scaled_df = pd.concat([X_real[['date', 'state_holiday']].reset_index(drop=True), X_real_scaled_df], axis=1)

print(pd.DataFrame(X_real).describe())






               index      store_ID   day_of_week  \
count   28387.000000  28387.000000  28387.000000   
mean   355927.439145    559.510234      3.992990   
min        56.000000      1.000000      1.000000   
25%    177538.000000    282.000000      2.000000   
50%    355264.000000    558.000000      4.000000   
75%    535662.500000    839.500000      6.000000   
max    711965.000000   1115.000000      7.000000   
std    206016.278028    321.677601      1.999679   

                                date  nb_customers_on_day          open  \
count                          28387         28387.000000  28387.000000   
mean   2014-04-22 17:55:31.398175232           630.668933      0.817346   
min              2013-01-01 00:00:00             0.000000      0.000000   
25%              2013-08-08 00:00:00           394.000000      1.000000   
50%              2014-04-05 00:00:00           613.000000      1.000000   
75%              2014-12-06 00:00:00           844.000000      1.000000   
max   

In [None]:
df_real_file["sales"] = model.predict(X_real_scaled)


df_real_file.isna().sum()

ValueError: X has 7 features, but LinearRegression is expecting 6 features as input.