In [1]:
# BASE
# ------------------------------------------------------
import pandas as pd  
import numpy as np

In [2]:
model_train = pd.read_csv('model_train.csv')
model_test = pd.read_csv('model_test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [3]:
dtype_dict = {'store_nbr': object, 'cluster': object, 'month': object, 'day_of_month': object, 'day_of_year': object, 'week_of_year': object, 'day_of_week': object, 'year': object, 'is_wknd': object, }
model_train = model_train.astype(dtype_dict)
model_test = model_test.astype(dtype_dict)

In [4]:
model_test.isna().sum()

store_nbr       0
family          0
onpromotion     0
city            0
state           0
shop_type       0
cluster         0
oil_price       0
transactions    0
holiday_type    0
population      0
month           0
day_of_month    0
day_of_year     0
week_of_year    0
day_of_week     0
year            0
is_wknd         0
dtype: int64

# Prepare for modeling

In [5]:
def preparing_data_to_model(data, random_state=None):   
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import OneHotEncoder
    # Split data
    target_col = 'sales'
    X = data.drop(target_col, axis=1)
    y = data[target_col]
    
    # Numerical-Categorical Split
    model_train_num = X.select_dtypes(include = np.number)
    model_train_cat = X.select_dtypes(object)
    
    # Scaling numerical data
    transformer = MinMaxScaler().fit(model_train_num)
    model_train_scaled1 = pd.DataFrame(transformer.transform(model_train_num),columns=model_train_num.columns)

    # Encoding categorical data
    encoder = OneHotEncoder(drop='first').fit(model_train_cat)
    column_names = encoder.get_feature_names_out(model_train_cat.columns)
    model_train_encoded = encoder.transform(model_train_cat).toarray()
    encoded = pd.DataFrame(model_train_encoded, columns =column_names)

    # Concatenate data
    model_train_scaled = pd.concat([model_train_scaled1,encoded],axis=1)
    
    print(model_train_scaled.shape, y.shape, transformer, encoder)
    
    return model_train_scaled, y, transformer, encoder

In [6]:
#Fit a logistic regression model on the training data.
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
model = DecisionTreeRegressor()

In [7]:
def store_regression(data, model):
    # Split the data by store_id
    store_data = {}
    for store_id in data['store_nbr'].unique():
        store_data[store_id] = data[data['store_nbr'] == store_id]
    
    # Perform linear regression for each store
    regression_results = {}
    for store_id, store_df in store_data.items():
        X = store_df.drop('sales', axis=1)
        y = store_df['sales']
        
        # Prepare the data for modeling
        X_scaled, y, transformer, encoder = preparing_data_to_model(pd.concat([X, y], axis=1))
        
        # Fit the model
        model.fit(X_scaled, y)
        
        # Calculate RMSE 
        model_pred = model.predict(X_scaled)
        model_rmse = np.sqrt(mean_squared_error(y, model_pred))
        
        # Store the regression results
        regression_results[store_id] = {'model':model,
            'score': model.score(X_scaled, y),
            'RMSE': model_rmse,
            'Predicted':np.round(model_pred[:10],decimals = 1),
            'Real': y[:10],
            'Transformer':transformer,
            'Encoder':encoder                          
        }
    
    return regression_results

In [8]:
results = store_regression(model_train, model)

(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) MinMaxScaler() OneHotEncoder(drop='first')
(55572, 510) (55572,) Min

In [9]:
results

{1: {'model': DecisionTreeRegressor(),
  'score': 1.0,
  'RMSE': 0.0,
  'Predicted': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'Real': 0    0.0
  1    0.0
  2    0.0
  3    0.0
  4    0.0
  5    0.0
  6    0.0
  7    0.0
  8    0.0
  9    0.0
  Name: sales, dtype: float64,
  'Transformer': MinMaxScaler(),
  'Encoder': OneHotEncoder(drop='first')},
 10: {'model': DecisionTreeRegressor(),
  'score': 1.0,
  'RMSE': 0.0,
  'Predicted': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'Real': 33    0.0
  34    0.0
  35    0.0
  36    0.0
  37    0.0
  38    0.0
  39    0.0
  40    0.0
  41    0.0
  42    0.0
  Name: sales, dtype: float64,
  'Transformer': MinMaxScaler(),
  'Encoder': OneHotEncoder(drop='first')},
 11: {'model': DecisionTreeRegressor(),
  'score': 1.0,
  'RMSE': 0.0,
  'Predicted': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'Real': 66    0.0
  67    0.0
  68    0.0
  69    0.0
  70    0.0
  71    0.0
  72    0.0
  73    0.0
  74    0.0
  75    0.0
  Name: 

In [10]:
def make_predict(row):
    store_nbr = row['store_nbr']
    transformer = results[store_nbr]['Transformer']
    encoder = results[store_nbr]['Encoder']
    model = results[store_nbr]['model']
    row = pd.DataFrame(row).T
    row=row.astype({'onpromotion':int,'population':int, 'oil_price': float, 'transactions': float})

    # Numerical-Categorical Split
    model_train_num = row.select_dtypes(include = np.number)
    model_train_cat = row.select_dtypes(object)
    
    # Scaling numerical data
    model_train_scaled1 = pd.DataFrame(transformer.transform(model_train_num),columns=model_train_num.columns)

    # Encoding categorical data
    column_names = encoder.get_feature_names_out(model_train_cat.columns)
    model_train_encoded = encoder.transform(model_train_cat).toarray()
    encoded = pd.DataFrame(model_train_encoded, columns =column_names)

    # Concatenate data
    model_train_scaled = pd.concat([model_train_scaled1,encoded],axis=1)
    sales = model.predict(model_train_scaled)
    
    return sales[0]

In [11]:
predictions = model_test.apply(make_predict, axis=1)

In [12]:
predictions = pd.DataFrame(predictions).reset_index()

In [13]:
sample_submission1 = pd.concat ([sample_submission,predictions], axis=1)

Unnamed: 0,id,sales,index,0
0,3000888,0.0,0,4.000
1,3000889,0.0,1,0.000
2,3000890,0.0,2,8.000
3,3000891,0.0,3,2367.000
4,3000892,0.0,4,0.000
...,...,...,...,...
28507,3029395,0.0,28507,243.917
28508,3029396,0.0,28508,0.000
28509,3029397,0.0,28509,1017.927
28510,3029398,0.0,28510,22.000


In [15]:
sample_submission1=sample_submission1.drop(['sales','index'], axis=1)

In [16]:
sample_submission1 = sample_submission1.rename(columns={0: 'sales'})

In [17]:
sample_submission1.loc[sample_submission1['sales'] < 0, 'sales'] = 0

In [18]:
sample_submission1.to_csv("sample_submission1.csv", index=False)

In [19]:
sample_submission1

Unnamed: 0,id,sales
0,3000888,4.000
1,3000889,0.000
2,3000890,8.000
3,3000891,2367.000
4,3000892,0.000
...,...,...
28507,3029395,243.917
28508,3029396,0.000
28509,3029397,1017.927
28510,3029398,22.000
