In [14]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

In [2]:
# Load the raw training table from the CSV at the relative path data/train.csv.
df_train = pd.read_csv('data/train.csv')

In [3]:
# Print the column list with dtypes and memory usage; show_counts=True asks for
# the non-null count of every column to be included.
df_train.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   id           3000888 non-null  int64  
 1   date         3000888 non-null  object 
 2   store_nbr    3000888 non-null  int64  
 3   family       3000888 non-null  object 
 4   sales        3000888 non-null  float64
 5   onpromotion  3000888 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB


In [65]:
class PreprocessingTranformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that parses the ``date`` column to datetime.

    Inheriting ``TransformerMixin`` supplies ``fit_transform``, so the class
    offers the usual fit / transform / fit_transform trio. The only thing
    "learned" in ``fit`` is the list of input column labels, which is echoed
    back by ``get_feature_names_out`` because ``transform`` neither adds,
    removes nor reorders columns.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Remember the column labels of ``X`` and return ``self``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; its ``columns`` attribute is read.
        y : ignored
            Present only for API compatibility.
        """
        self.feature_names = X.columns
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``date`` column converted.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a ``date`` column parseable by ``pd.to_datetime``.
        y : ignored
            Present only for API compatibility.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; the caller's frame is not modified.

        Raises
        ------
        KeyError
            If ``X`` has no ``date`` column.
        """
        X_ = X.copy()
        # Bracket access targets the column explicitly (attribute access can
        # be shadowed by DataFrame attributes of the same name).
        X_['date'] = pd.to_datetime(X_['date'])
        return X_

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (identical to the fitted input names).

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Accepted so the method can be called with the input names as a
            positional argument, as the scikit-learn convention does; the
            names recorded in ``fit`` are returned regardless.

        Returns
        -------
        numpy.ndarray
            Object array holding the column labels seen in ``fit``.
        """
        return np.asarray(self.feature_names, dtype=object)
        

In [71]:
# Column-wise preprocessing: standardize the numeric columns and one-hot encode
# the product family. Columns not listed are dropped (ColumnTransformer default).
# NOTE(review): 'sales' is scaled as an input feature here -- confirm it is not
# the prediction target before feeding this output to a model.
pipeline_preprocessing = ColumnTransformer(
    transformers=[
        # Currently disabled: PreprocessingTranformer() applied to ['id', 'date'].
        (
            'standard_scaler',
            StandardScaler(),
            ['store_nbr', 'sales', 'onpromotion'],
        ),
        (
            'one_hot_encoder',
            # sparse=False -> dense array output; unseen categories are
            # ignored at transform time instead of raising.
            OneHotEncoder(handle_unknown='ignore', sparse=False),
            ['family'],
        ),
    ],
)

In [72]:
# Fit the ColumnTransformer on df_train and transform it in the same call.
# The transformed array is wrapped in a DataFrame without column names, so the
# resulting columns are labelled with positional integers.
df_train_proc = pd.DataFrame(pipeline_preprocessing.fit_transform(df_train))
df_train_proc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,-1.700267,-0.324661,-0.213012,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.700267,-0.324661,-0.213012,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.700267,-0.324661,-0.213012,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.700267,-0.324661,-0.213012,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.700267,-0.324661,-0.213012,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,-1.186979,0.072920,-0.213012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3000884,-1.186979,-0.184413,-0.131172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3000885,-1.186979,1.871105,11.899391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3000886,-1.186979,-0.214861,0.441712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [74]:
# Show the ColumnTransformer's parameters, including the nested
# `<step>__<param>` entries of each sub-transformer.
pipeline_preprocessing.get_params()

{'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('standard_scaler',
   StandardScaler(),
   ['store_nbr', 'sales', 'onpromotion']),
  ('one_hot_encoder',
   OneHotEncoder(handle_unknown='ignore', sparse=False),
   ['family'])],
 'verbose': False,
 'verbose_feature_names_out': True,
 'standard_scaler': StandardScaler(),
 'one_hot_encoder': OneHotEncoder(handle_unknown='ignore', sparse=False),
 'standard_scaler__copy': True,
 'standard_scaler__with_mean': True,
 'standard_scaler__with_std': True,
 'one_hot_encoder__categories': 'auto',
 'one_hot_encoder__drop': None,
 'one_hot_encoder__dtype': numpy.float64,
 'one_hot_encoder__handle_unknown': 'ignore',
 'one_hot_encoder__sparse': False}

In [76]:
# Debug aid: print the file path of the imported sklearn package
# (presumably to check which installation this kernel picks up).
import sklearn
print(sklearn.__file__)

/opt/homebrew/Cellar/jupyterlab/3.2.5/libexec/lib/python3.9/site-packages/sklearn/__init__.py


In [6]:
def pipeline_preprocessing(df):
    """Return a copy of ``df`` whose ``date`` column is parsed to datetime.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; ``df`` itself is left untouched.

    Notes
    -----
    NOTE(review): another cell of this notebook binds the same name
    ``pipeline_preprocessing`` to a ``ColumnTransformer``; whichever cell
    runs last wins.
    """
    # assign() works on a fresh copy, so the caller's frame keeps its raw column.
    return df.assign(date=pd.to_datetime(df.date))

# Apply the date conversion to the raw training frame.
df_train_proc = pipeline_preprocessing(df_train)

# Persist the processed frame. The context manager guarantees the file is
# flushed and closed even if pickling fails; a bare open() passed inline would
# leave the handle open until garbage collection.
with open('df_train_processed', 'wb') as processed_file:
    pickle.dump(df_train_proc, processed_file)

df_train_proc.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [None]:
ohencoder = OneHotEncoder()
ohencoder.