In [6]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [7]:
# Read the raw training table from the data directory (path is relative to the notebook)
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [8]:
# Parse the 'Dates' column once, then derive the calendar features from it
parsed_dates = pd.to_datetime(df['Dates'])
df = df.assign(
    Dates=parsed_dates,
    Hour=parsed_dates.dt.hour,
    Day=parsed_dates.dt.day,
    Month=parsed_dates.dt.month,
    Weekday=parsed_dates.dt.weekday,
)

In [9]:
# Columns fed to the preprocessing transformer, grouped by how they are encoded.
# NOTE(review): 'Weekday' is derived from 'Dates' and may duplicate the
# information in 'DayOfWeek' — confirm both are wanted.
# NOTE(review): confirm 'Resolution' is available wherever the saved pipeline
# will later be applied.
categorical_cols = [
    'DayOfWeek',
    'PdDistrict',
    'Resolution',
]
# 'X' / 'Y' come straight from the file; the rest are the date-derived features
numeric_cols = ['X', 'Y'] + ['Hour', 'Day', 'Month', 'Weekday']

In [10]:
# Column-wise preprocessing: one-hot encode the categorical columns (dropping the
# first level of each) and min-max scale the numeric columns.
# NOTE(review): the encoder keeps its default handling of categories unseen at
# fit time — confirm that is the desired behaviour for new data.
category_encoder = OneHotEncoder(drop='first')
numeric_scaler = MinMaxScaler()

preprocessor = ColumnTransformer(transformers=[
    ('cat', category_encoder, categorical_cols),
    ('num', numeric_scaler, numeric_cols),
])

In [11]:
# Wrap the column transformer in a single-step pipeline so it can be fitted and
# persisted as one object
pipeline_steps = [('preprocessor', preprocessor)]
pipeline = Pipeline(steps=pipeline_steps)

In [12]:
# Learn the encoder categories / scaler ranges from df and produce the
# transformed feature matrix in one pass
features_scaled = pipeline.fit_transform(X=df)

In [13]:
# Saving preprocessed data
# ColumnTransformer returns a SciPy sparse matrix when the stacked output's
# density is below its sparse_threshold (0.3 by default), which is likely with a
# one-hot block. pd.DataFrame() does not expand a sparse matrix into one column
# per feature, so densify it first.
if hasattr(features_scaled, "toarray"):
    features_dense = features_scaled.toarray()
else:
    features_dense = features_scaled

# Label the columns with the transformer-generated feature names so the CSV is
# self-describing instead of carrying anonymous 0..N-1 headers.
preprocessed_df = pd.DataFrame(
    features_dense,
    columns=pipeline.get_feature_names_out(),
)
preprocessed_df.to_csv("../data/preprocessed_data.csv", index=False)

# Saving the pipeline
# Create the target directory if it is missing (e.g. on a fresh checkout);
# joblib.dump does not create parent directories.
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, '../models/preprocessing_pipeline.joblib')

['../models/preprocessing_pipeline.joblib']