In [1]:
import joblib
import pandas as pd

In [2]:
# Load the training CSV and display its (rows, columns) shape.
# NOTE(review): this is an absolute path into one user's profile, so the cell
# only runs on that machine; df_train is also not referenced again in the
# cells visible here — confirm whether this load is still needed.
file_path_train = r'C:\Users\nhl08\OneDrive\Documents\AI02\Udemy\Forecasting Crime\cleaned_train.csv'
df_train = pd.read_csv(filepath_or_buffer=file_path_train)
df_train.shape

(20000, 22)

In [3]:
# Load the raw test CSV and display its (rows, columns) shape.
# NOTE(review): absolute, machine-specific path — the cell will not run
# elsewhere without editing it.
file_path_test = r'C:\Users\nhl08\OneDrive\Documents\AI02\Udemy\Forecasting Crime\test.csv'
df_test = pd.read_csv(filepath_or_buffer=file_path_test)
df_test.shape

(5000, 21)

In [4]:
# Print column names, non-null counts and dtypes of the raw test frame so
# that missing values and the string-typed date columns are visible up front.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Location               5000 non-null   object 
 1   Cross_Street           810 non-null    object 
 2   Latitude               5000 non-null   float64
 3   Longitude              5000 non-null   float64
 4   Date_Reported          5000 non-null   object 
 5   Date_Occurred          5000 non-null   object 
 6   Time_Occurred          5000 non-null   float64
 7   Area_ID                5000 non-null   float64
 8   Area_Name              5000 non-null   object 
 9   Reporting_District_no  5000 non-null   float64
 10  Part 1-2               5000 non-null   float64
 11  Modus_Operandi         4316 non-null   object 
 12  Victim_Age             5000 non-null   float64
 13  Victim_Sex             4357 non-null   object 
 14  Victim_Descent         4357 non-null   object 
 15  Prem

In [5]:
# Both date columns are read as strings; parse them with one explicit format
# (month/day/year, 12-hour clock with AM/PM) instead of letting pandas infer it.
date_format = '%m/%d/%Y %I:%M:%S %p'

for date_col in ('Date_Reported', 'Date_Occurred'):
    df_test[date_col] = pd.to_datetime(df_test[date_col], format=date_format)

In [6]:
# Break the parsed report date into separate calendar features.
reported_on = df_test['Date_Reported'].dt
df_test['Year_Reported'] = reported_on.year
df_test['Month_Reported'] = reported_on.month
df_test['Day_Reported'] = reported_on.day

# Floor-divide Time_Occurred by 100 and truncate to an integer.
# (assumes HHMM-style values such as 1430 -> 14 — TODO confirm)
df_test['Hour_Occurred'] = df_test['Time_Occurred'].apply(lambda hhmm: int(hhmm // 100))

In [7]:
# Remove the two datetime columns and Cross_Street; the result is a new
# frame, so df_test itself keeps all of its columns.
columns_to_drop = ['Date_Reported', 'Date_Occurred', 'Cross_Street']
df_cleaned = df_test.drop(columns_to_drop, axis='columns')

In [8]:
# Names of every object-dtype (string) column; these are the columns that are
# later routed to the categorical branch of the preprocessor.
categorical_columns = list(df_cleaned.select_dtypes(include=['object']).columns)

In [9]:
# Missing-value count per categorical column, before any imputation.
df_cleaned[categorical_columns].isna().sum()

Location                  0
Area_Name                 0
Modus_Operandi          684
Victim_Sex              643
Victim_Descent          643
Premise_Description       1
Weapon_Description     3153
Status                    0
Status_Description        0
dtype: int64

In [10]:
# Most frequent value of each categorical column that will be mode-imputed.
# mode() returns the modes in sorted order, so [0] picks the first one on ties.
# NOTE(review): these modes are computed from the test frame itself.
victim_sex_mode, victim_descent_mode, Modus_Operandi_mode, Premise_Description_mode = (
    df_cleaned[column].mode()[0]
    for column in ('Victim_Sex', 'Victim_Descent', 'Modus_Operandi', 'Premise_Description')
)

In [11]:
# Replacement value for the missing entries of each categorical column:
# a fixed label for Weapon_Description, the column's mode for the others.
fill_values = {
    'Weapon_Description': 'No Weapon Used',
    'Victim_Sex': victim_sex_mode,
    'Victim_Descent': victim_descent_mode,
    'Modus_Operandi': Modus_Operandi_mode,
    'Premise_Description': Premise_Description_mode,
}

for column, replacement in fill_values.items():
    df_cleaned[column] = df_cleaned[column].fillna(replacement)

In [12]:
# Re-check the categorical columns: every count should now be zero.
df_cleaned[categorical_columns].isna().sum()

Location               0
Area_Name              0
Modus_Operandi         0
Victim_Sex             0
Victim_Descent         0
Premise_Description    0
Weapon_Description     0
Status                 0
Status_Description     0
dtype: int64

In [13]:
# Dtypes treated as numeric features.
# NOTE(review): only float64 and int32 are matched. The outputs recorded in
# this notebook list 12 numeric columns without Hour_Occurred and a
# transformed shape of (5000, 21), so Hour_Occurred (presumably int64) lands
# in neither column list and is not passed to the preprocessor. Confirm
# whether that is intended before widening this filter — the training data
# would need the same set of columns.
num = ['float64', 'int32']
numerical_columns = list(df_cleaned.select_dtypes(include=num).columns)

In [14]:
# Missing-value count per numeric column; these are left for the median
# imputer in the numeric pipeline rather than being filled by hand.
df_cleaned[numerical_columns].isna().sum()

Latitude                    0
Longitude                   0
Time_Occurred               0
Area_ID                     0
Reporting_District_no       0
Part 1-2                    0
Victim_Age                  0
Premise_Code                0
Weapon_Used_Code         3153
Year_Reported               0
Month_Reported              0
Day_Reported                0
dtype: int64

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder

# Categorical branch: map each category to an integer code. Categories not
# seen during fit are encoded as -1 instead of raising an error.
categorical_steps = [
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numeric branch: fill missing values with the column median, then
# standardise each column with StandardScaler.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

In [16]:
# Route each column group to its pipeline. The output columns follow this
# order: all numeric columns first, then all categorical columns.
column_routes = [
    ('num', numerical_transformer, numerical_columns),
    ('cat', categorical_transformer, categorical_columns),
]

# remainder='drop' is the ColumnTransformer default, written out so it is
# obvious that any column absent from both lists is discarded.
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

In [17]:
# Fit the preprocessor on the test frame and transform it in one step. The
# name df_cleaned is rebound from the DataFrame to the transformer's output.
# NOTE(review): the imputer medians, scaler statistics and category codes are
# all learned from the test split here. If this file is fed to a model trained
# on separately preprocessed data, the encodings may not line up — confirm,
# and consider reusing the preprocessor fitted on the training data (joblib is
# imported but unused in the cells visible here).
# NOTE(review): because df_cleaned is overwritten, re-running this cell
# without re-running the earlier cells will presumably fail.
df_cleaned = preprocessor.fit_transform(df_cleaned)

In [18]:
# Shape of the transformed output: one column per entry in
# numerical_columns + categorical_columns.
df_cleaned.shape

(5000, 21)

In [19]:
# Wrap the transformed values in a DataFrame again. The labels are given in
# the same order the ColumnTransformer emits its blocks: numeric columns
# first, then categorical columns.
feature_names = [*numerical_columns, *categorical_columns]
df_cleaned = pd.DataFrame(data=df_cleaned, columns=feature_names)

In [20]:
# Final check: no column of the processed frame should contain missing values.
df_cleaned.isna().sum()

Latitude                 0
Longitude                0
Time_Occurred            0
Area_ID                  0
Reporting_District_no    0
Part 1-2                 0
Victim_Age               0
Premise_Code             0
Weapon_Used_Code         0
Year_Reported            0
Month_Reported           0
Day_Reported             0
Location                 0
Area_Name                0
Modus_Operandi           0
Victim_Sex               0
Victim_Descent           0
Premise_Description      0
Weapon_Description       0
Status                   0
Status_Description       0
dtype: int64

In [21]:
# Write the processed test features to cleaned_test.csv without the index.
# The default write mode ('w') replaces any existing file. The previous
# mode='a' appended another header row plus a duplicate copy of every record
# each time the notebook was re-run, corrupting the CSV.
df_cleaned.to_csv('cleaned_test.csv', index=False)