In [8]:
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

path = "../datasets/"
data = arff.loadarff('%sSeattle_Crime_Data_06-23-2019-4.arff' % path)
df = pd.DataFrame(data[0])
for column in df.columns:
    # decode bytes to string for each column
    df[column] = df[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x)

df

Unnamed: 0,Report_Number,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood
0,1.975000e+12,900.0,1500.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTH,R,R3,LAKEWOOD/SEWARD PARK
1,1.976000e+12,1.0,2359.0,SEX OFFENSE-OTHER,SEXOFF-INDECENT LIBERTIES,UNKNOWN,?,?,UNKNOWN
2,1.979000e+12,1600.0,1430.0,CAR PROWL,THEFT-CARPROWL,EAST,G,G2,CENTRAL AREA/SQUIRE PARK
3,1.981000e+13,2029.0,2030.0,HOMICIDE,HOMICIDE-PREMEDITATED-WEAPON,SOUTH,S,S2,BRIGHTON/DUNLAP
4,1.981000e+12,2000.0,435.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTHWEST,W,W3,ROXHILL/WESTWOOD/ARBOR HEIGHTS
...,...,...,...,...,...,...,...,...,...
523585,2.019000e+12,1713.0,1713.0,FAMILY OFFENSE-NONVIOLENT,CHILD-OTHER,SOUTH,O,O3,MID BEACON HILL
523586,2.019000e+12,730.0,1721.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,EAST,C,C2,MONTLAKE/PORTAGE BAY
523587,2.019000e+12,1724.0,1724.0,ROBBERY-COMMERCIAL,ROBBERY-BUSINESS-BODYFORCE,SOUTH,S,S2,RAINIER BEACH
523588,2.019000e+12,1750.0,1904.0,THEFT-SHOPLIFT,THEFT-SHOPLIFT,NORTH,L,L2,NORTHGATE


In [9]:
for column in df.columns:
    if df.columns.dtype == object:
        print(column + "=" * 40)
        value_counts = df[column].value_counts()
        print(value_counts)


Report_Number
1.975000e+12    1
2.015000e+13    1
2.015000e+13    1
2.015000e+13    1
2.015000e+13    1
               ..
2.012000e+13    1
2.012000e+13    1
2.012000e+13    1
2.012000e+13    1
2.019000e+12    1
Name: count, Length: 523590, dtype: int64
Occurred_Time
2200.0    13858
1800.0    13804
0.0       13420
2000.0    12836
1200.0    12598
          ...  
649.0        40
607.0        39
546.0        39
627.0        37
551.0        36
Name: count, Length: 1440, dtype: int64
Reported_Time
1300.0    824
1600.0    809
1530.0    799
1400.0    790
1230.0    787
         ... 
501.0      89
333.0      89
542.0      85
417.0      85
349.0      85
Name: count, Length: 1440, dtype: int64
Crime_Subcategory
CAR PROWL                              148263
THEFT-ALL OTHER                         54419
THEFT-SHOPLIFT                          48638
BURGLARY-RESIDENTIAL                    46843
MOTOR VEHICLE THEFT                     43529
BURGLARY-COMMERCIAL                     23531
THEFT-BUILDING

# Preprocessing

In [10]:
print(df.columns)
df = df.drop('Report_Number', axis=1)
df

Index(['Report_Number', 'Occurred_Time', 'Reported_Time', 'Crime_Subcategory',
       'Primary_Offense_Description', 'Precinct', 'Sector', 'Beat',
       'Neighborhood'],
      dtype='object')


Unnamed: 0,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Precinct,Sector,Beat,Neighborhood
0,900.0,1500.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTH,R,R3,LAKEWOOD/SEWARD PARK
1,1.0,2359.0,SEX OFFENSE-OTHER,SEXOFF-INDECENT LIBERTIES,UNKNOWN,?,?,UNKNOWN
2,1600.0,1430.0,CAR PROWL,THEFT-CARPROWL,EAST,G,G2,CENTRAL AREA/SQUIRE PARK
3,2029.0,2030.0,HOMICIDE,HOMICIDE-PREMEDITATED-WEAPON,SOUTH,S,S2,BRIGHTON/DUNLAP
4,2000.0,435.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,SOUTHWEST,W,W3,ROXHILL/WESTWOOD/ARBOR HEIGHTS
...,...,...,...,...,...,...,...,...
523585,1713.0,1713.0,FAMILY OFFENSE-NONVIOLENT,CHILD-OTHER,SOUTH,O,O3,MID BEACON HILL
523586,730.0,1721.0,BURGLARY-RESIDENTIAL,BURGLARY-FORCE-RES,EAST,C,C2,MONTLAKE/PORTAGE BAY
523587,1724.0,1724.0,ROBBERY-COMMERCIAL,ROBBERY-BUSINESS-BODYFORCE,SOUTH,S,S2,RAINIER BEACH
523588,1750.0,1904.0,THEFT-SHOPLIFT,THEFT-SHOPLIFT,NORTH,L,L2,NORTHGATE


In [11]:
one_hot_encoding_limit = 10

ordinal_Encoder = OrdinalEncoder()
one_hot_encoder = OneHotEncoder()

for column in df.columns:
    if column in ['Occurred_Time', 'Reported_Time']:
        continue

    unique_values = df[column].nunique()

    if unique_values == 1:
        data.drop(column)
        continue
    if unique_values <= one_hot_encoding_limit:
        encoded_features = one_hot_encoder.fit_transform(df[[column]])
        encoded_features_df = pd.DataFrame(encoded_features.toarray(),
                                           columns=one_hot_encoder.get_feature_names_out([column]))
        df = pd.concat([df, encoded_features_df], axis=1)
        df.drop(column, axis=1, inplace=True)
    else:
        df[column] = ordinal_Encoder.fit_transform(df[[column]])

df

Unnamed: 0,Occurred_Time,Reported_Time,Crime_Subcategory,Primary_Offense_Description,Sector,Beat,Neighborhood,Precinct_?,Precinct_EAST,Precinct_NORTH,Precinct_SOUTH,Precinct_SOUTHWEST,Precinct_UNKNOWN,Precinct_WEST
0,900.0,1500.0,6.0,18.0,17.0,50.0,28.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,2359.0,24.0,112.0,2.0,0.0,57.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1600.0,1430.0,8.0,123.0,8.0,20.0,8.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,2029.0,2030.0,13.0,43.0,18.0,53.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2000.0,435.0,6.0,18.0,22.0,62.0,49.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523585,1713.0,1713.0,11.0,29.0,15.0,44.0,32.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
523586,730.0,1721.0,6.0,18.0,4.0,5.0,34.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
523587,1724.0,1724.0,21.0,99.0,18.0,53.0,46.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
523588,1750.0,1904.0,28.0,130.0,12.0,32.0,41.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [12]:
# split dataframe to features and target
X = df.drop('Primary_Offense_Description', axis=1)  # Features
y = df['Primary_Offense_Description']            #target 

# split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

Unnamed: 0,Occurred_Time,Reported_Time,Crime_Subcategory,Sector,Beat,Neighborhood,Precinct_?,Precinct_EAST,Precinct_NORTH,Precinct_SOUTH,Precinct_SOUTHWEST,Precinct_UNKNOWN,Precinct_WEST
6916,2108.0,2108.0,17.0,14.0,41.0,41.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
70131,2100.0,623.0,6.0,14.0,40.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
64578,2130.0,1033.0,16.0,5.0,11.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
30304,2150.0,2335.0,17.0,18.0,52.0,37.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
218777,1858.0,1858.0,25.0,18.0,53.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,711.0,711.0,2.0,10.0,26.0,22.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
365838,500.0,822.0,8.0,10.0,26.0,48.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
131932,1135.0,1151.0,28.0,13.0,37.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
146867,2100.0,105.0,6.0,21.0,56.0,48.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
