# Reading and cleaning the data

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### Importing the crime dataset:

In [2]:
# Load the full NYPD complaint export into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, at the cost of a higher peak memory use.
# NOTE(review): the recorded run printed a warning under this cell, presumably
# the mixed-dtype warning that this option avoids -- confirm on a fresh run.
df = pd.read_csv('NYPD_Complaint_Data_Historic.csv/NYPD_Complaint_Data_Historic.csv',
                 low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


### Exploring the dataset:

In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['CMPLNT_NUM', 'CMPLNT_FR_DT', 'CMPLNT_FR_TM', 'CMPLNT_TO_DT',
       'CMPLNT_TO_TM', 'ADDR_PCT_CD', 'RPT_DT', 'KY_CD', 'OFNS_DESC', 'PD_CD',
       'PD_DESC', 'CRM_ATPT_CPTD_CD', 'LAW_CAT_CD', 'BORO_NM',
       'LOC_OF_OCCUR_DESC', 'PREM_TYP_DESC', 'JURIS_DESC', 'JURISDICTION_CODE',
       'PARKS_NM', 'HADEVELOPT', 'HOUSING_PSA', 'X_COORD_CD', 'Y_COORD_CD',
       'SUSP_AGE_GROUP', 'SUSP_RACE', 'SUSP_SEX', 'TRANSIT_DISTRICT',
       'Latitude', 'Longitude', 'Lat_Lon', 'PATROL_BORO', 'STATION_NAME',
       'VIC_AGE_GROUP', 'VIC_RACE', 'VIC_SEX'],
      dtype='object')

#### Getting needed columns:

In [4]:
# Keep only the complaint date/time, the two offense descriptions,
# the offense category and the coordinates.
df = df[[
    'CMPLNT_FR_DT',
    'CMPLNT_FR_TM',
    'OFNS_DESC',
    'PD_DESC',
    'LAW_CAT_CD',
    'Latitude',
    'Longitude',
]]

In [5]:
# Give the retained complaint columns shorter names;
# Latitude and Longitude keep their original labels.
df = df.rename(
    columns={
        'CMPLNT_FR_DT': 'Date',
        'CMPLNT_FR_TM': 'Time',
        'OFNS_DESC': 'Global_desc',
        'PD_DESC': 'Specific_desc',
        'LAW_CAT_CD': 'Type_of_felony',
    }
)

### Handling missing values:

In [6]:
# Count the missing values in each column.
df.isna().sum()

Date                655
Time                 48
Global_desc       18813
Specific_desc      5546
Type_of_felony        0
Latitude          24064
Longitude         24064
dtype: int64

In [7]:
# Discard every row that has at least one missing field.
df = df.dropna(axis=0, how='any')

### Selecting a subset of the data:

In [8]:
# The full dataset holds 6.5M+ rows; keep only complaints dated 2014 or later.
# The year is the last '/'-separated field of the Date string.
df = df[df['Date'].str.split('/').str[-1].astype(int) >= 2014]

In [9]:
# Sort the complaints chronologically.
# 'Date' is still an MM/DD/YYYY string at this point, so sorting on it directly
# would order rows by month first (e.g. 01/05/2018 before 12/31/2014).
# Sort on a parsed temporary key instead and drop the key afterwards;
# dates that fail to parse become NaT and are placed last.
df = (
    df.assign(_date_key=pd.to_datetime(df['Date'], format='%m/%d/%Y', errors='coerce'))
      .sort_values(by='_date_key')
      .drop(columns='_date_key')
)

### Converting date and time to timestamps:

In [10]:
# Join the date and time strings (no separator) and parse them in one pass;
# values that do not match the format become NaT instead of raising.
df['ts'] = pd.to_datetime(
    df['Date'] + df['Time'],
    errors='coerce',
    format='%m/%d/%Y%H:%M:%S',
)

In [11]:
def _hour_epoch(moment):
    """Return the POSIX timestamp of `moment` truncated to the hour, or NaN for NaT."""
    if pd.isnull(moment):
        # Rows whose date/time could not be parsed carry NaT; keep them as NaN
        # rather than failing inside the datetime() constructor.
        return np.nan
    # .timestamp() on a naive datetime interprets it in the machine's local timezone.
    return datetime(moment.year, moment.month, moment.day, moment.hour).timestamp()


# Replace the parsed datetimes in 'ts' with epoch seconds rounded down to the hour.
# The source column is 'ts' (created by the previous cell); the frame has no
# 'start' column, so reading df.start fails on a fresh kernel.
df['ts'] = df['ts'].apply(_hour_epoch)

### Exporting the dataset:

In [14]:
# Write the cleaned frame to dataset.csv in the working directory.
# No index argument is passed, so the row index is written as well.
df.to_csv(path_or_buf='dataset.csv')