# NY Crime Data Prep

In [49]:
# Import modules.

%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn import linear_model

# Silence one specific warning (not an error): warnings attributed to the
# scipy module whose message starts with "internal gelsd" are ignored.
# NOTE(review): presumably triggered by linear_model fits further down — confirm.
import warnings
warnings.filterwarnings(action='ignore', module='scipy',
                       message='^internal gelsd')

In [50]:
# Read the NY crime CSV into a DataFrame, then report its shape and dtypes.

df = pd.read_csv('~/src/data/unit2/nycrimedata.csv')
print(df.shape, df.dtypes, sep='\n')
df.head()

(348, 13)
City                                       object
Population                                  int64
Violent\ncrime                              int64
Murder and\nnonnegligent\nmanslaughter      int64
Rape\n(revised\ndefinition)1              float64
Rape\n(legacy\ndefinition)2                 int64
Robbery                                     int64
Aggravated\nassault                         int64
Property\ncrime                             int64
Burglary                                    int64
Larceny-\ntheft                             int64
Motor\nvehicle\ntheft                       int64
Arson3                                    float64
dtype: object


Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
0,Adams Village,1861,0,0,,0,0,0,12,2,10,0,0.0
1,Addison Town and Village,2577,3,0,,0,0,3,24,3,20,1,0.0
2,Akron Village,2846,3,0,,0,0,3,16,1,15,0,0.0
3,Albany,97956,791,8,,30,227,526,4090,705,3243,142,
4,Albion Village,6388,23,0,,3,4,16,223,53,165,5,


In [51]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson3
count,348.0,348.0,348.0,0.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,187.0
mean,40037.63,201.594828,1.566092,,5.864943,72.902299,121.261494,792.606322,119.683908,637.017241,35.905172,1.871658
std,450037.4,2815.268504,18.303673,,60.425452,1031.032873,1706.13173,7659.724746,924.948789,6346.054451,403.423826,10.693411
min,526.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3003.0,2.0,0.0,,0.0,0.0,1.0,40.5,6.0,31.0,0.0,0.0
50%,7233.5,6.0,0.0,,0.0,1.0,4.0,112.5,17.5,94.0,2.0,0.0
75%,18427.5,22.0,0.0,,2.0,5.0,14.0,341.0,51.25,287.25,7.0,1.0
max,8396126.0,52384.0,335.0,,1112.0,19170.0,31767.0,141971.0,16606.0,117931.0,7434.0,132.0


## Clean the data:

In [52]:
# Replace the raw CSV headers with short snake_case names.
# The assignment is positional: the 13 names below are applied left to right
# to the existing columns, so their order must mirror the file's column order.
df.columns = ['city', 'population', 'violent_crime', 'murder', 'rape_1',
              'rape_2', 'robbery', 'aggravated_assault', 'property_crime',
              'burglary', 'larceny_theft', 'motor_vehicle_theft', 'arson3']
df.tail()  # display the last rows to check the new labels

Unnamed: 0,city,population,violent_crime,murder,rape_1,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson3
343,Woodbury Town,10685,3,0,,0,2,1,541,9,529,3,
344,Woodridge Village,829,7,0,,0,0,7,17,8,9,0,0.0
345,Woodstock Town,5931,2,0,,0,0,2,58,13,45,0,
346,Yonkers,199134,1036,6,,25,390,615,2368,470,1662,236,10.0
347,Yorktown Town,36643,15,0,,0,2,13,334,45,287,2,


In [53]:
# Tally the missing values in each column.
df.isna().sum()

city                     0
population               0
violent_crime            0
murder                   0
rape_1                 348
rape_2                   0
robbery                  0
aggravated_assault       0
property_crime           0
burglary                 0
larceny_theft            0
motor_vehicle_theft      0
arson3                 161
dtype: int64

In [54]:
# rape_1 is missing for all 348 rows, so the column carries no information.
df = df.drop(columns=['rape_1'])
df.tail()

Unnamed: 0,city,population,violent_crime,murder,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson3
343,Woodbury Town,10685,3,0,0,2,1,541,9,529,3,
344,Woodridge Village,829,7,0,0,0,7,17,8,9,0,0.0
345,Woodstock Town,5931,2,0,0,0,2,58,13,45,0,
346,Yonkers,199134,1036,6,25,390,615,2368,470,1662,236,10.0
347,Yorktown Town,36643,15,0,0,2,13,334,45,287,2,


In [59]:
# Replace every remaining missing value with 0 and confirm none are left.
# At this point only arson3 still contains NaNs (161 rows).
# NOTE(review): this treats an unreported arson count as zero arsons — confirm
# that is the intended modelling assumption.
# Rebinding the result (instead of inplace=True) keeps the step chainable and
# avoids mutating a frame that earlier cells have already displayed.
df = df.fillna(0)
df.isnull().sum()

city                   0
population             0
violent_crime          0
murder                 0
rape_2                 0
robbery                0
aggravated_assault     0
property_crime         0
burglary               0
larceny_theft          0
motor_vehicle_theft    0
arson3                 0
dtype: int64

## Creating categorical features:

$$ \text{Property crime} = \alpha + \text{Population} + \text{Population}^2 + \text{Murder} + \text{Robbery} $$ <br>

In [60]:
# Quadratic population term used by the regression specification.
df['populationSquared'] = df['population'] ** 2
df.head()

Unnamed: 0,city,population,violent_crime,murder,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson3,populationSquared
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0.0,3463321
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0,6640929
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0,8099716
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,0.0,9595377936
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,0.0,40806544


In [61]:
# Binary indicators: 1 when a city recorded at least one murder / robbery, else 0.
# A vectorized comparison replaces the per-row Python lambda; casting the
# boolean mask to int64 yields the same 0/1 integer columns as before.
df['murder_cat'] = (df['murder'] > 0).astype('int64')
df['robbery_cat'] = (df['robbery'] > 0).astype('int64')
df.head()

Unnamed: 0,city,population,violent_crime,murder,rape_2,robbery,aggravated_assault,property_crime,burglary,larceny_theft,motor_vehicle_theft,arson3,populationSquared,murder_cat,robbery_cat
0,Adams Village,1861,0,0,0,0,0,12,2,10,0,0.0,3463321,0,0
1,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0,6640929,0,0
2,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0,8099716,0,0
3,Albany,97956,791,8,30,227,526,4090,705,3243,142,0.0,9595377936,1,1
4,Albion Village,6388,23,0,3,4,16,223,53,165,5,0.0,40806544,0,1
