In [None]:
import pandas
import altair as alt
import numpy
import scipy
import scipy.stats as stats

In [None]:

import pickle

# Load the pickled dataset from the working directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'data.pickle' if it comes from a trusted source.
with open('data.pickle', 'rb') as f:
    # The original passed encoding='binary', which is not a registered codec
    # name (the documented values are codec names such as 'ASCII'/'latin1', or
    # the special value 'bytes'). The argument is only consulted for Python 2
    # pickles, so the default is used instead of a name that cannot be resolved.
    data = pickle.load(f)

data.head(5)


In [None]:
def make_categorical(dataframe, column):
    """Convert one column of ``dataframe`` to the pandas ``category`` dtype.

    The dataframe is modified in place; nothing is returned.

    Parameters:
        dataframe: the DataFrame whose column is converted.
        column: label of the column to convert.
    """
    as_category = dataframe[column].astype('category')
    dataframe[column] = as_category

# columns of the dataframe that are converted to the categorical dtype
columns = [
    'ADDR_PCT_CD',
    'BORO_NM',
    'CRM_ATPT_CPTD_CD',
    'HADEVELOPT',
    'HOUSING_PSA',
    'JURISDICTION_CODE',
    'JURIS_DESC',
    'KY_CD',
    'LAW_CAT_CD',
    'LOC_OF_OCCUR_DESC',
    'OFNS_DESC',
    'PARKS_NM',
    'PATROL_BORO',
    'PD_CD',
    'PD_DESC',
    'STATION_NAME',
    'SUSP_AGE_GROUP',
    'SUSP_RACE',
    'SUSP_SEX',
    'TRANSIT_DISTRICT',
    'VIC_AGE_GROUP',
    'VIC_RACE',
    'VIC_SEX',
]

# convert the listed columns one by one (make_categorical edits `data` in place)
for column in columns:
    make_categorical(data, column)


In [None]:
# count the number of complaints per victim sex
vic_sex_count = data['VIC_SEX'].value_counts()

# Name both columns explicitly: a bare reset_index() yields the columns
# ('index', 'VIC_SEX') on pandas < 2.0 but ('VIC_SEX', 'count') on pandas >= 2.0,
# so encodings that refer to 'index' stop working after an upgrade.
vic_sex_chart_data = vic_sex_count.rename_axis('victim_sex').reset_index(name='complaint_count')

# using altair plot the number of complaints per VICTIM SEX in a bar chart
alt.Chart(vic_sex_chart_data).mark_bar().encode(
    x = alt.X('victim_sex', axis= alt.Axis(title = 'Victim Sex')),
    y = alt.Y('complaint_count',axis= alt.Axis(format='e', title = 'Number of Complaints'))
).properties(
    title='Number of complaints per victim sex',
    width=1000,
    height=500
)

In [None]:
#tabulate the complaints per victim sex together with each group's share of the total (in percent)
vic_sex_counts = (
    data.groupby('VIC_SEX')
    .size()
    .reset_index(name='complaint_count')
)
sex_complaints = vic_sex_counts['complaint_count']
vic_sex_counts['complaint_percentage'] = 100 * sex_complaints / sex_complaints.sum()
vic_sex_counts

In [None]:
# count the number of complaints per victim race
vic_race_count = data['VIC_RACE'].value_counts()

# Name both columns explicitly: a bare reset_index() yields the columns
# ('index', 'VIC_RACE') on pandas < 2.0 but ('VIC_RACE', 'count') on pandas >= 2.0,
# so encodings that refer to 'index' stop working after an upgrade.
vic_race_chart_data = vic_race_count.rename_axis('victim_race').reset_index(name='complaint_count')

# using altair plot the number of complaints per VICTIM RACE in a bar chart
alt.Chart(vic_race_chart_data).mark_bar().encode(
    x = alt.X('victim_race', axis= alt.Axis(title = 'Victim Race')),
    y = alt.Y('complaint_count',axis= alt.Axis(format='e', title = 'Number of Complaints'))
).properties(
    title='Number of complaints per victim race',
    width=1000,
    height=500
)

In [None]:
#tabulate the complaints per victim race together with each group's share of the total (in percent)
vic_race_counts = (
    data.groupby('VIC_RACE')
    .size()
    .reset_index(name='complaint_count')
)
race_complaints = vic_race_counts['complaint_count']
vic_race_counts['complaint_percentage'] = 100 * race_complaints / race_complaints.sum()
vic_race_counts

In [None]:
#add a copy of VIC_RACE in which every value containing "UNKNOWN" is replaced by None
victim_race_is_unknown = data['VIC_RACE'].str.contains("UNKNOWN")
data['VIC_RACE2'] = numpy.where(victim_race_is_unknown, None, data['VIC_RACE'])


In [None]:
# count the number of complaints per victim race, using the column without the unknown category
vic_race_count2 = data['VIC_RACE2'].value_counts()

# Name both columns explicitly: a bare reset_index() yields the columns
# ('index', 'VIC_RACE2') on pandas < 2.0 but ('VIC_RACE2', 'count') on pandas >= 2.0,
# so encodings that refer to 'index' stop working after an upgrade.
vic_race_chart_data2 = vic_race_count2.rename_axis('victim_race').reset_index(name='complaint_count')

# using altair plot the number of complaints per VICTIM RACE in a bar chart without the unknown category
alt.Chart(vic_race_chart_data2).mark_bar().encode(
    x = alt.X('victim_race', axis= alt.Axis(title = 'Victim Race')),
    y = alt.Y('complaint_count',axis= alt.Axis(format='e', title = 'Number of complaints'))
).properties(
    title='Number of complaints per victim race',
    width=1000,
    height=500
)

In [None]:
#make a percentage distribution in a table without the unknown category
vic_race_counts2 = data.groupby('VIC_RACE2').size().reset_index(name='complaint_count')
#the percentages must be based on this table's own counts; the table that still
#contains the unknown category (vic_race_counts) has different rows and a different total
vic_race_counts2['complaint_percentage'] = 100 * vic_race_counts2['complaint_count'] / vic_race_counts2['complaint_count'].sum()
vic_race_counts2

In [None]:
#where there is NaN, fill it with 'UNKNOWN' (this step is currently disabled: the line below is commented out)
#data['VIC_AGE_GROUP'] = data['VIC_AGE_GROUP'].fillna('UNKNOWN')

In [None]:
#list the distinct victim age group values
#note: the fillna step in the cell above is commented out, so missing values have not been replaced by 'UNKNOWN'
data['VIC_AGE_GROUP'].unique()

In [None]:
# count the number of complaints per victim age group
vic_age_count = data['VIC_AGE_GROUP'].value_counts()

# Name both columns explicitly: a bare reset_index() yields the columns
# ('index', 'VIC_AGE_GROUP') on pandas < 2.0 but ('VIC_AGE_GROUP', 'count') on pandas >= 2.0,
# so encodings that refer to 'index' stop working after an upgrade.
vic_age_chart_data = vic_age_count.rename_axis('victim_age_group').reset_index(name='complaint_count')

# order in which the age groups are laid out on the x axis
categoryNames    = [ '<18', '18-24', '25-44', '45-64', '65+' ]

# using altair plot the number of complaints per VICTIM AGE GROUP in a bar chart
alt.Chart(vic_age_chart_data).mark_bar().encode(
    x = alt.X('victim_age_group', sort = categoryNames, axis= alt.Axis(title = 'Victim Age Group')),
    y = alt.Y('complaint_count',axis= alt.Axis(format='e', title = 'Number of Complaints'))
).properties(
    title='Number of complaints per victim age group',
    width=1000,
    height=500
)

In [None]:
#tabulate the complaints per victim age group together with each group's share of the total (in percent)
vic_age_counts2 = (
    data.groupby('VIC_AGE_GROUP')
    .size()
    .reset_index(name='complaint_count')
)
age_complaints_all = vic_age_counts2['complaint_count']
vic_age_counts2['complaint_percentage'] = 100 * age_complaints_all / age_complaints_all.sum()
vic_age_counts2

In [None]:
#create a new column where unknown values are removed
#the victim column is used as the source: the original read SUSP_AGE_GROUP here,
#which filled the "victim" column (and the charts/tables built on it) with suspect ages
data['VIC_AGE_GROUP2'] = numpy.where(data['VIC_AGE_GROUP'].str.contains("UNKNOWN"), None, data['VIC_AGE_GROUP'])



In [None]:
# count the number of complaints per victim age group, using the column without the unknown category
vic_age_count2 = data['VIC_AGE_GROUP2'].value_counts()

# Name both columns explicitly: a bare reset_index() yields the columns
# ('index', 'VIC_AGE_GROUP2') on pandas < 2.0 but ('VIC_AGE_GROUP2', 'count') on pandas >= 2.0,
# so encodings that refer to 'index' stop working after an upgrade.
vic_age_chart_data2 = vic_age_count2.rename_axis('victim_age_group').reset_index(name='complaint_count')

# order in which the age groups are laid out on the x axis
categoryNames    = [ '<18', '18-24', '25-44', '45-64', '65+' ]

# using altair plot the number of complaints per VICTIM AGE GROUP in a bar chart without the unknown category
alt.Chart(vic_age_chart_data2).mark_bar().encode(
    x = alt.X('victim_age_group', sort = categoryNames, axis= alt.Axis(title = 'Victim Age Group')),
    y = alt.Y('complaint_count',axis= alt.Axis(format='e', title = 'Number of Complaints'))
).properties(
    title='Number of complaints per victim age group',
    width=1000,
    height=500
)

In [None]:
#tabulate the complaints per VIC_AGE_GROUP2 value (unknown category removed) with each group's share in percent
vic_age_counts = (
    data.groupby('VIC_AGE_GROUP2')
    .size()
    .reset_index(name='complaint_count')
)
age_complaints_known = vic_age_counts['complaint_count']
vic_age_counts['complaint_percentage'] = 100 * age_complaints_known / age_complaints_known.sum()
vic_age_counts

In [None]:
#extract the calendar year from the 'OCC' column (uses the .dt accessor, so 'OCC' must be datetime-like)
data['Year']=data['OCC'].dt.year 

In [None]:
#turn year into a string; later cells filter on it with a string comparison (data['Year'] == '2010')
data['Year'] = data['Year'].astype(str)

In [None]:
#count the number of complaints (rows) per year
year_counts = data.groupby('Year').size().reset_index(name='complaint_count')

In [None]:
#display the per-year complaint counts
year_counts

In [None]:
#count the complaints for every (year, victim sex) combination and fix the
#column types in one step: year and complaint count as integers, victim sex as a string
year_counts_sex = (
    data.groupby(['Year', 'VIC_SEX'])
    .size()
    .reset_index(name='complaint_count')
    .astype({'Year': int, 'complaint_count': int, 'VIC_SEX': str})
)


In [None]:
#line chart of the number of complaints per victim sex per year
#NOTE(review): stack="zero" is set explicitly on the y channel - confirm that stacked lines are intended
sex_year_x = alt.X('Year:O', axis=alt.Axis(title='Year'))
sex_year_y = alt.Y('complaint_count:Q', stack="zero", axis=alt.Axis(title='Complaint Count'))
sex_year_color = alt.Color('VIC_SEX:O', scale=alt.Scale(scheme='lighttealblue'), title='Sex')

alt.Chart(year_counts_sex).mark_line().encode(
    x=sex_year_x,
    y=sex_year_y,
    color=sex_year_color,
).properties(
    title='Number of complaints per victim sex per year',
    width=1000,
    height=500,
)

In [None]:
#count the complaints for every (year, victim race) combination and fix the
#column types in one step: year and complaint count as integers, victim race as a string
year_counts_race = (
    data.groupby(['Year', 'VIC_RACE'])
    .size()
    .reset_index(name='complaint_count')
    .astype({'Year': int, 'complaint_count': int, 'VIC_RACE': str})
)


In [None]:
#line chart of the number of complaints per victim race per year
#NOTE(review): stack="zero" is set explicitly on the y channel - confirm that stacked lines are intended
race_year_x = alt.X('Year:O', axis=alt.Axis(title='Year'))
race_year_y = alt.Y('complaint_count:Q', stack="zero", axis=alt.Axis(title='Complaint Count'))
race_year_color = alt.Color('VIC_RACE:O', scale=alt.Scale(scheme='lighttealblue'), title='Race')

alt.Chart(year_counts_race).mark_line().encode(
    x=race_year_x,
    y=race_year_y,
    color=race_year_color,
).properties(
    title='Number of complaints per victim race per year',
    width=1000,
    height=500,
)

In [None]:
#count the complaints for every (year, victim age group) combination and fix the
#column types in one step: year and complaint count as integers, age group as a string
year_counts_age = (
    data.groupby(['Year', 'VIC_AGE_GROUP'])
    .size()
    .reset_index(name='complaint_count')
    .astype({'Year': int, 'complaint_count': int, 'VIC_AGE_GROUP': str})
)


In [None]:
#line chart of the number of complaints per victim age group per year
#NOTE(review): stack="zero" is set explicitly on the y channel - confirm that stacked lines are intended
age_year_x = alt.X('Year:O', axis=alt.Axis(title='Year'))
age_year_y = alt.Y('complaint_count:Q', stack="zero", axis=alt.Axis(title='Complaint Count'))
age_year_color = alt.Color('VIC_AGE_GROUP:O', scale=alt.Scale(scheme='lighttealblue'), title='Age Group')

alt.Chart(year_counts_age).mark_line().encode(
    x=age_year_x,
    y=age_year_y,
    color=age_year_color,
).properties(
    title='Number of complaints per age group per year',
    width=1000,
    height=500,
)

In [None]:
#make a new dataset with only the rows where victim sex is U
data6 = data.loc[data['VIC_SEX'] == 'U']

In [None]:
#make a new dataset with only the rows where victim sex is D
#note: the name data2 is reassigned to a list further down in the notebook
data2 =data.loc[data['VIC_SEX'] == 'D']

In [None]:
#make a new dataset with only the rows where victim sex is E
#note: the name data3 is reassigned to a list further down in the notebook
data3 = data.loc[data['VIC_SEX'] == 'E']

In [None]:
#display the rows with victim sex D
data2

In [None]:
#count the victim race values of the rows with sex category D
data2['VIC_RACE'].value_counts()

In [None]:
#count the victim race values of the rows with sex category E
data3['VIC_RACE'].value_counts()

In [None]:
#count the victim race values of the rows with sex category U
data6['VIC_RACE'].value_counts()

In [None]:
#only select data in year 2010, as that is where the population census data is from. Done for the chi square test
#'Year' was converted to a string earlier, hence the comparison with the string '2010'
demographic = data.loc[data['Year'] == '2010']


In [None]:
#display the 2010 subset
demographic

In [None]:
#count the number of 2010 complaints per victim sex ('demographic' holds only the rows with Year == '2010')
sex_demographic = demographic.groupby(['VIC_SEX']).size().reset_index(name='complaint_count')

In [None]:
#keep only the rows (not columns) that are compared with population figures: D, F and M.
#The rows are selected by their VIC_SEX label instead of by row position ([1, 4]),
#so the selection stays correct if the set or order of categories in the grouped table changes.
#.copy() makes the result an independent table that later cells can add columns to.
sex_demographic1 = sex_demographic[sex_demographic['VIC_SEX'].isin(['D', 'F', 'M'])].copy()
sex_demographic1

In [None]:
#actual nyc population data, one value per row of sex_demographic1 (the list is assigned positionally)
#D: based on https://www.nyc.gov/assets/smallbizfirst/downloads/pdf/small-business-first-report.pdf, where it is stated that there are 220000 businesses in new york city
#male and female distribution: based on https://data.cityofnewyork.us/City-Government/Census-Demographics-at-the-NYC-City-Council-distri/ye4r-qpmp
#NOTE(review): presumably the rows are ordered D, F, M - confirm which of the last two figures is female and which is male
sex_demographic1['SEX_DEM'] = [220000, 4214074, 3794204]

In [None]:
#derive the expected number of complaints per category from the population figures:
#population share per category, scaled to the total number of complaints, rounded to whole complaints
sex_population_total = sex_demographic1['SEX_DEM'].sum()
sex_demographic1['SEX_DEM_PER'] = sex_demographic1['SEX_DEM'] / sex_population_total
sex_complaint_total = sex_demographic1['complaint_count'].sum()
sex_demographic1['EXP_COM'] = (sex_demographic1['SEX_DEM_PER'] * sex_complaint_total).round(0)
sex_demographic1

In [None]:
#create a table with only the relevant values: observed complaint counts and expected complaint counts.
#EXP_COM (population share scaled to the complaint total) is used instead of the raw population
#figures in SEX_DEM, so both rows are on the same scale, as in the table built for victim race.
#note: this reassigns data2, which earlier held the rows with VIC_SEX == 'D'
data2 = [sex_demographic1['complaint_count'],sex_demographic1['EXP_COM']]

In [None]:
#chi square goodness-of-fit test: observed complaints per victim sex against the counts
#expected from the population shares.
#chisquare() takes the observed and the expected frequencies as separate arguments; given a
#two-row table as f_obs alone it runs a separate test on each column instead.
#The unrounded expected counts are used because scipy requires the observed and expected
#totals to agree, which rounding can break.
sex_observed = sex_demographic1['complaint_count']
sex_expected = sex_demographic1['SEX_DEM_PER'] * sex_observed.sum()
scipy.stats.chisquare(f_obs=sex_observed, f_exp=sex_expected)


In [None]:
#count the number of 2010 complaints per victim race, using VIC_RACE2 (the race column without the unknown values)
race_demographic = demographic.groupby(['VIC_RACE2']).size().reset_index(name='complaint_count')

In [None]:
#display the complaint counts per victim race
race_demographic

In [None]:
#actual race demographics in nyc, one value per row of race_demographic (the list is assigned positionally)
#american indian/alaskan native: from https://slate.com/news-and-politics/2015/04/staten-island-for-sale-the-munsee-indians-sold-staten-island-under-duress-but-not-before-new-york-made-some-surprising-concessions.html
#other races: based on https://data.cityofnewyork.us/City-Government/Census-Demographics-at-the-NYC-City-Council-distri/ye4r-qpmp
#black/white hispanic composition: based on https://furmancenter.org/files/sotc/The_Changing_Racial_and_Ethnic_Makeup_of_New_York_City_Neighborhoods_11.pdf
#NOTE(review): assumes race_demographic has exactly six rows in the order these figures were written - confirm against the table displayed above
race_demographic['RACE_DEM'] = [111000, 783058, 1962154, 103707, 2801267, 198771
]

In [None]:
#display the table with the population figures added
race_demographic

In [None]:
#derive the expected number of complaints per category from the population figures:
#population share per category, scaled to the total number of complaints, rounded to whole complaints
race_population_total = race_demographic['RACE_DEM'].sum()
race_demographic['RACE_DEM_PER'] = race_demographic['RACE_DEM'] / race_population_total
race_complaint_total = race_demographic['complaint_count'].sum()
race_demographic['EXP_COM'] = (race_demographic['RACE_DEM_PER'] * race_complaint_total).round(0)

In [None]:
#display the race table including the population shares and expected complaint counts
race_demographic

In [None]:
#create a table with only the relevant values: observed complaint counts and the (rounded) expected counts
#note: this reassigns data3, which earlier held the rows with VIC_SEX == 'E'
data3 = [race_demographic['complaint_count'],race_demographic['EXP_COM']]

In [None]:
#chi square goodness-of-fit test: observed complaints per victim race against the counts
#expected from the population shares.
#chisquare() takes the observed and the expected frequencies as separate arguments; given a
#two-row table as f_obs alone it runs a separate test on each column instead.
#The unrounded expected counts are used because scipy requires the observed and expected
#totals to agree, which rounding can break.
race_observed = race_demographic['complaint_count']
race_expected = race_demographic['RACE_DEM_PER'] * race_observed.sum()
scipy.stats.chisquare(f_obs=race_observed, f_exp=race_expected)


In [None]:
#count the number of 2010 complaints per victim age group
#NOTE(review): the drop of row 5 below is commented out, yet the next cell assigns exactly five population values - confirm the grouped table has five rows
age_demographic = demographic.groupby(['VIC_AGE_GROUP']).size().reset_index(name='complaint_count')
#age_demographic = age_demographic.drop([5])

In [None]:
#actual age demographics in nyc, one value per row of age_demographic (the list is assigned positionally)
#NOTE(review): no source is cited for these figures and the row order they assume is not visible here - confirm both
age_demographic['AGE_DEM'] = [803012, 2631301, 1695839, 937857, 1940269]

In [None]:
#derive the expected number of complaints per category from the population figures:
#population share per category, scaled to the total number of complaints, rounded to whole complaints
age_population_total = age_demographic['AGE_DEM'].sum()
age_demographic['AGE_DEM_PER'] = age_demographic['AGE_DEM'] / age_population_total
age_complaint_total = age_demographic['complaint_count'].sum()
age_demographic['EXP_COM'] = (age_demographic['AGE_DEM_PER'] * age_complaint_total).round(0)

In [None]:
#create a table with only the relevant values: observed complaint counts and expected complaint counts.
#EXP_COM (population share scaled to the complaint total) is used instead of the raw population
#figures in AGE_DEM, so both rows are on the same scale, as in the table built for victim race.
data4 = [age_demographic['complaint_count'],age_demographic['EXP_COM']]

In [None]:
#display the age group table including the population shares and expected complaint counts
age_demographic

In [None]:
#chi square goodness-of-fit test: observed complaints per victim age group against the counts
#expected from the population shares.
#The original call tested data3, the victim race table, so the age group figures were never tested.
#chisquare() takes the observed and the expected frequencies as separate arguments, and the
#unrounded expected counts are used because scipy requires the observed and expected totals
#to agree, which rounding can break.
age_observed = age_demographic['complaint_count']
age_expected = age_demographic['AGE_DEM_PER'] * age_observed.sum()
scipy.stats.chisquare(f_obs=age_observed, f_exp=age_expected)


In [None]:
#only look at the rows where victim race is exactly 'UNKNOWN'
data5 =data.loc[data['VIC_RACE'] == 'UNKNOWN']

In [None]:
#count the victim sex values of the rows whose victim race is unknown
data5['VIC_SEX'].value_counts()