In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import rcParams
import matplotlib as plt
import plotly.express as px
import folium
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster

# Load the raw DC crime records from the local CSV export.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
crimes = pd.read_csv(
    filepath_or_buffer=r'C:\Users\user\OneDrive\Desktop\PCPP\Project\dc_crime_add_vars.csv'
)

# Preview the first three rows.
crimes.head(n=3)

In [None]:
# Work on a deep copy so the raw `crimes` frame stays untouched.
crimes_df = crimes.copy(deep=True)

# 'REPORT_DAT' holds strings such as "1/15/2015 11:25:00 AM"; keep only the
# date part (the text before the first space) in a new 'Report Dates' column.
# Kept as a plain list in `report_date`, as before.
report_date = crimes_df['REPORT_DAT'].str.split(" ").str[0].tolist()
crimes_df['Report Dates'] = report_date

# Drop unrelated columns, then order the rows chronologically.
# The dates are month/day/year strings, so a plain string sort is lexicographic
# (e.g. "1/10/2015" would come before "1/2/2014"). Parse them for the sort key
# instead. Values that do not match the format become NaT and are placed last.
# NOTE(review): assumes every REPORT_DAT follows the month/day/year layout of the
# sample value above — confirm against the data.
crimes_df = (
    crimes_df
    .drop(["Unnamed: 0", 'X'], axis=1)
    .sort_values(
        by='Report Dates',
        key=lambda dates: pd.to_datetime(dates, format='%m/%d/%Y', errors='coerce'),
    )
    .reset_index(drop=True)
)
crimes_df.head(3).T

In [None]:
# (rows, columns) of the cleaned frame.
crimes_df.shape

In [None]:
# Helper column holding one per row.
# Stored as the integer 1 rather than the string '1' so the column can be summed
# to count records.
crimes_df['count'] = 1
crimes_df.head(2)

In [None]:
# Rename YBLOCK / XBLOCK to Latitude / Longitude.
crimes_df = crimes_df.rename(columns={'YBLOCK': 'Latitude', 'XBLOCK': 'Longitude'})

# Build one "latitude,longitude" string per row.
# Done column-wise instead of with an iterrows() loop, which is much slower on
# large frames; `points` is still exposed as a plain list.
points = (
    crimes_df['Latitude'].astype(str) + "," + crimes_df['Longitude'].astype(str)
).tolist()

# Store the strings as the 'Geopoints' column.
crimes_df['Geopoints'] = points
crimes_df.head(3)

In [None]:
# Bar chart: number of records per value of 'year'.
trend = sns.catplot(
    data=crimes_df,
    x='year',
    kind='count',
    height=6,
    color='steelblue',
)
trend = trend.set(title="Crime Trends Over the Years")
trend.set_axis_labels('Year', 'Number of Crimes')

"""Observations
- The year with the most recorded crimes is 2014.
- Crime counts generally trend upward over the years; the 2017 data is incomplete, as it ends on 9/9/2017.
"""

In [None]:
# Keep only the 2017 records and show the last few, transposed so every column is visible.
yr_2017 = crimes_df[crimes_df['year'] == 2017]
yr_2017.tail().T

In [None]:
# Bar chart: number of records per OFFENSE value.
trend2 = sns.catplot(
    data=crimes_df,
    x='OFFENSE',
    kind='count',
    height=6,
    color='steelblue',
)
trend2 = trend2.set(title="Common Crime Types")
trend2.set_axis_labels('Crime Types', 'Crime Rate')
# Rotate the category labels so they do not overlap.
trend2.set_xticklabels(rotation=90)

In [None]:
# Bar chart: number of records per METHOD value.
trend3 = sns.catplot(
    data=crimes_df,
    x='METHOD',
    kind='count',
    height=6,
    color='steelblue',
)
trend3 = trend3.set(title="Common Crime Methods Used")
trend3.set_axis_labels('Crime Methods', 'Crime rate')
# Rotate the category labels so they do not overlap.
trend3.set_xticklabels(rotation=90)

In [None]:
# Bar chart: number of records per SHIFT value.
trend4 = sns.catplot(
    data=crimes_df,
    x='SHIFT',
    kind='count',
    height=6,
    color='steelblue',
)
trend4 = trend4.set(title="Crime and Shifts")
trend4.set_axis_labels('Shifts', 'Crime Rate')
# Rotate the category labels so they do not overlap.
trend4.set_xticklabels(rotation=90)

In [None]:
# Bar chart: number of records per PSA value.
trend5 = sns.catplot(
    data=crimes_df,
    x='PSA',
    kind='count',
    color='steelblue',
)
trend5 = trend5.set(title="Police Service Area VS Crime Rates")
trend5.set_axis_labels('No. of Police Serviced Areas', 'Crime Rate')
trend5.set_xticklabels(rotation=90)
# Resize the figure to 8 (width) x 12 (height) inches in one call.
trend5.fig.set_size_inches(8, 12)

In [None]:
# Number of missing values in each column.
crimes_df.isna().sum()

In [None]:
# Work on a copy, mark missing START_DATE values with the sentinel 0,
# then keep only the rows carrying that sentinel.
missing_df = crimes_df.copy()
missing_df['START_DATE'] = missing_df['START_DATE'].fillna(0)
missing_start = missing_df[missing_df['START_DATE'] == 0]
missing_start

In [None]:
# Split the rows with a missing START_DATE by their 'crimetype' label.
violent = missing_start[missing_start['crimetype'] == "Violent"]
non_violent = missing_start[missing_start['crimetype'] == "Non-Violent"]
violent.head(3)

In [None]:
# Bar chart: rows with a missing START_DATE, counted per crimetype.
missing_start_date = sns.catplot(
    data=missing_start,
    x='crimetype',
    kind='count',
    height=6,
    color='steelblue',
)
missing_start_date = missing_start_date.set(title="Data with Missing Start Date")
missing_start_date.set_axis_labels('crimetype', 'Crime Rate')


In [None]:
# Bar chart: OFFENSE counts among the non-violent rows with a missing START_DATE.
crime_type = sns.catplot(
    data=non_violent,
    x='OFFENSE',
    kind='count',
    height=6,
    color='steelblue',
)
crime_type = crime_type.set(title="Offense of Non-Violent cases in missing Start Dates")
crime_type.set_axis_labels('OFFENSE', 'Crime Rate')

In [None]:
# Bar chart: OFFENSE counts among the violent rows with a missing START_DATE.
# The title now says "Violent": it had been copied from the non-violent plot
# and mislabelled this chart.
crime_type = sns.catplot(
    x='OFFENSE',
    data=violent,
    kind='count',
    height=6,
    color='steelblue',
).set(title="Offense of Violent cases in missing Start Dates")
crime_type.set_axis_labels('OFFENSE', 'Crime Rate')

In [None]:
"""From the graphs above, my guess is that the case start dates are missing because the victims 
did not survive the assault, which may have resulted in other people reporting the crime at a later date.
For non-violent cases, the victim may have lost their means of communication during the crime, e.g. in thefts.
Although the amount of missing data is small (13 missing START_DATE values), it still provided interesting insights."""

In [None]:
# Where START_DATE is null, fall back to the value of REPORT_DAT from the same row.
crimes_df['START_DATE'] = crimes_df['START_DATE'].fillna(crimes_df['REPORT_DAT'])

In [None]:
# Spot-check the fill: show the record(s) reported at this timestamp, transposed.
crimes_df[crimes_df['REPORT_DAT'] == "1/15/2015 11:25:00 AM"].T

In [None]:
# Re-count the missing values per column after filling START_DATE.
crimes_df.isna().sum()

In [None]:
# Replace every remaining missing value with the string "N/A".
# NOTE(review): this also applies to numeric columns; any that contained nulls
# will now hold a mix of numbers and strings — confirm that is intended.
crimes_df = crimes_df.fillna(value="N/A")

# Every per-column null count should now be zero.
crimes_df.isna().sum()

In [None]:
# 2014 has the highest crime count in the dataset, so it is explored further
# (the 2017 data is incomplete and 2016 looks similar to 2015).
#
# The year is compared numerically: the earlier 2017 filter matches `year`
# against the integer 2017, so comparing against the string '2014' would select
# no rows. pd.to_numeric keeps the filter working even if the column now holds
# a mix of numbers and "N/A" strings.
is_2014 = pd.to_numeric(crimes_df['year'], errors='coerce') == 2014

# Build a fresh, re-indexed frame rather than resetting the index of a slice in place.
crimes_2014 = crimes_df.loc[is_2014].reset_index(drop=True)
"""Used to get the Neighbourhood and Postal code from the coordinates using geopy."""
"""Takes a very long time to run; the results are already saved in crimes_2014.csv."""
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="geoapiExercises")
# Neighbourhood = []
# Postal = []
# # Latitude & Longitude input
# for index, row in crimes_2014.iterrows():
# #     print(row['Latitude'])
#     location = geolocator.reverse(row['Geopoints'])
    
# #     print(location)
#     hood = location.raw.get('address').get('neighbourhood')
#     postal = location.raw.get('address').get('postcode')
# #     print(hood," ",postal)
#     Neighbourhood.append(hood)
#     Postal.append(postal)
#     #print for every 50 index
#     if index % 50 == 0:
#         print(index, Neighbourhood[-1],Postal[-1])

# crimes_2014['Neighbourhood'] = Neighbourhood
# crimes_2014['Postal'] = Postal

# #save file as crimes_2014.csv
# crimes_2014.to_csv('crimes_2014.csv', index=False)

"""end of code"""
"""Tree map data preparation"""
# Load the previously saved 2014 records (with Neighbourhood / Postal columns).
# NOTE(review): absolute, machine-specific path.
crimes_2014 = pd.read_csv(r'C:\Users\user\OneDrive\Desktop\PCPP\Project\crimes_2014.csv')

# - Neighbourhood: missing values become 'Others'
# - crimes: copy of the row index (used below as the colour value)
# - count: constant 1, summed by the treemap to size each tile
crimes_2014 = crimes_2014.assign(
    Neighbourhood=crimes_2014['Neighbourhood'].fillna('Others'),
    crimes=crimes_2014.index,
    count=1,
)

# Drop every row that still has a null in any column.
crimes_2014 = crimes_2014.dropna()

# Leave out the 'Others' bucket so the pattern in the treemap is clearer.
crimes_2014_hood = crimes_2014[crimes_2014['Neighbourhood'] != 'Others']


'''
Treemap was to find which neighbourhood has the most crime rates

'''

fig = px.treemap(
    crimes_2014_hood,
    path=[px.Constant('Washington DC Yr 2014'), 'DISTRICT', 'Neighbourhood', 'crimetype'],
    values='count',
    color='crimes',
    color_continuous_scale='RdBu_r',
)
# Hover text: tile label plus its summed count.
fig.data[0].hovertemplate = "%{label}<br>Crimes:%{value}"
fig.show()

In [None]:
# Heat map showing where violent crimes concentrate in Washington DC.
fmap = folium.Map(
    location=[38.89511, -77.03637],
    zoom_start=12,
    width=1000,
    height=400,
)

# Coordinates of the violent crimes, as [latitude, longitude] pairs.
violent_crime = crimes_df[crimes_df['crimetype'] == "Violent"]
lat = list(violent_crime['Latitude'])
lon = list(violent_crime['Longitude'])
latlong = [[lat_value, lon_value] for lat_value, lon_value in zip(lat, lon)]

HeatMap(
    latlong,
    radius=8,
    gradient={'0': 'Navy', '0.25': 'Blue', '0.5': 'Green', '0.75': 'Red', '1': 'Black'},
).add_to(fmap)
fmap


In [None]:
# Offense labels for the marker popups.
# `latlong` (built in the heat-map cell) holds only the violent crimes, so the
# labels must come from the same `violent_crime` frame. Taking them from the full
# `crimes_df` paired each marker with the offense of an unrelated row.
offense = list(violent_crime.OFFENSE)

cluster_map = folium.Map(location=[38.89511, -77.03637], height=400, width=1000, zoom_start=12)
marker_cluster = MarkerCluster().add_to(cluster_map)

# One clustered marker per violent crime; clicking it shows the offense type.
for point, offense_name in zip(latlong, offense):
    # NOTE(review): radius / color / fill_color look like circle-marker style options
    # and may have no visual effect on a plain Marker — confirm.
    folium.Marker(
        location=point,
        popup=offense_name,
        radius=3,
        tooltip="Click to view type of offense",
        color='#FFBA00',
        fill_color='#FFBA00'
    ).add_to(marker_cluster)

# The folium heat map and MarkerCluster give a clearer picture of where the crime
# clusters are than the treemap does.
cluster_map
