In [1]:
from scipy import stats
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
%matplotlib inline

### data prep 

In [2]:
# Load the full SFPD incident export (2003 - May 2018) from the working directory.
crime_data = pd.read_csv(
    "Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv"
)

In [3]:
# Part 3 only uses incidents from 2010-2018, so the data is filtered once here.

# `Date` holds MM/DD/YYYY strings; an explicit format avoids per-row format
# inference on the full data set and fails loudly on malformed dates.
crime_data['Date1'] = pd.to_datetime(crime_data['Date'], format='%m/%d/%Y')
crime_data2 = crime_data[crime_data['Date1'].dt.year.between(2010, 2018)]

# Restrict to the focus crimes. `.copy()` makes this an independent frame, so the
# column added to `crime_data1` in Part 3 does not trigger SettingWithCopyWarning.
crime_data1 = crime_data2[
    crime_data2['Category'].isin([
        'WEAPON LAWS', 'PROSTITUTION', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY',
        'BURGLARY', 'ASSAULT', 'DRUNKENNESS', 'DRUG/NARCOTIC', 'TRESPASS',
        'LARCENY/THEFT', 'VANDALISM', 'VEHICLE THEFT', 'STOLEN PROPERTY',
        'DISORDERLY CONDUCT',
    ])
].copy()

crime_data1

Unnamed: 0,IncidntNum,Category,Descript,DayOfWeek,Date,Time,PdDistrict,Resolution,Address,X,Y,Location,PdId,Date1
1,150045675,ASSAULT,BATTERY,Thursday,01/15/2015,17:00,TARAVAL,NONE,1800 Block of VICENTE ST,-122.485604,37.738821,POINT (-122.48560378101 37.7388214326705),15004567504134,2015-01-15
3,150383259,ASSAULT,BATTERY,Saturday,05/02/2015,23:10,BAYVIEW,"ARREST, BOOKED",2400 Block of PHELPS ST,-122.400131,37.730093,POINT (-122.400130573297 37.7300925390327),15038325904134,2015-05-02
9,111027676,ASSAULT,BATTERY,Saturday,12/24/2011,07:00,SOUTHERN,NONE,0 Block of DORE ST,-122.412933,37.773927,POINT (-122.412933062384 37.7739274524819),11102767604134,2011-12-24
18,120444392,ASSAULT,BATTERY,Tuesday,06/05/2012,11:47,RICHMOND,NONE,500 Block of ARGUELLO BL,-122.458725,37.780280,POINT (-122.458724812805 37.7802795296617),12044439204134,2012-06-05
21,180210729,ASSAULT,BATTERY,Tuesday,03/20/2018,18:20,NORTHERN,NONE,POST ST / GOUGH ST,-122.424860,37.786196,POINT (-122.42485988902867 37.78619644725339),18021072904134,2018-03-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2215014,170126722,ASSAULT,BATTERY,Monday,02/13/2017,19:50,NORTHERN,NONE,MCALLISTER ST / FRANKLIN ST,-122.421893,37.779891,POINT (-122.4218931344669 37.77989123361396),17012672204134,2017-02-13
2215015,160511161,LARCENY/THEFT,PETTY THEFT OF PROPERTY,Friday,06/24/2016,10:10,TARAVAL,NONE,500 Block of JUDAH ST,-122.467791,37.762008,POINT (-122.46779091121964 37.76200811098795),16051116106372,2016-06-24
2215017,160442986,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,05/30/2016,19:10,CENTRAL,NONE,BAY ST / POWELL ST,-122.411953,37.805824,POINT (-122.41195308361146 37.80582429492737),16044298606244,2016-05-30
2215018,166167891,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Friday,07/22/2016,09:30,PARK,NONE,14TH ST / NOE ST,-122.433409,37.767460,POINT (-122.43340929426132 37.767459890623755),16616789106244,2016-07-22


# Part 1: Questions to text and lectures.

# Part 2: Random forest and weather

## Part 2A: Random forest binary classification

The two types of crime we will focus on here are *fraud* and *theft*.

In [4]:
# Pull out the two classes for the binary classifier and report their sizes.
fraud = crime_data.loc[crime_data["Category"] == "FRAUD"]
theft = crime_data.loc[crime_data["Category"] == "LARCENY/THEFT"]
print("Number of fraud crimes: {}".format(fraud.shape[0]))
print("Number of theft crimes: {}".format(theft.shape[0]))

Number of fraud crimes: 41542
Number of theft crimes: 480448


Because there is such a large difference in the number of occurrences of the two crime types, we will start by balancing the data set. This is done here by downsampling the theft crimes to the number of fraud crimes.<br>
The following code is based on the example provided here: https://elitedatascience.com/imbalanced-classes

In [5]:
from sklearn.utils import resample

In [6]:
# Downsample the majority class (theft) without replacement to the size of the
# minority class (fraud), then stack both classes into one balanced frame.
theft = resample(
    theft,
    replace=False,
    n_samples=len(fraud),
    random_state=123,
)
crime_data3 = pd.concat((theft, fraud))

The next step is to select the desired features. The first features focus on the "where"; the following features will be included.<br>
- **police district**: This will be represented by a one-hot encoding of *PdDistrict*, as there is no hierarchical relationship between the different districts.
- **X**: The longitude of the location of the crime. No transformation is applied.
- **Y**: The latitude of the location of the crime. No transformation is applied.

In [7]:
# Remove identifier and free-text columns that are not used as features.
crime_data3 = crime_data3.drop(
    columns=['IncidntNum', 'Descript', 'Resolution', 'Address']
)

In [8]:
# One-hot encode the police district, then drop the raw district column together
# with the remaining identifier/location columns.
crime_data3 = (
    crime_data3
    .join(pd.get_dummies(crime_data3['PdDistrict']))
    .drop(columns=['PdDistrict', 'PdId', 'Location'])
)


The second round of features focus on the "when", where the following features will be included.<br>
- **Month**: This is obtained from *Date* attribute.
- **Hour of the week**: This is obtained by combining the information from the *Time* and *Weekday* attributes of the original data set.

In [9]:
# Derive the month from the parsed date column; the raw `Date` string is then
# no longer needed.
crime_data3 = (
    crime_data3
    .assign(Month=crime_data3['Date1'].dt.month)
    .drop(columns=['Date'])
)

In [10]:
# Map each weekday name to its position in the week (Monday=0 ... Sunday=6).
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sorterIndex = {day: position for position, day in enumerate(days)}

# Hour of the week = 24 * weekday index + hour of day, giving values 0..167
# (Monday 00:xx -> 0, Sunday 23:xx -> 167). The previous version added 1 to both
# terms, which shifted the range to 25..192. The shift is constant, so tree-based
# models split the data the same way; only the feature's readability changes.
# `Time` holds HH:MM strings, so the format is given explicitly.
crime_data3['hourOfTheWeek'] = (
    24 * crime_data3['DayOfWeek'].map(sorterIndex)
    + pd.to_datetime(crime_data3['Time'], format='%H:%M').dt.hour
)
crime_data3 = crime_data3.drop(columns=['DayOfWeek'])

The data set is now divided into a training and a test set. The split is made using *train_test_split* from sklearn, stratified on the class label so that the two classes are distributed equally in both the training and the test set.

In [11]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 split. The feature frames still contain `Category` (the target)
# as a column at this point; it is removed before fitting.
X_train, X_test, y_train, y_test = train_test_split(
    crime_data3,
    crime_data3['Category'],
    test_size=0.3,
    random_state=0,
    stratify=crime_data3['Category'],
)

In [12]:
# Report the size of each partition; the second line describes the test set
# (it was previously mislabelled as "training samples").
print("Number of training samples: {}".format(len(X_train)))
print("Number of test samples: {}".format(len(X_test)))

Number of training samples: 37854
Number of training samples: 24926


We are now ready to build the random forest classifier itself.<br>
The classifier is trained without cross-validation, as the data set is expected to be large enough that part of the data can be held out completely as test data. The max_depth of the trees is set to 25 to keep the trees from becoming too overfitted to the training data. However, the method itself should help prevent a large degree of overfitting.

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline model on the spatial and temporal features only.
# `Date1` and `Time` stay in the split frames so they can be merged with the
# weather data later, and `Category` is the target, so all three are removed
# before fitting and scoring.
RFClassifier = RandomForestClassifier(
    random_state=3,
    criterion="entropy",
    max_depth=25,
).fit(X_train.drop(columns=["Date1", "Time", "Category"]), y_train)

print("accuracy: {:.2f}%".format(
    100 * RFClassifier.score(X_test.drop(columns=["Date1", "Time", "Category"]), y_test)
))

Based on the accuracy we can see that the random forest classifier does perform better than the 50% baseline corresponding to random guessing. However, it is also clear that it is not a very precise classifier; this could indicate that the two crime types do not have very distinct spatio-temporal features.

## Part 2B: Info from weather features

We start by downloading the data.

In [None]:
# Download the hourly weather data and cache it next to the notebook.
# `requests` is already imported in the first cell.
file = requests.get(
    'https://raw.githubusercontent.com/suneman/socialdataanalysis2020/master/files/Data_files/weather_data.csv',
    timeout=30,  # without a timeout a stalled connection would hang the notebook
)
file.raise_for_status()  # do not write an HTTP error page to disk as if it were CSV

# The context manager guarantees the file is flushed and closed before it is read back.
with open('weather_data.csv', 'wb') as weather_file:
    weather_file.write(file.content)

weather_data = pd.read_csv("weather_data.csv")

The weather data contains the following features.

In [None]:
# Show the column labels of the weather data frame loaded above.
weather_data.columns

- **date:** The date information will be used to attach the weather data to the correct crime observations. However, any information concerning the time of the crime has already been included in the features selected from the crime data set.
- **temperature:** The temperature information will be kept, and since it is already given as decimal numbers no transformation will be applied.
- **humidity:** The humidity will also be included, as it might be an indication of how comfortable it is to be outside. No transformation will be applied as it is given as decimal numbers.  
- **weather:** The weather feature describes the overall weather and is therefore an essential feature. However, the feature is categorical and will therefore be one-hot encoded.
- **wind_speed:** The wind speed will be included, without any form of transformation.  
- **wind_direction:** The wind direction will not be included, as it does not seem likely to be connected to the type of crime that will happen.
- **pressure:** The pressure is not expected to influence human behaviour and is therefore not included.

In [None]:
# One-hot encode the categorical `weather` column, then discard the raw column
# together with the two features that were excluded from the model.
weather_data = (
    weather_data
    .join(pd.get_dummies(weather_data['weather']))
    .drop(columns=['weather', 'wind_direction', 'pressure'])
)

The weather information is now added to the dataset.

In [None]:
def _attach_weather(samples, weather):
    """Join hourly weather observations onto a set of crime samples.

    Parameters
    ----------
    samples : pandas.DataFrame
        Crime samples containing `Date1`, `Time` (HH:MM strings) and `Category`.
    weather : pandas.DataFrame
        Weather observations with a `date` column (calendar date) and an `hour` column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The merged feature frame without the join keys and without the target,
        and the matching `Category` labels.

    Notes
    -----
    The merge is an inner join, so samples without a weather observation for
    their date and hour are dropped.
    """
    # `assign` builds a new frame instead of writing into the frames returned by
    # train_test_split, which avoids SettingWithCopyWarning.
    keyed = samples.assign(
        Time=pd.to_datetime(samples['Time'], format='%H:%M').dt.hour,
        Date1=pd.to_datetime(samples['Date1']).dt.date,
    )
    merged = keyed.merge(
        weather, how='inner', left_on=['Date1', 'Time'], right_on=['date', 'hour']
    )
    labels = merged['Category']
    features = merged.drop(columns=['Time', 'hour', 'date', 'Date1', 'Category'])
    return features, labels


# Split the weather timestamp into a calendar date and an hour so that it matches
# the (date, hour) key built from the crime data.
# NOTE: this cell is not idempotent - it overwrites `weather_data['date']` and the
# train/test frames, so re-running it requires re-running the cells above first.
weather_data['date'] = pd.to_datetime(weather_data['date'])
weather_data['hour'] = weather_data['date'].dt.hour
weather_data['date'] = weather_data['date'].dt.date

# Merge train and test through the same code path so they cannot drift apart.
X_test, y_test = _attach_weather(X_test, weather_data)
X_train, y_train = _attach_weather(X_train, weather_data)

We will now check the size of the training and test sets after the weather information has been added.

In [None]:
# Row counts of both partitions after the inner merge with the weather data.
print("Number of training samples: {}".format(X_train.shape[0]))
print("Number of test samples {}".format(X_test.shape[0]))

We are now ready to train our random forest classifier with the additional weather features.

In [None]:
# Same hyper-parameters as the baseline forest, now trained on the frames that
# include the weather columns. The join keys and the target were already removed
# in the merge cell, so the frames are passed as they are.
WeatherRFClassifier = RandomForestClassifier(
    random_state=3,
    criterion="entropy",
    max_depth=25,
).fit(X_train, y_train)

print("accuracy: {:.2f}%".format(100 * WeatherRFClassifier.score(X_test, y_test)))

We can see that the performance was only minimally improved by introducing the weather data, with the previous accuracy being 60.13%.<br> 
What this very small performance increase indicates is that the weather does not affect the occurrences of *theft* and *fraud* in very different ways, and the information therefore does not appear particularly useful when it comes to predicting or classifying crime types. However, it is still possible that weather information could influence the overall crime activity, and it might therefore be useful for predicting the number of expected crimes.<br>

# Part 3: Data visualization

In [None]:
# Hour of day (0-23) of every focus-crime incident.
# `assign` returns a new frame rather than writing a column into `crime_data1`,
# which is a filtered subset of the full data; this avoids SettingWithCopyWarning.
# `Time` holds HH:MM strings, so the format is given explicitly.
crime_data1 = crime_data1.assign(
    New_Time=pd.to_datetime(crime_data1['Time'], format='%H:%M').dt.hour
)

# Number of incidents per (hour, category) pair.
crime_focus_count_hour = crime_data1.groupby(['New_Time', 'Category'])['IncidntNum'].count()
crime_focus_count_hour

In [None]:
# Pivot to one row per hour and one column per crime category.
# fill_value=0 keeps hour/category pairs without incidents as 0 instead of NaN.
new_df = crime_focus_count_hour.unstack(fill_value=0)
cols = list(new_df.columns)

# Normalise each crime type by its own total, so every column sums to 1 and shows
# how that crime is distributed over the day ("count per hour / total counts").
# The previous version divided each row by the hourly total across all crimes,
# which gives the crime mix within an hour rather than the hourly distribution
# of each crime.
new_df = new_df.div(new_df.sum(axis=0), axis=1)
new_df

In [None]:
# Bokeh pieces used for the interactive bar chart.
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource, FactorRange, Legend
from bokeh.palettes import viridis
from bokeh.plotting import figure

# Send Bokeh output to the notebook.
output_notebook()

In [None]:
# The figure uses a categorical x-range whose factors are the strings "0".."23".
# The hour index of `new_df` is numeric, and numeric x values on a categorical
# axis are treated as raw coordinates rather than factor names, which puts the
# bars off the factor positions. Converting the index labels to strings makes
# `New_Time` match the factors.
src = ColumnDataSource(new_df.rename(index=str))
src

In [None]:

# Empty figure with one categorical x position per hour of the day.
# `width`/`height` are used instead of `plot_width`/`plot_height`, which were
# removed in Bokeh 3.0; the shorter names are accepted by older versions as well.
p = figure(
    title='crime distribution',
    x_range=FactorRange(factors=[str(hour) for hour in range(24)]),
    width=1000,
    height=500,
    y_axis_label="count per hour/total counts",
    x_axis_label="hours",
    tools="pan,wheel_zoom,box_select,reset,save",
)


In [None]:
bar = {}
items = []
focuscrimes = set(['WEAPON LAWS', 'PROSTITUTION', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'BURGLARY', 'ASSAULT', 'DRUNKENNESS', 'DRUG/NARCOTIC', 'TRESPASS', 'LARCENY/THEFT', 'VANDALISM', 'VEHICLE THEFT', 'STOLEN PROPERTY', 'DISORDERLY CONDUCT'])
bar_colors = viridis(len(focuscrimes))

# Iterate in sorted order: the iteration order of a set of strings can differ
# between kernel sessions, which would shuffle colours and legend entries from
# run to run.
for indx, i in enumerate(sorted(focuscrimes)):
    # One bar series per crime type, all drawn from the shared data source.
    bar[i] = p.vbar(x='New_Time', top=i, source=src, width=0.5,
                    color=bar_colors[indx], muted_alpha=0.6, alpha=0.8, muted=False)
    items.append((i, [bar[i]]))

# Build the legend once, after all renderers exist (it used to be rebuilt on
# every loop iteration).
legend = Legend(items=items, location=(0, 0))
# `muted_alpha` only has an effect when clicking a legend entry mutes its glyph.
legend.click_policy = "mute"

p.add_layout(legend, 'right')
show(p)

### Explanation of code

The goal here is to make an interactive visualization of our chosen crime sub-dataset from 2010 to 2018. We normalized the data by dividing each crime's counts per hour by their total counts. We simply group the dataset by hour and category and unstack the result. 
The code below performs the normalization; it is based on the approach found [here](https://stackoverflow.com/questions/42006346/pandas-convert-columns-to-percentages-of-the-totals)
```python
new_df[cols] = new_df[cols].div(new_df[cols].sum(axis=1), axis=0) 
```
Afterwards we use the bokeh package to make the plot. We create an empty figure and define its tools with bokeh.plotting.figure. Here the x-range is our x-axis, from 0 o'clock to 23 o'clock.
Finally, we fill the figure using p.vbar:
```python
bar_colors= bokeh.palettes.viridis(len(focuscrimes))
color = bar_colors[indx]
# one colour is assigned to each crime type, respectively.
```
By following the steps from part 2 of week 8, we also fix the position of the legend.