In [1]:
import pandas as pd
crime_data = pd.read_csv('http://cluster.earlham.edu/~apc0013/Chicago_crime.csv')

In [None]:
crime_data.head()

In [None]:
crime_data.dtypes


In [None]:
crime_data.shape

 ### Graphs 
 
 #### Graph 1 : %crimes by location description (10 most common locations) 
 #### Graph 2 : %crimes by crime type (10 most common crime types) 
 #### Graph 3 : %crimes by Description 
 #### Graph 4 : Most common Locations based on type of crime for top ten highest occuring crimes
 #### Graph 6: Top ten places that an arrest is likely to occur in Chicago

In [None]:
# Graph 1 : %crimes by location description (10 most common locations) 

import matplotlib.pyplot as plt
%matplotlib inline
percent = crime_data['Location Description'].value_counts(normalize=True).apply(lambda x: 100*x)[:15]
fig = plt.figure(figsize=(10,6)) 
ax = fig.gca()     
percent.plot.bar(ax = ax) 
ax.set_title('Percentage of crimes by Location Description for 15 most common locations') 
ax.set_xlabel('Location Description')
ax.set_ylabel('Percentage of Crimes')


In [None]:
# Graph 2 : %crimes by crime type (10 most common crime types) 
percent = crime_data['Primary Type'].value_counts(normalize=True).apply(lambda x: 100*x)[:15]
fig = plt.figure(figsize=(10,6)) 
ax = fig.gca()     
percent.plot.bar(ax = ax) 
ax.set_title('Percentage of crimes by Primary Type for 15 most common types') 
ax.set_xlabel('Primary type of Crime')
ax.set_ylabel('Percentage of Crimes')

In [None]:
# Graph 3 : %crimes by Description 

percent = crime_data['Description'].value_counts(normalize=True).apply(lambda x: 100*x)[:15]
fig = plt.figure(figsize=(10,6)) 
ax = fig.gca()     
percent.plot.bar(ax = ax) 
ax.set_title('Percentage of crimes by Description for 15 most common Descriptions') 
ax.set_xlabel('Crime Descrption')
ax.set_ylabel('Percentage of Crimes')

In [None]:
# Graph 4 : Most common Locations based on type of crime for top ten highest occuring crimes
import numpy as np
import seaborn as sns
location_by_type  = crime_data.pivot_table(values='ID', index='Location Description', columns='Primary Type', aggfunc=np.size).fillna(0)
columns =  location_by_type.sum().sort_values(ascending=False).keys()[:5]

def plot_pivot_location_by_type(pivot_table,row):
    fig = plt.figure(figsize=(10,6)) 
    ax = fig.gca()
    pivot_table[row].sort_values(ascending= False)[:10].plot(kind='barh').legend()
   

for col in columns:
    plot_pivot_location_by_type(location_by_type,col)


    

In [None]:
#Graph 6: Top ten places that an arrest is likely to occur in Chicago
foo = crime_data[['Location Description','Arrest']].groupby('Location Description').sum(axis=1).sort_values(by = 'Arrest',ascending = False)[:10]
foo.plot(kind='barh')


In [None]:
# crimes[['Year','Primary Type']].groupby('Primary Type').count()
year = crime_data['Year']
arrests_by_year_type = crime_data[crime_data['Year']>2015]
arrests_by_year_type = arrests_by_year_type.pivot_table('Arrest',index = 'Year',columns = 'Primary Type',aggfunc=np.size)
columns =  arrests_by_year_type.sum().sort_values(ascending=False).keys()[:5]

def plot_pivot_location_by_type(pivot_table,row):
    fig = plt.figure(figsize=(10,6)) 
    ax = fig.gca()
    pivot_table[[row]].sort_values(ascending= False)[:10].plot(kind='barh')

for col in columns:
    plot_pivot_location_by_type(arrests_by_year_type.unstack(),col)





### Data Processing

In [2]:
#picking primary types with > 5000 samples
primary_type = crime_data['Primary Type'].value_counts()[:23].keys()
crime_data = crime_data.loc[crime_data['Primary Type'].isin(primary_type)] 


In [3]:
# Converting 'Date' to datetime format
date_format = '%m/%d/%Y %I:%M:%S %p'
crime_data['Date'] = pd.to_datetime(crime_data['Date'].astype(str), format =date_format)

In [4]:
#checking number of features by category
categorical = ['District','Description','Location Description']

print('number of categorical datapoints by category')
for cat in categorical:    
    print(cat + " : "+ str(len(crime_data[cat].value_counts())) )



number of categorical datapoints by category
District : 24
Description : 350
Location Description : 179


In [None]:
#Encoding categorical features
# categorical = ['District','Description','Location Description']
# prefixes =  ['District_','Description_','Location Description_']
# dummies = pd.get_dummies(crime_data, columns= categorical, prefix = prefixes)



In [None]:
# Data processing 

unused = ['Beat','Community Area','Case Number','District','FBI Code','ID','Location','Updated On','Ward']
text_features = ['Description','Location Description']


features = [col_name for col_name in crime_data if (col_name != 'Primary Type') and (col_name not in unused)]


crime_data_non_categorical = crime_data[[col for col in crime_data if col not in categorical]]

crime_data = pd.concat([crime_data_non_categorical, dummies], axis=1) 

In [None]:
from sklearn.model_selection import train_test_split, learning_curve
from sklearn import metrics
# Model Functions 
def evaluate_model(model,X_test,y_test):
    y_predicted = model.predict(X_test)
    model_score = model.score(X_test,y_test) 
    recall_score = metrics.recall_score(y_test,y_predicted)
    accuracy_score = metrics.accuracy_score(y_test,y_predicted)
    precision_score = metrics.precision_score(y_test,y_predicted)   
    
    print('model score : ' + str(model_score) )
    print('accuracy score : ' + str(accuracy_score))
    print('precision score : ' + str(precision_score)  )
    print('recall score : ' + str(recall_score))
    print(metrics.confusion_matrix(y_test,y_predicted))

def test_model(model,features):
    X = crime_data[features]
    y = crime_data['Primary Type']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    model.fit(X_train,y_train)
    evaluate_model(model,X_test,y_test)   

In [None]:
# Model : Random Forest




X = crime_data[features]
y = crime_data['Primary Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


# test_model(random_forest,features)

In [None]:
from sklearn import ensemble
random_forest = ensemble.RandomForestClassifier()

random_forest.fit(X_train,y_train)



In [None]:

for col in X:    
    print(crime_data[[col]].sample(n=1))


In [12]:
crime_data['Description'].value_counts()[:98].sort_values()


CYCLE, SCOOTER, BIKE W-VIN                   5049
VEHICULAR HIJACKING                          5107
HOME INVASION                                5142
PAROLE VIOLATION                             5272
OBSTRUCTING IDENTIFICATION                   5559
AGG CRIMINAL SEXUAL ABUSE                    5666
TO STATE SUP PROP                            5780
RESIST/OBSTRUCT/DISARM OFFICER               5828
BOMB THREAT                                  6604
ENDANGERING LIFE/HEALTH CHILD                6763
OTHER CRIME AGAINST PERSON                   7052
LICENSE VIOLATION                            7064
PUBLIC INDECENCY                             7327
BY FIRE                                      7349
OTHER WEAPONS VIOLATION                      7372
COUNTERFEIT CHECK                            7678
VEHICLE TITLE/REG OFFENSE                    7796
FINANCIAL IDENTITY THEFT $300 AND UNDER      8008
ATTEMPT: ARMED-HANDGUN                       8366
UNLAWFUL USE OTHER DANG WEAPON               8506
