In [None]:
#Importing all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import preprocessing
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import KMeans, DBSCAN
%matplotlib inline
sns.set_style("darkgrid")

import warnings
warnings.simplefilter('ignore')

In [None]:
# Path of the dataset analysed in this notebook. It names the 2012-2017
# extract, which is the file the following cell loads; the earlier value
# pointed at the 2008-2011 extract, so the printed path did not match the data.
files = '/content/drive/MyDrive/cognorise/Data analytics/7.prjct/Chicago_Crimes_2012_to_2017.csv'

# Echo the path so the reader can confirm which file is being used.
print(files)

In [None]:
# Read the 2012-2017 crimes CSV into the DataFrame used by the rest of the notebook.
crimes_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/cognorise/Data analytics/7.prjct/Chicago_Crimes_2012_to_2017.csv'
)

In [None]:
#Checking the data contents
crimes_data.head()


In [None]:
# Normalise the column names: trim surrounding whitespace, remove commas,
# replace spaces with underscores and lower-case everything.
crimes_data.columns = (
    crimes_data.columns
    .str.strip()
    .str.replace(',', '')
    .str.replace(' ', '_')
    .str.lower()
)

In [None]:
#Checking the data for any null values and its datatypes
crimes_data.info()

In [None]:
# Display every row that has an exact duplicate elsewhere in the frame
# (keep=False marks all occurrences, not just the repeats).
duplicate_rows_mask = crimes_data.duplicated(keep=False)
crimes_data.loc[duplicate_rows_mask]

In [None]:
# 'id' and 'case_number' are key-like attributes that are not used in the analysis,
# and 'location' is, per the original note, just latitude and longitude combined.
identifier_columns = ['id', 'case_number', 'location']
crimes_data.drop(columns=identifier_columns, inplace=True)

In [None]:
msno.heatmap(crimes_data,figsize=(15, 5))


In [None]:
msno.dendrogram(crimes_data,figsize=(20,5))

In [None]:
crimes_data.isnull().sum()

In [None]:
crimes_data.isnull().sum()

In [None]:
# Discard every row that contains a missing value and renumber the index from 0.
crimes_data = crimes_data.dropna().reset_index(drop=True)

In [None]:
crimes_data.info()

In [None]:
# Parse the 'date' column into datetimes, then derive the weekday name,
# the month name and the hour of day of each crime for the later analysis.
crimes_data['date'] = pd.to_datetime(crimes_data['date'])
crime_timestamps = crimes_data['date'].dt
crimes_data['day_of_week'] = crime_timestamps.day_name()
crimes_data['month'] = crime_timestamps.month_name()
crimes_data['time'] = crime_timestamps.hour

In [None]:
# Group related primary crime types under a single label.
# Each key is a tuple of raw 'primary_type' values; the value is the group label.
primary_type_map = {
    ('BURGLARY','MOTOR VEHICLE THEFT','THEFT','ROBBERY') : 'THEFT',
    ('BATTERY','ASSAULT','NON-CRIMINAL','NON-CRIMINAL (SUBJECT SPECIFIED)') : 'NON-CRIMINAL_ASSAULT',
    ('CRIM SEXUAL ASSAULT','SEX OFFENSE','STALKING','PROSTITUTION') : 'SEXUAL_OFFENSE',
    ('WEAPONS VIOLATION','CONCEALED CARRY LICENSE VIOLATION') :  'WEAPONS_OFFENSE',
    ('HOMICIDE','CRIMINAL DAMAGE','DECEPTIVE PRACTICE','CRIMINAL TRESPASS') : 'CRIMINAL_OFFENSE',
    ('KIDNAPPING','HUMAN TRAFFICKING','OFFENSE INVOLVING CHILDREN') : 'HUMAN_TRAFFICKING_OFFENSE',
    ('NARCOTICS','OTHER NARCOTIC VIOLATION') : 'NARCOTIC_OFFENSE',
    ('OTHER OFFENSE','ARSON','GAMBLING','PUBLIC PEACE VIOLATION','INTIMIDATION','INTERFERENCE WITH PUBLIC OFFICER','LIQUOR LAW VIOLATION','OBSCENITY','PUBLIC INDECENCY') : 'OTHER_OFFENSE'
}

# Flatten the tuple-keyed table into one entry per raw crime type.
primary_type_mapping = {
    crime_type: group_label
    for crime_types, group_label in primary_type_map.items()
    for crime_type in crime_types
}

# Raw types that are absent from the table map to NaN.
crimes_data['primary_type_grouped'] = crimes_data['primary_type'].map(primary_type_mapping)

In [None]:
# Zone in which the crime occurred: the second space-separated token of
# 'block' is looked up in this table of direction letters.
zone_mapping = {
    'N' : 'North',
    'S' : 'South',
    'E' : 'East',
    'W' : 'West'
}
block_direction = crimes_data['block'].str.split(" ", n = 2, expand = True)[1]
crimes_data['zone'] = block_direction.map(zone_mapping)

In [None]:
# Derive the season from the month name of each crime.
season_map = {
    ('March','April','May') : 'Spring',
    ('June','July','August') : 'Summer',
    ('September','October','November') : 'Fall',
    ('December','January','February') : 'Winter'
}

# One entry per month name, pointing at its season.
season_mapping = {
    month_name: season_label
    for month_names, season_label in season_map.items()
    for month_name in month_names
}
crimes_data['season'] = crimes_data['month'].map(season_mapping)

In [None]:
#Mapping similar locations of crime under one group.
loc_map = {
    ('RESIDENCE', 'APARTMENT', 'CHA APARTMENT', 'RESIDENCE PORCH/HALLWAY', 'RESIDENCE-GARAGE',
    'RESIDENTIAL YARD (FRONT/BACK)', 'DRIVEWAY - RESIDENTIAL', 'HOUSE') : 'RESIDENCE',

    ('BARBERSHOP', 'COMMERCIAL / BUSINESS OFFICE', 'CURRENCY EXCHANGE', 'DEPARTMENT STORE', 'RESTAURANT',
    'ATHLETIC CLUB', 'TAVERN/LIQUOR STORE', 'SMALL RETAIL STORE', 'HOTEL/MOTEL', 'GAS STATION',
    'AUTO / BOAT / RV DEALERSHIP', 'CONVENIENCE STORE', 'BANK', 'BAR OR TAVERN', 'DRUG STORE',
    'GROCERY FOOD STORE', 'CAR WASH', 'SPORTS ARENA/STADIUM', 'DAY CARE CENTER', 'MOVIE HOUSE/THEATER',
    'APPLIANCE STORE', 'CLEANING STORE', 'PAWN SHOP', 'FACTORY/MANUFACTURING BUILDING', 'ANIMAL HOSPITAL',
    'BOWLING ALLEY', 'SAVINGS AND LOAN', 'CREDIT UNION', 'KENNEL', 'GARAGE/AUTO REPAIR', 'LIQUOR STORE',
    'GAS STATION DRIVE/PROP.', 'OFFICE', 'BARBER SHOP/BEAUTY SALON') : 'BUSINESS',

    ('VEHICLE NON-COMMERCIAL', 'AUTO', 'VEHICLE - OTHER RIDE SHARE SERVICE (E.G., UBER, LYFT)', 'TAXICAB', 'VEHICLE-COMMERCIAL', 'VEHICLE - DELIVERY TRUCK', 'VEHICLE-COMMERCIAL - TROLLEY BUS',
    'VEHICLE-COMMERCIAL - ENTERTAINMENT/PARTY BUS') : 'VEHICLE',

    ('AIRPORT TERMINAL UPPER LEVEL - NON-SECURE AREA', 'CTA PLATFORM', 'CTA STATION', 'CTA BUS STOP',
    'AIRPORT TERMINAL UPPER LEVEL - SECURE AREA', 'CTA TRAIN', 'CTA BUS', 'CTA GARAGE / OTHER PROPERTY',
    'OTHER RAILROAD PROP / TRAIN DEPOT', 'AIRPORT TERMINAL LOWER LEVEL - SECURE AREA',
    'AIRPORT BUILDING NON-TERMINAL - SECURE AREA', 'AIRPORT EXTERIOR - NON-SECURE AREA', 'AIRCRAFT',
    'AIRPORT PARKING LOT', 'AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA', 'OTHER COMMERCIAL TRANSPORTATION',
    'AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA', 'AIRPORT VENDING ESTABLISHMENT',
    'AIRPORT TERMINAL MEZZANINE - NON-SECURE AREA', 'AIRPORT EXTERIOR - SECURE AREA', 'AIRPORT TRANSPORTATION SYSTEM (ATS)',
    'CTA TRACKS - RIGHT OF WAY', 'AIRPORT/AIRCRAFT', 'BOAT/WATERCRAFT', 'CTA PROPERTY', 'CTA "L" PLATFORM',
    'RAILROAD PROPERTY') : 'PUBLIC_TRANSPORTATION',
      ('HOSPITAL BUILDING/GROUNDS', 'NURSING HOME/RETIREMENT HOME', 'SCHOOL, PUBLIC, BUILDING',
    'CHURCH/SYNAGOGUE/PLACE OF WORSHIP', 'SCHOOL, PUBLIC, GROUNDS', 'SCHOOL, PRIVATE, BUILDING',
    'MEDICAL/DENTAL OFFICE', 'LIBRARY', 'COLLEGE/UNIVERSITY RESIDENCE HALL', 'YMCA', 'HOSPITAL') : 'PUBLIC_BUILDING',

    ('STREET', 'PARKING LOT/GARAGE(NON.RESID.)', 'SIDEWALK', 'PARK PROPERTY', 'ALLEY', 'CEMETARY',
    'CHA HALLWAY/STAIRWELL/ELEVATOR', 'CHA PARKING LOT/GROUNDS', 'COLLEGE/UNIVERSITY GROUNDS', 'BRIDGE',
    'SCHOOL, PRIVATE, GROUNDS', 'FOREST PRESERVE', 'LAKEFRONT/WATERFRONT/RIVERBANK', 'PARKING LOT', 'DRIVEWAY',
    'HALLWAY', 'YARD', 'CHA GROUNDS', 'RIVER BANK', 'STAIRWELL', 'CHA PARKING LOT') : 'PUBLIC_AREA',

    ('POLICE FACILITY/VEH PARKING LOT', 'GOVERNMENT BUILDING/PROPERTY', 'FEDERAL BUILDING', 'JAIL / LOCK-UP FACILITY',
    'FIRE STATION', 'GOVERNMENT BUILDING') : 'GOVERNMENT',
      ('OTHER', 'ABANDONED BUILDING', 'WAREHOUSE', 'ATM (AUTOMATIC TELLER MACHINE)', 'VACANT LOT/LAND',
    'CONSTRUCTION SITE', 'POOL ROOM', 'NEWSSTAND', 'HIGHWAY/EXPRESSWAY', 'COIN OPERATED MACHINE', 'HORSE STABLE',
    'FARM', 'GARAGE', 'WOODED AREA', 'GANGWAY', 'TRAILER', 'BASEMENT', 'CHA PLAY LOT') : 'OTHER'
}

# Flatten the tuple-keyed location table into one entry per raw location
# description, then label every row with its location group.
loc_mapping = {
    raw_location: location_group
    for raw_locations, location_group in loc_map.items()
    for raw_location in raw_locations
}
crimes_data['loc_grouped'] = crimes_data['location_description'].map(loc_mapping)

In [None]:
# Cast the 'arrest' and 'domestic' flags to integers so the plots treat them as numbers.
for flag_column in ('arrest', 'domestic'):
    crimes_data[flag_column] = crimes_data[flag_column].astype(int)

In [None]:
# Split out the records of 2017 and of 2016 so that each year can be analysed on its own
crimes_data_2017 = crimes_data[crimes_data.year == 2017]
crimes_data_2016 = crimes_data[crimes_data.year == 2016]

In [None]:
plt.figure(figsize=(15,5))
zone_plot = sns.countplot(data=crimes_data,x='day_of_week',hue='year',order=crimes_data.day_of_week.value_counts().index,palette='Set2')

From the above plot we can see that the day of the week has very little influence on crime: the crime counts are almost the same on every day. Comparing the years 2018 and 2019, however, the number of crimes is lower in 2019 than in 2018. (Note: the file loaded above covers 2012–2017, so the years named in this commentary should be re-checked against the plotted data.)



In [None]:
plt.figure(figsize=(20,5))
zone_plot = sns.countplot(data=crimes_data,x='month',hue='year',order=crimes_data.month.value_counts().index,palette='Set2')

From the above plot we can see that May, June, July and August are the months with the highest crime counts. Note also that for the year 2019 the December count is 0, since that data is not available yet, and the November data is also very limited. During January, February and November there are comparatively fewer crimes. It looks like the crime rate is lower in winter, when people hardly come out of their houses.

In [None]:
# Number of 2017 crimes recorded for each hour of the day.
time_counts = crimes_data_2017['time'].value_counts()

# Point plot with the hour on the x-axis and the crime count on the y-axis.
hours_of_day = time_counts.index
crimes_per_hour = time_counts.values
zone_plot = sns.pointplot(x=hours_of_day, y=crimes_per_hour)

The above plot gives the crimes by hour of the day for the year 2017. Around midnight there are a lot of crimes, and the count then decreases gradually. There is also a spike at 12 in the morning; we need to look at the data more closely to understand why it happens. Crimes are also more frequent in the evening.

We can see a similar trend in the year 2019 as well.

In [None]:
# Share of each grouped crime type among the 2017 records, drawn as a pie chart.
grouped_type_counts_2017 = crimes_data_2017.primary_type_grouped.value_counts()
crimes_data_primary_type_pie = plt.pie(
    grouped_type_counts_2017,
    labels=grouped_type_counts_2017.index,
    autopct='%1.1f%%',
    radius=2.5,
)
plt.legend(loc = 'best')

By looking at the pie chart, we can see that a lot of Thefts and Non-Criminal Assaults happened in the year 2017, while there were very few sexual and weapons offences. Theft and Non-Criminal Assault constituted about 50% of the offences.

There is a same trend in the year 2019 as well, but the Thefts were a bit reduced but the count of the Non-Criminal Offence went up

In [None]:
crimes_data_primary_type_pie = plt.pie(crimes_data_2017.loc_grouped.value_counts(),labels=crimes_data_2017.loc_grouped.value_counts().index,autopct='%1.1f%%',shadow=True,radius=2.5)
plt.legend(loc = 'best')

From the above plot we can see that a lot of crimes happen in public places and residences. Almost 60% of the incidents happened in these places. Public transportation and government places had very few crimes, so we can deploy more forces in residential areas to reduce the crime rate.

The below plot of the year 2019 shows a same trend as in 2018

In [None]:
# Bar chart of the 20 most frequent primary crime types in 2017.
plt.figure(figsize=(20,3))
top_20_crimes = crimes_data_2017['primary_type'].value_counts().iloc[:20]
primary_type_plot_2017 = sns.barplot(
    x=top_20_crimes.index,
    y=top_20_crimes.values,
    palette='Set2',
)
plt.xticks(rotation=45)

# The same chart for 2016, drawn on a second figure.
plt.figure(figsize=(20,3))
top_20_crimes_2016 = crimes_data_2016['primary_type'].value_counts().iloc[:20]
primary_type_plot_2016 = sns.barplot(
    x=top_20_crimes_2016.index,
    y=top_20_crimes_2016.values,
    palette='Set2',
)
plt.xticks(rotation=45)

We see from the above plots that there were a lot of THEFT and BATTERY related crimes that happened over the last two years.

From the below plot we can see that the East side of the city experienced fewer crimes, while the South side had the greatest number of crimes. The reason has to be found out: is it because of the type of citizens, or are there more beats in the East zone than in the South zone?

In [None]:
zone_plot = sns.countplot(data=crimes_data,x='zone',hue='year',order=crimes_data.zone.value_counts().index,palette='Set2')

We can see from the below plot that a lot of crimes happen in Summer and Spring, when the weather conditions are favourable. There are comparatively very few crimes in Winter.

In [None]:
zone_plot = sns.countplot(data=crimes_data,x='season',hue='year',palette='Set2')

As we see from the below plot,there were a lot of criminals who were not arrested for the crime. The same trend has continued for 2019 as well. Almost 75% of the criminals went un-arrested for their crime in 2018.

In [None]:
arrest_plot = sns.countplot(data=crimes_data,x='year',hue='arrest',palette='Set2')


In [None]:
# Bar chart of the 20 most frequent location descriptions in 2017.
plt.figure(figsize=(20,3))
top_locations = crimes_data_2017['location_description'].value_counts().iloc[:20]
location_description_plot_2017 = sns.barplot(
    x=top_locations.index,   # location descriptions
    y=top_locations.values,  # number of crimes at each location
    palette='Set2',
)
plt.xticks(rotation=45)
plt.show()

# The same chart for 2016; 'top_locations' is rebound to the 2016 counts.
plt.figure(figsize=(20,3))
top_locations = crimes_data_2016['location_description'].value_counts().iloc[:20]
location_description_plot_2016 = sns.barplot(
    x=top_locations.index,
    y=top_locations.values,
    palette='Set2',
)
plt.xticks(rotation=45)
plt.show()


By looking at the two plots of crime locations for the years 2017 and 2016, we can see that the trend continues: a lot of street and residential crimes are happening in the city, and the counts slowly fade out across the remaining locations.

**Complete Data**

In [None]:
crimes_data_primary_type_pie = plt.pie(crimes_data.primary_type_grouped.value_counts(),labels=crimes_data.primary_type_grouped.value_counts().index,autopct='%1.1f%%',shadow=True,radius=2.5)
plt.legend(loc = 'best')

In [None]:
crimes_data_primary_type_pie = plt.pie(crimes_data.loc_grouped.value_counts(),labels=crimes_data.loc_grouped.value_counts().index,autopct='%1.1f%%',shadow=True,radius=2.5)
plt.legend(loc = 'best')

In [None]:
crimes_data_primary_type_pie = plt.pie(crimes_data.loc_grouped.value_counts(),labels=crimes_data.loc_grouped.value_counts().index,autopct='%1.1f%%',shadow=True,radius=2.5)
plt.legend(loc = 'best')

In [None]:
plt.figure(figsize=(15,5))
zone_plot = sns.countplot(data=crimes_data,x='day_of_week',order=crimes_data.day_of_week.value_counts().index,palette='Set2')

In [None]:
plt.figure(figsize=(20,5))
zone_plot = sns.countplot(data=crimes_data,x='month',order=crimes_data.month.value_counts().index,palette='Set2')

In [None]:
plt.figure(figsize=(20,5))
# Group the data and calculate counts for each time
time_counts = crimes_data['time'].value_counts().sort_index()

# Create the point plot
zone_plot = sns.pointplot(x=time_counts.index, y=time_counts.values)
plt.xticks(rotation=45)  # Rotate x-axis labels for readability
plt.show()

In [None]:
zone_plot = sns.countplot(data=crimes_data,x='zone',order=crimes_data.zone.value_counts().index,palette='Set2')

In [None]:
zone_plot = sns.countplot(data=crimes_data,x='season',order=crimes_data.season.value_counts().index,palette='Set2')

In [None]:
arrest_plot = sns.countplot(data=crimes_data,x='arrest',palette='Set2')

In [None]:
plt.figure(figsize=(15,3))
arrest_plot = sns.countplot(data=crimes_data,x='arrest',hue='primary_type_grouped',palette='Set2')
plt.legend(loc = 'best')

In [None]:
# Create a DataFrame for the top 20 locations
top_locations = crimes_data.location_description.value_counts()[0:20].reset_index()
top_locations.columns = ['location_description', 'count']

# Plot using the new DataFrame
plt.figure(figsize=(20,3))
location_description_plot = sns.barplot(data=top_locations, x='location_description', y='count', palette='Set2')
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.figure(figsize=(20,3))
# Use the 'top_locations' DataFrame you created earlier
location_description_plot = sns.barplot(data=top_locations, x='location_description', y='count', palette='Set2')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Scatter of crime coordinates, coloured by district. Rows whose x_coordinate
# is 0 are left out of the plot.
new_crimes_data = crimes_data.loc[crimes_data['x_coordinate'] != 0]

# The marker shape is passed through lmplot's own `markers` argument: lmplot
# overwrites any "marker" entry placed inside scatter_kws, so the diamond shape
# requested there was never drawn. `ci` is omitted because it only affects the
# regression band, which fit_reg=False disables.
sns.lmplot(x='x_coordinate',
           y='y_coordinate',
           data=new_crimes_data,
           fit_reg=False,
           hue="district",
           palette='Dark2',
           height=12,
           markers="D",
           scatter_kws={"s": 10})

ax = plt.gca()
ax.set_title("Crimes by District")
plt.show()

In [None]:
# Scatter of crime coordinates, coloured by grouped crime type. Rows whose
# x_coordinate is 0 are left out of the plot.
new_crimes_data = crimes_data.loc[crimes_data['x_coordinate'] != 0]

# The marker shape goes through lmplot's `markers` argument, because a "marker"
# entry inside scatter_kws is overwritten by lmplot. `ci` is omitted: it only
# affects the regression band, which fit_reg=False disables. The frame is
# passed directly instead of through a redundant full slice.
sns.lmplot(x='x_coordinate',
           y='y_coordinate',
           data=new_crimes_data,
           fit_reg=False,
           hue="primary_type_grouped",
           palette='Dark2',
           height=12,
           markers="o",
           scatter_kws={"s": 10})

ax = plt.gca()
ax.set_title("Crimes by Type of crime")
plt.show()

In [None]:
# Convert the numerical attributes below to categorical columns, in this order.
categorical_columns = [
    'year', 'time', 'domestic', 'arrest',
    'beat', 'district', 'ward', 'community_area',
]
for column_name in categorical_columns:
    crimes_data[column_name] = pd.Categorical(crimes_data[column_name])

In [None]:
# Columns left out of the modelling frame; every other column is kept.
non_feature_columns = [
    'date', 'block', 'iucr', 'primary_type', 'description',
    'location_description', 'fbi_code', 'updated_on',
    'x_coordinate', 'y_coordinate',
]
crimes_data_prediction = crimes_data.drop(columns=non_feature_columns)

In [None]:
crimes_data_prediction.head()


In [None]:
crimes_data_prediction.info()


In [None]:
# one-hot encoding
crimes_data_prediction = pd.get_dummies(crimes_data_prediction,drop_first=True)

In [None]:
crimes_data_prediction.head()

In [None]:
crimes_data_prediction.info()

In [None]:
# Draw 20 random rows of the encoded data as a small sample set.
# The fixed seed makes the sample identical on every Restart & Run All;
# without it the exported rows change on each run.
testing_dataset = crimes_data_prediction.sample(n=20, random_state=42)

In [None]:
testing_dataset.info()

In [None]:
testing_dataset.to_csv('testing_crime_data.csv')

In [None]:
# Hold out 30% of the rows as the test set; 'arrest_1' is the target column
# and every other column is used as a feature.
arrest_target = crimes_data_prediction['arrest_1']
arrest_features = crimes_data_prediction.drop(columns=['arrest_1'])
X_train, X_test, y_train, y_test = train_test_split(
    arrest_features,
    arrest_target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
type(X_test)

In [None]:
# Export 20 random rows of the scaled test features to a CSV file.
# X_test is wrapped in a DataFrame first; the fixed seed keeps the exported
# rows the same on every run (the sample was previously unseeded).
test_df = pd.DataFrame(X_test)
test_dataset = test_df.sample(n=20, random_state=42)
test_dataset.to_csv('test_crime_data.csv')

In [None]:
# No install is needed at this point: scikit-learn is already imported by the
# first cell of the notebook. If it is missing from the environment, install it
# before running the notebook (for example `%pip install scikit-learn` in a
# cell placed ahead of the imports) rather than in the middle of the analysis.

In [None]:
#Gaussain Naive Bayes
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
sns.heatmap(conf_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.tight_layout()

In [None]:
print('Accuracy = ',metrics.accuracy_score(y_test, y_pred))
print('Error = ',1 - metrics.accuracy_score(y_test, y_pred))
print('Precision = ',metrics.precision_score(y_test, y_pred,))
print('Recall = ',metrics.recall_score(y_test, y_pred))
print('F-1 Score = ',metrics.f1_score(y_test, y_pred))
print('Classification Report\n',metrics.classification_report(y_test, y_pred))

In [None]:
#Decision tree with Entropy as attribute measure
model = tree.DecisionTreeClassifier(criterion = "entropy", random_state = 42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Compute confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Plot confusion matrix
sns.heatmap(conf_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.tight_layout()

In [None]:
#Classification Metrics
print('Accuracy = ',metrics.accuracy_score(y_test, y_pred))
print('Error = ',1 - metrics.accuracy_score(y_test, y_pred))
print('Precision = ',metrics.precision_score(y_test, y_pred,))
print('Recall = ',metrics.recall_score(y_test, y_pred))
print('F-1 Score = ',metrics.f1_score(y_test, y_pred))
print('Classification Report\n',metrics.classification_report(y_test, y_pred))

In [None]:
#Random Forest classifier  - Best one
model = RandomForestClassifier(n_estimators = 10,criterion='entropy',random_state=42)

In [None]:
model.fit(X_train,y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
# Compute confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Plot confusion matrix
sns.heatmap(conf_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.tight_layout()

In [None]:
#Classification Metrics
print('Accuracy = ',metrics.accuracy_score(y_test, y_pred))
print('Error = ',1 - metrics.accuracy_score(y_test, y_pred))
print('Precision = ',metrics.precision_score(y_test, y_pred,))
print('Recall = ',metrics.recall_score(y_test, y_pred))
print('F-1 Score = ',metrics.f1_score(y_test, y_pred))
print('Classification Report\n',metrics.classification_report(y_test, y_pred))

In [None]:
#Logistic Regression
classifier = LogisticRegression(random_state=42)

In [None]:
classifier.fit(X_train,y_train)

In [None]:
y_test_pred = classifier.predict(X_test)

In [None]:
# Accuracy of the logistic regression on the held-out test set.
# Uses the `metrics` module imported at the top of the notebook instead of a
# mid-notebook import, and passes the arguments in scikit-learn's documented
# (y_true, y_pred) order. Accuracy is symmetric, so the value is unchanged.
accuracy_test = metrics.accuracy_score(y_test, y_test_pred)
print(accuracy_test)

In [None]:
# Export a second set of 20 random rows of the scaled test features.
# The fixed seed keeps the exported rows the same on every run; it differs
# from 42 so that this file does not simply repeat a sample drawn with that seed.
test2_df = pd.DataFrame(X_test)
test2_dataset = test2_df.sample(n=20, random_state=43)
test2_dataset.to_csv('test2_crime_data.csv')

In [None]:
input_data = (-0.08612261954017632,2,34,1000,-0.0727424268210971,-0.058185317300957384,-0.05306572378045673,-0.051575306687303944,-0.07305660485847232,-0.0671514489565111,100,-0.059537876771199524,-0.05089351982321086,-0.05654164336300329,-0.006426255403158872,-0.052970202698082725,-0.06342511326400732,-0.05543742225627254,-0.049817999184121574,-0.045641911023108085,-0.05043392874257892,-0.06170530053266052,-0.05425504373496812,-0.05170276333828677,-0.051319453844544415,-0.054886315827578726,-0.05382385845036992,-0.04511818514836309,-0.05090346532186001,-0.03965713910874886,-0.049970154120237954,-0.0708646162494429,-0.05900739814163445,-0.05624507276526327,-0.07511826176324378,-0.06519376756644704,-0.06997625939589482,-0.07064175364182954,-0.07104384506711912,-0.06865591769330563,-0.05618196514207206,-0.06080242005164967,-0.06463046314267809,-0.07013596248769292,-0.06493618276751377,-0.08159214384856699,-0.09040565993225347,-0.06610675633831176,-0.08789774041722336,-0.06901043162131001,-0.0684482859643018,-0.06393535351527828,-0.056218035206214566,-0.04703891583264338,-0.08496440100379914,-0.0738742383401294,-0.07370865898784505,-0.07307750305062292,-0.07145795938073327,-0.0674760692166688,-0.061358951470341226,-0.05815917017443488,-0.06821023312827618,-0.06932650662488712,-0.07849513136297503,-0.06741579096770389,-0.0610605408183104,-0.07979659374817424,-0.06003839577005453,-0.07283332562845732,-0.08424589616698659,-0.07701362442722773,-0.07579329185036911,-0.05553787243758409,-0.0606854948361675,-0.06128448299624821,-0.0744714442972226,-0.07412192886355813,-0.05439491724360849,-0.052354787299564394,-0.05459942451066697,-0.06704542558894501,-0.056916890286373416,-0.06865591769330565,-0.06008062370928079,-0.059256108121846555,-0.05540999553593467,-0.06589124004080463,-0.05722774063777346,-0.06591436419306344,-0.05489554572168076,-0.049604205983174464,-0.07001985020332668,-0.05088357240074641,-0.0669468276677848,-0.056666998225644916,-0.06505339107275455,-0.081115438
7642993,-0.06640579438946437,-0.07359807028251955,-0.0728892082001164,-0.07332088452512962,-0.07210299423062701,-0.0704903898490148,-0.06879646044682444,-0.04788096698255292,-0.0613672202506389,-0.058071929206956543,-0.058843888197473125,-0.061292761672169516,-0.05519009711613155,-0.06272495724997795,-0.05555611683333811,-0.05929033221306898,-0.052701832949401874,-0.05623606167173666,-0.05906752612983653,-0.05524515277851561,-0.060534835442688124,-0.05126023144319792,-0.08131648835420759,-0.05745758352993886,-0.0607606864960429,-0.06464617565454081,-0.06848540836301471,-0.06497527529964452,-0.05079395884030205,-0.06157358428694691,-0.06316847258326756,-0.047081910041015834,-0.05626309067727406,-0.05873175431942322,-0.06781415411651101,-0.08149201052768901,-0.07528757384467635,-0.05589257719192628,-0.06785910715687771,-0.07879334720934149,-0.0781240626514451,-0.06404643546662751,-0.05533679197164555,-0.038243571956494166,-0.06095242773834925,-0.07636248215725032,-0.0783846410917054,-0.0701649611225422,-0.06731772578004695,-0.05641601352418738,-0.05378620262228585,-0.05329428369607539,-0.050744105851633844,-0.050524174328424674,-0.05126023144319792,-0.05623606167173666,-0.058480845391330846,-0.055537872437584096,-0.05001065136382351,-0.05135889790480536,-0.054960112100508725,-0.05763377020563013,-0.050744105851633844,-0.045774675908428264,-0.023649665141222367,-0.02330608819756128,-0.02347850376728348,-0.02059634845315113,-0.029323664481685763,-0.025037349913157026,-0.026352234153452288,-0.022288887736205194,-0.019541576550079244,-0.050111752718391585,-0.05318012514264909,-0.048635823487217136,-0.04630196049147254,-0.052797834139533845,-0.061027295352120155,-0.053635317992251595,-0.06846313729414821,-0.04714632848889155,-0.05656852820543318,-0.05563814349114046,-0.060038395770054515,-0.06155710023948264,-0.05581091985943604,-0.05777433780397285,-0.08314147875238806,-0.06523270833251953,-0.06485792820324777,-0.07148643234874247,-0.07728445147095717,-0.0892262082943549,
-0.041451208150186776,-0.039885782142671244,-0.04120667245040482,-0.04332322123283767,-0.03339463321861653,-0.060029946668838166,-0.05990306919929987,-0.05484014333544071,-0.04723208429506407,-0.04854210240769016,-0.06231906573571637,-0.05476618735124339,-0.04109615991008349,-0.02039963787951149,-0.03211580104496498,-0.034407098615429094,-0.019360204109754087,-0.04832271966853309,-0.06259535386227223,-0.04899860633090746,-0.05894720967314284,-0.06521713476898423,-0.05133917962296752,-0.05826368922573437,-0.05806319799781901,-0.05328477971912969,-0.049215004638923684,-0.050634258278670345,-0.03872915629377522,-0.05166357904181066,-0.06321667080644738,-0.051319453844544456,-0.043521126664431954,-0.06270877124583518,-0.0776463076283183,-0.057730446708430486,-0.07279138614028073,-0.08552781374734048,-0.05160474740425001,-0.04727490438184493,-0.05652371312117207,-0.062098798391654345,-0.04739246190933114,-0.04906053087963723,-0.05236445817240682,-0.056952500796523024,-0.07179181806491661,-0.0603165616834836,-0.04782810810919711,-0.04726420296466108,-0.05425504373496814,-0.054217683905990924,-0.05507062332355402,-0.05583815207357197,-0.04005010657168069,-0.04118213956354834,-0.03738884464893748,-0.04394880470804723,-0.04472422542215391,-0.04153645785047053,-0.03647272014589551,-0.03756402618200413,-0.00650415552775467,-0.008091480303737797,-0.007167262800044312,-0.007443041843058392,-0.009782270492464998,-0.009361290904884828,-0.007237192615872242,-0.010948549724190973,-0.007167262800044316,-0.05194700267581461,-0.06435488352578898,-0.05118116276264432,-0.05714796708548411,-0.06472468195076822,-0.0656363459794491,-0.06375244841506413,-0.057032544199060446,-0.06508461157230366,-0.010716042970003942,-0.007708963625401319,-0.00673245075525295,-0.006504155527754674,-0.007025315672895107,-0.00857500699223094,-0.009626560005977208,-0.008336748945290037,-0.008749444992674457,-0.05627209749712719,-0.05676529961472743,-0.06651281917807077,-0.06529496607144318,-0.047038915832643385
,18.453709320483476,-0.042818584872792984,-0.053625873769107976,-0.05365420150862921,-0.04977734676562818,-0.06878167974186947,-0.04433819693910801,-0.05986071780543384,-0.06609907121769883,-0.06727997102788526,-0.05837670987621742,-0.05507982270439976,-0.05336076467502687,-0.04854210240769015,-0.060207132375688914,-0.0686781274154852,-0.08020464262351436,-0.07279837769746693,-0.06402264815154096,-0.20823561448448769,-0.2302397024494306,-0.2499949167660567,-0.21624315780533007,-0.25205784982469503,-0.2477403162861355,-0.26706362607916834,-0.22675030376435376,-0.2188357533812614,-0.2751003926660484,-0.22335885879824352,-0.0010035922642463817,-0.19304280194731924,-0.2155839824201144,-0.18840296298356615,-0.17134204629855904,-0.21302352843060687,-0.21784081228236407,-0.1282529133097483,-0.1837414476597514,5.922300861559004,-0.2463863995956608,-0.004374596915293934,-0.2010279673517801,-0.16421548890801718,-0.13115136713539394,-0.15721149086607336,-0.1855874453258493,-0.16756167554583531,-0.1680882521681892,-0.16221219802573383,-0.13484944433744797,-0.1163718789767292,-0.11259281675597634,-0.11478345990136347,-0.11923700101377,-0.16454186488839795,-0.16749177765409012,-0.18772838423409224,-0.130226576783506,-0.09339272032279695,-0.17839853590954266,-0.17850069471114482,-0.10948807373180561,-0.110084759836271,-0.213614467106868,-0.11928478098239717,-0.12462674539258176,-0.19147112573418992,-0.2278432024103328,-0.15698261439694278,-0.11845693408424188,-0.11874511969898482,-0.12911431608274618,-0.09780248086713268,-0.16700494874176955,-0.11157474636622977,-0.10156411014572639,-0.16517036424248302,-0.10399103197766399,-0.09762388814202753,-0.09998845336363148,-0.10731000315755534,-0.21864616505738954,-0.11367496512584743,-0.12644789895481653,-0.10297190703249584,-0.11650517153123542,-0.09882076481167935,-0.0954664455499649,8.688563760237734,-0.10101098898269098,8.36838291392527,-0.11118546386055791,-0.11925437763850066,-0.08352749996099652,-0.07424547100583231,-0.14955479027
56992,-0.12299561671333308,-0.18835136688632917,-0.03120735966402134,-0.06754381911060058,-0.06582181968409022,-0.041560783010527576,-0.05792332206561842,-0.09509510030485391,-0.11280820350830101,-0.10643268158008763,-0.07796730773689395,-0.04712486537050993,-0.136902372359352,-0.07973265227158796,-0.09480891197208943,-0.14006402630617126,-0.18278148763257074,-0.16972207359895342,-0.2641256429109825,-0.1484706755617029,-0.14166292513533682,-0.1751904449843872,-0.18065811375459534,-0.13247217113726734,-0.09946823811448975,-0.16676903460078235,-0.07947638494441472,-0.06551241014151814,-0.09643932314378373,-0.04846908412630842,-0.05879216030962861,-0.11249645351748692,-0.07317495070870715,-0.10348018524979347,-0.07592088442704147,-0.12329484180517442,-0.18658977353193015,-0.15249550552833088,-0.07295900249448291,-0.13845187758800148,-0.03824357195649418,-0.07452612478470452,-0.16414755564019817,-0.06350510417507589,-0.08595099260609496,-0.07114401993312519,-0.12741055894010325,-0.06939981005448186,-0.047285603398740814,-0.08493433057553969,-0.061317591093945106,-0.10106185504683486,-0.063304940423112,-0.07961102555802549,-0.14047464443071914,-0.06376837369441234,-0.096704945998185,-0.06094410352146387,-0.08821718085510731,-0.1564428879086454,-0.1705869233629538,-0.16354800464597444,-0.16073131341560518,-0.09617832732302489,-0.17213145517129003,-0.05919616901756817,-0.11040298286073379,-0.046985118236143895,-0.08909422514505988,-0.0794828014439228,-0.09440792886473964,-0.5221616714586382,2.0659615926667056,-0.47357975066925556,-0.4633275303700751,-0.003886923530201251,-0.4057442139129375,2.4329975392082237,-0.4007882288446298,-0.4055270204137107,-0.40543271911199835,-0.4092074829809085,-0.3199893712232493,-0.2843562888131152,-0.2694516551488966,-0.2907227005243578,-0.3221879427641897,3.170079062775093,-0.29786916154832377,-0.31496488454163596,-0.2897885288587975,-0.3057416112662466,-0.30698465724837665,-0.1768084578175145,-0.1640504644390994,-0.14811953006616874,-0.1278
873101774467,-0.11896300315008969,-0.1314173104977912,-0.15627725957418248,-0.18868079903207297,-0.21840076466086605,-0.21527426283261245,-0.21969354339350014,-0.24534757659169962,-0.22454920677813597,4.328183875123009,-0.241164734234802,-0.23527219332793364,-0.2345059375665664,-0.2446893583294156,-0.24865499834549556,-0.2428506315968776,-0.23647132533239898,-0.23581525387754995,-0.21016321359229923,11.046735235156053,-0.3196832649108735,-0.5705336271723215,-0.29219319897081053,-0.11716506459750017,-0.7604852696937126,-0.11002851188147625,-0.5254670986663884,-0.8283772935205359,1.482326457649362,-0.5820586003007778,1.6138574646776442,-0.5317534321116928,-0.07340414779614234,-0.2276132112168982,-0.8262011287394958,-0.18849752417622576,-0.1649321089340692,1.3733403693565496,-0.14078002255288116
)
# Convert the hand-entered feature tuple to a NumPy array and reshape it
# into a single row, since one instance is being predicted.
input_array = np.asarray(input_data)
input_reshaped = input_array.reshape(1, -1)

# Classify that single row and show the raw prediction.
prediction = classifier.predict(input_reshaped)
print(prediction)

# A truthy prediction is reported as an arrest, a falsy one as no arrest.
if prediction[0]:
  print('Persion is Arrested')
else:
  print('Person is not Arrested')

In [None]:
# Compute the confusion matrix for the logistic regression model.
# The logistic regression predictions are stored in `y_test_pred`; `y_pred`
# still holds the random forest predictions from an earlier cell, so using it
# here would report the wrong model.
conf_matrix = metrics.confusion_matrix(y_test, y_test_pred)
print(conf_matrix)

In [None]:
# Plot confusion matrix
sns.heatmap(conf_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')
plt.tight_layout()

In [None]:
#Classification Metrics for the logistic regression model.
# `y_test_pred` holds the logistic regression predictions; `y_pred` still
# holds the random forest predictions from an earlier cell, so it must not
# be used here.
logreg_accuracy = metrics.accuracy_score(y_test, y_test_pred)
print('Accuracy = ',logreg_accuracy)
print('Error = ',1 - logreg_accuracy)
print('Precision = ',metrics.precision_score(y_test, y_test_pred))
print('Recall = ',metrics.recall_score(y_test, y_test_pred))
print('F-1 Score = ',metrics.f1_score(y_test, y_test_pred))
print('Classification Report\n',metrics.classification_report(y_test, y_test_pred))

In [None]:
# Second task: predict the grouped crime type, restricted to three groups.
selected_crime_groups = ['THEFT','NON-CRIMINAL_ASSAULT','CRIMINAL_OFFENSE']
in_selected_groups = crimes_data.primary_type_grouped.isin(selected_crime_groups)
crimes_data_type = crimes_data.loc[in_selected_groups]

# Target: the grouped crime type of each remaining row.
crimes_data_prediction_type = crimes_data_type.primary_type_grouped

# Features: everything except the dropped columns (the target itself is one
# of them), one-hot encoded. This rebinds 'crimes_data_prediction'.
unused_columns = [
    'date', 'block', 'iucr', 'primary_type', 'description',
    'location_description', 'fbi_code', 'updated_on',
    'x_coordinate', 'y_coordinate', 'primary_type_grouped',
]
crimes_data_prediction = pd.get_dummies(
    crimes_data_type.drop(columns=unused_columns),
    drop_first=True,
)

In [None]:
crimes_data_prediction.head()


In [None]:
X_train, X_test, y_train, y_test = train_test_split(crimes_data_prediction,crimes_data_prediction_type, test_size=0.3, random_state=42)

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#Decision tree classifier for type of crime
model = tree.DecisionTreeClassifier(criterion = "entropy", random_state = 42)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)


In [None]:
# Compute confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
#Classification Metrics
print('Accuracy = ',metrics.accuracy_score(y_test, y_pred))
print('Error = ',1 - metrics.accuracy_score(y_test, y_pred))
print('Classification Report\n',metrics.classification_report(y_test, y_pred))

In [None]:
#Random Forest classifier for type of crime
model = RandomForestClassifier(n_estimators = 10,criterion='entropy',random_state=42)

In [None]:
model.fit(X_train,y_train)

In [None]:
# Predict on the test split with the model fitted in the previous cell
y_pred = model.predict(X_test)
# Compute confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
#Classification Metrics
print('Accuracy = ',metrics.accuracy_score(y_test, y_pred))
print('Error = ',1 - metrics.accuracy_score(y_test, y_pred))
print('Classification Report\n',metrics.classification_report(y_test, y_pred))