## Importing the essential libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
def _format_float(value):
    """Render a float with six decimal places; missing values are shown as "NaN"."""
    if pd.isna(value):
        return "NaN"
    return "{:.6f}".format(value)


# Applied by pandas whenever it renders a float in a Series/DataFrame.
pd.set_option('display.float_format', _format_float)

## Importing the dataset


In [2]:
# Read the crime dataset; the path is resolved relative to the working directory.
data = pd.read_csv(filepath_or_buffer="MD_Crime_Data.csv")

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,JURISDICTION,YEAR,POPULATION,MURDER,RAPE,ROBBERY,AGG. ASSAULT,B & E,LARCENY THEFT,M/V THEFT,...,"B & E PER 100,000 PEOPLE","LARCENY THEFT PER 100,000 PEOPLE","M/V THEFT PER 100,000 PEOPLE","MURDER RATE PERCENT CHANGE PER 100,000 PEOPLE","RAPE RATE PERCENT CHANGE PER 100,000 PEOPLE","ROBBERY RATE PERCENT CHANGE PER 100,000 PEOPLE","AGG. ASSAULT RATE PERCENT CHANGE PER 100,000 PEOPLE","B & E RATE PERCENT CHANGE PER 100,000 PEOPLE","LARCENY THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE","M/V THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE"
0,Allegany County,1975,79655,3,5,20,114,669,1425,93,...,839.9,1789.0,116.8,,,,,,,
1,Allegany County,1976,83923,2,2,24,59,581,1384,73,...,692.3,1649.1,87.0,-36.7,-62.0,13.9,-50.9,-17.6,-7.8,-25.5
2,Allegany County,1977,82102,3,7,32,85,592,1390,102,...,721.1,1693.0,124.2,53.3,257.8,36.3,47.3,4.2,2.7,42.8
3,Allegany County,1978,79966,1,2,18,81,539,1390,100,...,674.0,1738.2,125.1,-65.8,-70.7,-42.2,-2.2,-6.5,2.7,0.7
4,Allegany County,1979,79721,1,7,18,84,502,1611,99,...,629.7,2020.8,124.2,0.3,251.1,0.3,4.0,-6.6,16.3,-0.7


In [4]:
# (number of rows, number of columns) of the raw frame.
data.shape

(1104, 38)

## Checking for duplicate observations

In [5]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep="first").sum()

0

## Handling missing values


In [6]:
# Number of missing entries in each column.
data.isnull().sum(axis=0)

JURISDICTION                                              0
YEAR                                                      0
POPULATION                                                0
MURDER                                                    0
RAPE                                                      0
ROBBERY                                                   0
AGG. ASSAULT                                              0
B & E                                                     0
LARCENY THEFT                                             0
M/V THEFT                                                 0
GRAND TOTAL                                               0
PERCENT CHANGE                                           24
VIOLENT CRIME TOTAL                                       0
VIOLENT CRIME PERCENT                                     0
VIOLENT CRIME PERCENT CHANGE                             24
PROPERTY CRIME TOTALS                                     0
PROPERTY CRIME PERCENT                  

In [7]:
# List every column that contains at least one missing value.
# The threshold is "> 0" rather than "> 1": a column with exactly one NaN must
# be reported too, because the row holding it is removed by the later dropna().
missing_counts = data.isnull().sum()
missing_values = missing_counts[missing_counts > 0].index.tolist()
for feature in missing_values:
    print(feature)

PERCENT CHANGE
VIOLENT CRIME PERCENT CHANGE
PROPERTY CRIME PERCENT CHANGE
OVERALL PERCENT CHANGE PER 100,000 PEOPLE
VIOLENT CRIME RATE PERCENT CHANGE PER 100,000 PEOPLE
PROPERTY CRIME RATE PERCENT CHANGE PER 100,000 PEOPLE
MURDER  RATE PERCENT CHANGE PER 100,000 PEOPLE
RAPE RATE PERCENT CHANGE PER 100,000 PEOPLE
ROBBERY RATE PERCENT CHANGE PER 100,000 PEOPLE
AGG. ASSAULT  RATE PERCENT CHANGE PER 100,000 PEOPLE
B & E RATE PERCENT CHANGE PER 100,000 PEOPLE
LARCENY THEFT  RATE PERCENT CHANGE PER 100,000 PEOPLE
M/V THEFT  RATE PERCENT CHANGE PER 100,000 PEOPLE


In [8]:
# Inspect only the columns that were found to contain missing values.
data[missing_values]

Unnamed: 0,PERCENT CHANGE,VIOLENT CRIME PERCENT CHANGE,PROPERTY CRIME PERCENT CHANGE,"OVERALL PERCENT CHANGE PER 100,000 PEOPLE","VIOLENT CRIME RATE PERCENT CHANGE PER 100,000 PEOPLE","PROPERTY CRIME RATE PERCENT CHANGE PER 100,000 PEOPLE","MURDER RATE PERCENT CHANGE PER 100,000 PEOPLE","RAPE RATE PERCENT CHANGE PER 100,000 PEOPLE","ROBBERY RATE PERCENT CHANGE PER 100,000 PEOPLE","AGG. ASSAULT RATE PERCENT CHANGE PER 100,000 PEOPLE","B & E RATE PERCENT CHANGE PER 100,000 PEOPLE","LARCENY THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE","M/V THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE"
0,,,,,,,,,,,,,
1,-8.800000,-38.700000,-6.800000,-13.400000,-41.800000,-11.600000,-36.700000,-62.000000,13.900000,-50.900000,-17.600000,-7.800000,-25.500000
2,4.000000,46.000000,2.300000,6.400000,49.200000,4.500000,53.300000,257.800000,36.300000,47.300000,4.200000,2.700000,42.800000
3,-3.600000,-19.700000,-2.600000,-1.000000,-17.500000,0.000000,-65.800000,-70.700000,-42.200000,-2.200000,-6.500000,2.700000,0.700000
4,9.000000,7.800000,9.000000,9.300000,8.200000,9.400000,0.300000,251.100000,0.300000,4.000000,-6.600000,16.300000,-0.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,-1.000000,4.800000,-1.500000,-0.400000,5.500000,-0.900000,201.800000,14.000000,45.300000,-8.300000,6.500000,-2.500000,15.000000
1100,-1.900000,16.400000,-3.400000,-2.200000,16.100000,-3.700000,-33.500000,52.500000,9.900000,13.600000,-24.100000,-0.300000,21.500000
1101,-20.500000,-29.900000,-19.500000,-20.300000,-29.800000,-19.400000,-100.000000,-53.800000,-44.100000,-16.800000,-2.100000,-22.000000,-12.600000
1102,-7.700000,4.800000,-8.800000,-8.200000,4.200000,-9.300000,0.000000,-0.600000,-0.600000,6.200000,-14.000000,-8.300000,-12.300000


In [9]:
# Discard every row that has a missing value in any column.
# The original row labels are kept (the index is not reset).
data = data.dropna(axis=0, how="any")

## Selecting the numerical features


In [10]:
# A column counts as numeric when its dtype is anything other than object.
numeric_features = []
for column in data.columns:
    if data[column].dtype != 'O':
        numeric_features.append(column)

for column in numeric_features:
    print(column)

YEAR
POPULATION
MURDER
RAPE
ROBBERY
AGG. ASSAULT
B & E
LARCENY THEFT
M/V THEFT
GRAND TOTAL
PERCENT CHANGE
VIOLENT CRIME TOTAL
VIOLENT CRIME PERCENT
VIOLENT CRIME PERCENT CHANGE
PROPERTY CRIME TOTALS
PROPERTY CRIME PERCENT
PROPERTY CRIME PERCENT CHANGE
OVERALL CRIME RATE PER 100,000 PEOPLE
OVERALL PERCENT CHANGE PER 100,000 PEOPLE
VIOLENT CRIME RATE PER 100,000 PEOPLE
VIOLENT CRIME RATE PERCENT CHANGE PER 100,000 PEOPLE
PROPERTY CRIME RATE PER 100,000 PEOPLE
PROPERTY CRIME RATE PERCENT CHANGE PER 100,000 PEOPLE
MURDER PER 100,000 PEOPLE
RAPE PER 100,000 PEOPLE
ROBBERY PER 100,000 PEOPLE
AGG. ASSAULT PER 100,000 PEOPLE
B & E PER 100,000 PEOPLE
LARCENY THEFT PER 100,000 PEOPLE
M/V THEFT PER 100,000 PEOPLE
MURDER  RATE PERCENT CHANGE PER 100,000 PEOPLE
RAPE RATE PERCENT CHANGE PER 100,000 PEOPLE
ROBBERY RATE PERCENT CHANGE PER 100,000 PEOPLE
AGG. ASSAULT  RATE PERCENT CHANGE PER 100,000 PEOPLE
B & E RATE PERCENT CHANGE PER 100,000 PEOPLE
LARCENY THEFT  RATE PERCENT CHANGE PER 100,000 PEOPL

In [11]:
# Display the numeric (non-object dtype) columns only.
data[numeric_features]

Unnamed: 0,YEAR,POPULATION,MURDER,RAPE,ROBBERY,AGG. ASSAULT,B & E,LARCENY THEFT,M/V THEFT,GRAND TOTAL,...,"B & E PER 100,000 PEOPLE","LARCENY THEFT PER 100,000 PEOPLE","M/V THEFT PER 100,000 PEOPLE","MURDER RATE PERCENT CHANGE PER 100,000 PEOPLE","RAPE RATE PERCENT CHANGE PER 100,000 PEOPLE","ROBBERY RATE PERCENT CHANGE PER 100,000 PEOPLE","AGG. ASSAULT RATE PERCENT CHANGE PER 100,000 PEOPLE","B & E RATE PERCENT CHANGE PER 100,000 PEOPLE","LARCENY THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE","M/V THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE"
1,1976,83923,2,2,24,59,581,1384,73,2125,...,692.300000,1649.100000,87.000000,-36.700000,-62.000000,13.900000,-50.900000,-17.600000,-7.800000,-25.500000
2,1977,82102,3,7,32,85,592,1390,102,2211,...,721.100000,1693.000000,124.200000,53.300000,257.800000,36.300000,47.300000,4.200000,2.700000,42.800000
3,1978,79966,1,2,18,81,539,1390,100,2131,...,674.000000,1738.200000,125.100000,-65.800000,-70.700000,-42.200000,-2.200000,-6.500000,2.700000,0.700000
4,1979,79721,1,7,18,84,502,1611,99,2322,...,629.700000,2020.800000,124.200000,0.300000,251.100000,0.300000,4.000000,-6.600000,16.300000,-0.700000
5,1980,80461,2,12,26,79,541,1706,108,2474,...,672.400000,2120.300000,134.200000,98.200000,69.900000,43.100000,-6.800000,6.800000,4.900000,8.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,2016,51255,3,17,39,93,289,1514,32,1987,...,563.800000,2953.900000,62.400000,201.800000,14.000000,45.300000,-8.300000,6.500000,-2.500000,15.000000
1100,2017,51408,2,26,43,106,220,1514,39,1950,...,427.900000,2945.100000,75.900000,-33.500000,52.500000,9.900000,13.600000,-24.100000,-0.300000,21.500000
1101,2018,51304,0,12,24,88,215,1178,34,1551,...,419.100000,2296.100000,66.300000,-100.000000,-53.800000,-44.100000,-16.800000,-2.100000,-22.000000,-12.600000
1102,2019,51606,0,12,24,94,186,1086,30,1432,...,360.400000,2104.400000,58.100000,0.000000,-0.600000,-0.600000,6.200000,-14.000000,-8.300000,-12.300000


## Selecting the categorical features


In [12]:
# A column counts as categorical when its dtype is object.
cat_features = []
for column in data.columns:
    if data[column].dtype == 'O':
        cat_features.append(column)

for column in cat_features:
    print(column)

JURISDICTION


In [13]:
# Display the categorical (object dtype) columns only.
data[cat_features]

Unnamed: 0,JURISDICTION
1,Allegany County
2,Allegany County
3,Allegany County
4,Allegany County
5,Allegany County
...,...
1099,Worcester County
1100,Worcester County
1101,Worcester County
1102,Worcester County


## Encoding the categorical features as numerical features

In [14]:
# Replace each category with an integer code. Codes follow the order in which
# the categories first appear in the column: 0, 1, 2, ...
for column in cat_features:
    categories = data[column].unique()
    code_by_category = dict(zip(categories, range(len(categories))))
    data[column] = data[column].map(code_by_category)

In [16]:
# Display the frame after the categorical columns have been integer-encoded.
data

Unnamed: 0,JURISDICTION,YEAR,POPULATION,MURDER,RAPE,ROBBERY,AGG. ASSAULT,B & E,LARCENY THEFT,M/V THEFT,...,"B & E PER 100,000 PEOPLE","LARCENY THEFT PER 100,000 PEOPLE","M/V THEFT PER 100,000 PEOPLE","MURDER RATE PERCENT CHANGE PER 100,000 PEOPLE","RAPE RATE PERCENT CHANGE PER 100,000 PEOPLE","ROBBERY RATE PERCENT CHANGE PER 100,000 PEOPLE","AGG. ASSAULT RATE PERCENT CHANGE PER 100,000 PEOPLE","B & E RATE PERCENT CHANGE PER 100,000 PEOPLE","LARCENY THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE","M/V THEFT RATE PERCENT CHANGE PER 100,000 PEOPLE"
1,0,1976,83923,2,2,24,59,581,1384,73,...,692.300000,1649.100000,87.000000,-36.700000,-62.000000,13.900000,-50.900000,-17.600000,-7.800000,-25.500000
2,0,1977,82102,3,7,32,85,592,1390,102,...,721.100000,1693.000000,124.200000,53.300000,257.800000,36.300000,47.300000,4.200000,2.700000,42.800000
3,0,1978,79966,1,2,18,81,539,1390,100,...,674.000000,1738.200000,125.100000,-65.800000,-70.700000,-42.200000,-2.200000,-6.500000,2.700000,0.700000
4,0,1979,79721,1,7,18,84,502,1611,99,...,629.700000,2020.800000,124.200000,0.300000,251.100000,0.300000,4.000000,-6.600000,16.300000,-0.700000
5,0,1980,80461,2,12,26,79,541,1706,108,...,672.400000,2120.300000,134.200000,98.200000,69.900000,43.100000,-6.800000,6.800000,4.900000,8.100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,23,2016,51255,3,17,39,93,289,1514,32,...,563.800000,2953.900000,62.400000,201.800000,14.000000,45.300000,-8.300000,6.500000,-2.500000,15.000000
1100,23,2017,51408,2,26,43,106,220,1514,39,...,427.900000,2945.100000,75.900000,-33.500000,52.500000,9.900000,13.600000,-24.100000,-0.300000,21.500000
1101,23,2018,51304,0,12,24,88,215,1178,34,...,419.100000,2296.100000,66.300000,-100.000000,-53.800000,-44.100000,-16.800000,-2.100000,-22.000000,-12.600000
1102,23,2019,51606,0,12,24,94,186,1086,30,...,360.400000,2104.400000,58.100000,0.000000,-0.600000,-0.600000,6.200000,-14.000000,-8.300000,-12.300000


In [18]:
# Remove the 'ROBBERY' column and re-attach it as the last column under the
# name 'Robbery', so that the target can be selected by position.
data['Robbery'] = data.pop('ROBBERY')

In [19]:
# Every column except the last one is a feature; the last column is the target.
# NOTE(review): the features still include columns such as
# 'ROBBERY PER 100,000 PEOPLE' together with 'POPULATION', and aggregates such
# as 'VIOLENT CRIME TOTAL' and 'GRAND TOTAL'. These look like they are derived
# from the robbery count and may leak the target - confirm this is intended.
feature_frame = data.iloc[:, :-1]
target_column = data.iloc[:, -1]
X = feature_frame.values
y = target_column.values

## Splitting the dataset into a training set and a test set

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Training the model on the training set


In [21]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic, so the seed is fixed to make the fitted model
# and every metric computed from it reproducible across re-runs.
# The value 0 matches the seed already used for train_test_split.
regressor = RandomForestRegressor(random_state=0)
regressor.fit(X_train, y_train)

In [22]:
# Predict on the held-out rows and print actual vs. predicted side by side
# (column 0: actual, column 1: predicted).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
side_by_side = np.column_stack((y_test, y_pred))
print(side_by_side)

[[1.00e+01 1.35e+01]
 [1.80e+01 1.72e+01]
 [9.03e+02 9.16e+02]
 [2.00e+00 2.19e+00]
 [6.06e+02 6.80e+02]
 [7.10e+01 6.81e+01]
 [3.90e+01 3.57e+01]
 [6.00e+00 6.13e+00]
 [1.23e+02 1.35e+02]
 [1.07e+02 1.13e+02]
 [2.50e+01 2.54e+01]
 [1.30e+02 1.36e+02]
 [2.00e+00 2.10e+00]
 [1.32e+02 1.61e+02]
 [4.00e+00 4.82e+00]
 [2.00e+00 2.22e+00]
 [8.60e+01 8.26e+01]
 [8.70e+01 1.14e+02]
 [1.17e+02 1.16e+02]
 [3.00e+00 3.35e+00]
 [1.90e+01 2.04e+01]
 [9.10e+01 1.09e+02]
 [1.40e+01 1.61e+01]
 [1.10e+01 1.04e+01]
 [3.50e+01 3.75e+01]
 [3.10e+01 3.33e+01]
 [4.90e+01 3.87e+01]
 [2.20e+01 2.31e+01]
 [2.00e+00 2.79e+00]
 [9.00e+00 9.03e+00]
 [3.30e+01 3.41e+01]
 [2.60e+01 3.41e+01]
 [1.30e+01 1.11e+01]
 [2.02e+02 1.98e+02]
 [4.20e+01 3.75e+01]
 [3.00e+00 3.70e+00]
 [4.06e+03 4.13e+03]
 [1.50e+01 1.67e+01]
 [1.50e+01 1.47e+01]
 [7.60e+01 7.81e+01]
 [8.00e+00 1.18e+01]
 [1.90e+01 2.19e+01]
 [1.24e+03 1.46e+03]
 [8.00e+00 9.94e+00]
 [1.70e+01 1.77e+01]
 [2.30e+01 2.47e+01]
 [1.10e+02 9.77e+01]
 [3.50e+01 2.

In [23]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.9927742698397308

In [24]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test set, and its square root (same units as the target).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [25]:
# Mean squared error on the test set.
mse

12961.677343981484

In [26]:
# Root mean squared error on the test set.
rmse

113.84936251021121

In [27]:
# Tabulate the test-set targets next to the model's predictions.
comparison_columns = {'Actual': y_test, 'Predicted': y_pred}
actual_vs_predicted = pd.DataFrame(comparison_columns)
print(actual_vs_predicted)

     Actual   Predicted
0        10   13.510000
1        18   17.230000
2       903  915.990000
3         2    2.190000
4       606  680.210000
..      ...         ...
211      36   33.790000
212    7485 7256.380000
213      29   28.570000
214      72   76.580000
215      12   12.260000

[216 rows x 2 columns]


In [28]:
# Per-row size of the prediction error, ignoring its sign.
residuals = actual_vs_predicted['Actual'] - actual_vs_predicted['Predicted']
actual_vs_predicted['Absolute Difference'] = abs(residuals)

In [29]:
# Display actual values, predictions and their absolute difference.
actual_vs_predicted

Unnamed: 0,Actual,Predicted,Absolute Difference
0,10,13.510000,3.510000
1,18,17.230000,0.770000
2,903,915.990000,12.990000
3,2,2.190000,0.190000
4,606,680.210000,74.210000
...,...,...,...
211,36,33.790000,2.210000
212,7485,7256.380000,228.620000
213,29,28.570000,0.430000
214,72,76.580000,4.580000
