## Importing the essential libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Loading the dataset

In [2]:
# Load the crime dataset from a CSV file located next to the notebook.
data=pd.read_csv("USCrime.csv")

In [3]:
# Preview the first rows to confirm the file parsed into the expected columns.
data.head()

Unnamed: 0,year,State,population,violent_crime,homicide,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,Total
0,2016,All,323405935,1285606,17413,132414.0,332797,802982,7928530,1516405,5644835,767290,18428272.0
1,2016,AK,741522,5966,52,1053.0,850,4011,24876,4053,17766,3057,61684.0
2,2016,AL,4860545,25878,407,1915.0,4687,18869,143259,34045,97498,11716,338274.0
3,2016,AR,2988231,16563,217,2214.0,2125,12007,98092,23814,67091,7187,229310.0
4,2016,AZ,6908642,32542,389,3304.0,7045,21804,207317,38216,150618,18483,479718.0


In [4]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(156, 13)

## Checking for missing values

In [5]:
# Collect every column that contains at least one missing value.
# The check must be "any missing": a `> 1` threshold would skip a column that
# has exactly one missing entry.
missing_values = [feature for feature in data.columns if data[feature].isnull().any()]
for feature in missing_values:
    print(feature)

In [6]:
# Per-column count of missing values; every column reports 0 in the saved output.
data.isnull().sum()

year                   0
State                  0
population             0
violent_crime          0
homicide               0
rape_revised           0
robbery                0
aggravated_assault     0
property_crime         0
burglary               0
larceny                0
motor_vehicle_theft    0
Total                  0
dtype: int64

## Selecting the numerical features from the dataset

In [7]:
# Names of the numeric columns (integer and float dtypes).
# select_dtypes is used instead of comparing each dtype against "O": that
# comparison would label every non-object column as numeric, including text
# columns stored with pandas' dedicated string dtype.
numerical_features = data.select_dtypes(include="number").columns.tolist()
for feature in numerical_features:
    print(feature)

year
population
violent_crime
homicide
rape_revised
robbery
aggravated_assault
property_crime
burglary
larceny
motor_vehicle_theft
Total


In [8]:
# Show only the numeric columns (the rendered output truncates the middle rows).
data[numerical_features]

Unnamed: 0,year,population,violent_crime,homicide,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,Total
0,2016,323405935,1285606,17413,132414.0,332797,802982,7928530,1516405,5644835,767290,18428272.0
1,2016,741522,5966,52,1053.0,850,4011,24876,4053,17766,3057,61684.0
2,2016,4860545,25878,407,1915.0,4687,18869,143259,34045,97498,11716,338274.0
3,2016,2988231,16563,217,2214.0,2125,12007,98092,23814,67091,7187,229310.0
4,2016,6908642,32542,389,3304.0,7045,21804,207317,38216,150618,18483,479718.0
...,...,...,...,...,...,...,...,...,...,...,...,...
151,2018,7535591,23472,236,3413.0,5572,14251,222011,40201,154133,27677,490966.0
152,2018,5813568,17176,176,2248.0,3489,11263,90686,14099,67953,8634,215724.0
153,2018,1805832,5236,67,652.0,572,3945,26827,5354,18954,2519,64126.0
154,2018,577737,1226,13,243.0,100,870,10313,1525,7949,839,23078.0


In the dataframe above, `year` is stored as a number but takes only a few distinct values, so it will be treated as a categorical column.

## Selecting the categorical features

In [9]:
# Names of the non-numeric columns, i.e. the candidates for encoding.
# Selecting "everything that is not a number" also catches text columns that
# are not stored with the object dtype, which a `dtype == "O"` test would miss.
cat_features = data.select_dtypes(exclude="number").columns.tolist()
for feature in cat_features:
    print(feature)


State


## Encoding the categorical features

In [10]:
# Frequency of each State label; the saved output shows 3 rows per label,
# including an "All" label alongside the two-letter codes.
data['State'].value_counts()

State
All    3
AK     3
NC     3
ND     3
NE     3
NH     3
NJ     3
NM     3
NV     3
NY     3
OH     3
OK     3
OR     3
PA     3
RI     3
SC     3
SD     3
TN     3
TX     3
UT     3
VA     3
VT     3
WA     3
WI     3
WV     3
MT     3
MS     3
MO     3
HI     3
AL     3
AR     3
AZ     3
CA     3
CO     3
CT     3
DC     3
DE     3
FL     3
GA     3
IA     3
MN     3
ID     3
IL     3
IN     3
KS     3
KY     3
LA     3
MA     3
MD     3
ME     3
MI     3
WY     3
Name: count, dtype: int64

In [11]:
# Ordinal code for every State label: each label gets its position in the
# listing below, which follows the order of the value_counts() output for State.
State_mapping = {
    state: code
    for code, state in enumerate(
        (
            "All AK NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV "
            "MT MS MO HI AL AR AZ CA CO CT DC DE FL GA IA MN ID IL IN KS KY LA MA MD ME MI WY"
        ).split()
    )
}

# Show the resulting dictionary so the codes can be checked by eye.
print(State_mapping)

{'All': 0, 'AK': 1, 'NC': 2, 'ND': 3, 'NE': 4, 'NH': 5, 'NJ': 6, 'NM': 7, 'NV': 8, 'NY': 9, 'OH': 10, 'OK': 11, 'OR': 12, 'PA': 13, 'RI': 14, 'SC': 15, 'SD': 16, 'TN': 17, 'TX': 18, 'UT': 19, 'VA': 20, 'VT': 21, 'WA': 22, 'WI': 23, 'WV': 24, 'MT': 25, 'MS': 26, 'MO': 27, 'HI': 28, 'AL': 29, 'AR': 30, 'AZ': 31, 'CA': 32, 'CO': 33, 'CT': 34, 'DC': 35, 'DE': 36, 'FL': 37, 'GA': 38, 'IA': 39, 'MN': 40, 'ID': 41, 'IL': 42, 'IN': 43, 'KS': 44, 'KY': 45, 'LA': 46, 'MA': 47, 'MD': 48, 'ME': 49, 'MI': 50, 'WY': 51}


In [12]:
# Replace the State labels with the integer codes defined in State_mapping.
# The guard keeps the cell safe to re-run: once the column is numeric, mapping
# it again would look up integers in a dict keyed by strings and turn every
# value into NaN.
if not pd.api.types.is_numeric_dtype(data["State"]):
    state_codes = data["State"].map(State_mapping)
    # Series.map yields NaN for labels that are absent from the dict; stop here
    # rather than passing silent NaNs on to the model.
    unmapped_states = data.loc[state_codes.isna(), "State"].unique().tolist()
    if unmapped_states:
        raise ValueError(f"State labels missing from State_mapping: {unmapped_states}")
    data["State"] = state_codes.astype("int64")

In [13]:
# Rows per year; the saved output shows 52 rows for each of 2016, 2017 and 2018.
data['year'].value_counts()

year
2016    52
2017    52
2018    52
Name: count, dtype: int64

In [14]:
# Ordinal encoding of the calendar year (earliest year -> 0).
year_mapping = {2016: 0, 2017: 1, 2018: 2}

# Skip the encoding when the column already holds only the target codes, so
# re-running the cell does not map 0/1/2 through the dict and produce NaN.
if not data["year"].isin(list(year_mapping.values())).all():
    year_codes = data["year"].map(year_mapping)
    # Series.map yields NaN for years that are absent from the dict; fail loudly
    # instead of feeding NaNs to the model.
    unmapped_years = data.loc[year_codes.isna(), "year"].unique().tolist()
    if unmapped_years:
        raise ValueError(f"Years missing from year_mapping: {unmapped_years}")
    data["year"] = year_codes.astype("int64")

In [15]:
# Inspect the frame after encoding: year and State now hold integer codes.
data

Unnamed: 0,year,State,population,violent_crime,homicide,rape_revised,robbery,aggravated_assault,property_crime,burglary,larceny,motor_vehicle_theft,Total
0,0,0,323405935,1285606,17413,132414.0,332797,802982,7928530,1516405,5644835,767290,18428272.0
1,0,1,741522,5966,52,1053.0,850,4011,24876,4053,17766,3057,61684.0
2,0,29,4860545,25878,407,1915.0,4687,18869,143259,34045,97498,11716,338274.0
3,0,30,2988231,16563,217,2214.0,2125,12007,98092,23814,67091,7187,229310.0
4,0,31,6908642,32542,389,3304.0,7045,21804,207317,38216,150618,18483,479718.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,2,22,7535591,23472,236,3413.0,5572,14251,222011,40201,154133,27677,490966.0
152,2,23,5813568,17176,176,2248.0,3489,11263,90686,14099,67953,8634,215724.0
153,2,24,1805832,5236,67,652.0,572,3945,26827,5354,18954,2519,64126.0
154,2,51,577737,1226,13,243.0,100,870,10313,1525,7949,839,23078.0


## Creating the features and labels

In [16]:
# Features are every column except the target; the target is the "Total" column.
# Selecting by name rather than by position keeps X and y correct even if the
# column order of the CSV changes.
# NOTE(review): in the rows displayed above, Total equals
# 2 * (violent_crime + property_crime), and both of those columns are part of X,
# so the target looks fully determined by the features - confirm this is the
# intended modelling task.
# NOTE(review): the "All" rows appear to be aggregates and are modelled together
# with the individual states - confirm they should stay in the training data.
X = data.drop(columns="Total").values
y = data["Total"].values

## Splitting the dataset into a training set and a testing set, so that performance is measured on data the model has not seen

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Training the model on the training set

In [18]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso

In [19]:
# Gradient-boosted tree regressor with the library's default hyperparameters,
# fitted on the training split only.
regressor=XGBRegressor()
regressor.fit(X_train,y_train)

## Evaluating the performance of the model on the testing set

In [20]:
# Predict the target for the held-out rows.
y_pred = regressor.predict(X_test)

# Two decimals are enough for a visual comparison; note that this setting
# applies to all NumPy printing for the rest of the session.
np.set_printoptions(precision=2)

# Print actual (left) and predicted (right) values side by side as two columns.
print(np.concatenate((y_test[:, None], y_pred[:, None]), axis=1))

[[ 181864.    194090.5 ]
 [ 222830.    218705.19]
 [ 271730.    263379.75]
 [  53156.     53706.56]
 [ 262176.    264982.  ]
 [ 146088.    145268.62]
 [ 195934.    200523.62]
 [ 452064.    456056.62]
 [ 180810.    159784.56]
 [  42878.     41801.93]
 [ 500692.    462488.  ]
 [  25782.     23170.88]
 [2237200.   2075378.12]
 [ 342838.    340880.28]
 [2331320.   2213548.25]
 [1594714.   1546936.25]
 [ 193030.    172922.22]
 [ 595508.    534104.31]
 [1781912.   1658186.38]
 [ 385862.    370015.94]
 [  37642.     39606.85]
 [  81226.     49830.09]
 [ 574438.    532322.12]
 [  73424.     50117.25]
 [  75440.     51333.  ]
 [ 215724.    194516.56]
 [ 189966.    203334.5 ]
 [ 669538.    667467.12]
 [ 321380.    322427.  ]
 [ 483936.    491022.16]
 [ 389952.    398998.94]
 [ 361976.    353185.28]
 [  35280.     37781.11]
 [ 340652.    337480.84]
 [  47114.     43748.  ]
 [  37572.     36744.88]
 [  78248.     63304.07]
 [  23772.     23201.  ]
 [ 371560.    369172.69]
 [  64084.     61523.48]


In [22]:
# Tabulate the test targets next to the model's predictions, one row per test sample.
comparison_columns = {"Actual": y_test, "Predicted": y_pred}
actual_vs_predicted = pd.DataFrame(comparison_columns)
print(actual_vs_predicted)

       Actual     Predicted
0    181864.0  1.940905e+05
1    222830.0  2.187052e+05
2    271730.0  2.633798e+05
3     53156.0  5.370656e+04
4    262176.0  2.649820e+05
5    146088.0  1.452686e+05
6    195934.0  2.005236e+05
7    452064.0  4.560566e+05
8    180810.0  1.597846e+05
9     42878.0  4.180193e+04
10   500692.0  4.624880e+05
11    25782.0  2.317088e+04
12  2237200.0  2.075378e+06
13   342838.0  3.408803e+05
14  2331320.0  2.213548e+06
15  1594714.0  1.546936e+06
16   193030.0  1.729222e+05
17   595508.0  5.341043e+05
18  1781912.0  1.658186e+06
19   385862.0  3.700159e+05
20    37642.0  3.960685e+04
21    81226.0  4.983009e+04
22   574438.0  5.323221e+05
23    73424.0  5.011725e+04
24    75440.0  5.133300e+04
25   215724.0  1.945166e+05
26   189966.0  2.033345e+05
27   669538.0  6.674671e+05
28   321380.0  3.224270e+05
29   483936.0  4.910222e+05
30   389952.0  3.989989e+05
31   361976.0  3.531853e+05
32    35280.0  3.778111e+04
33   340652.0  3.374808e+05
34    47114.0  4.374

In [23]:
# Size of the prediction error per test row, ignoring its sign.
actual_vs_predicted["Absolute Difference"] = (
    actual_vs_predicted["Actual"].sub(actual_vs_predicted["Predicted"]).abs()
)

In [24]:
# Display the comparison table, now including the Absolute Difference column.
actual_vs_predicted

Unnamed: 0,Actual,Predicted,Absolute Difference
0,181864.0,194090.5,12226.5
1,222830.0,218705.2,4124.8125
2,271730.0,263379.8,8350.25
3,53156.0,53706.56,550.5625
4,262176.0,264982.0,2806.0
5,146088.0,145268.6,819.375
6,195934.0,200523.6,4589.625
7,452064.0,456056.6,3992.625
8,180810.0,159784.6,21025.4375
9,42878.0,41801.93,1076.074219


## Computing the R² (coefficient of determination) score on the test set

In [21]:
# Coefficient of determination of the predictions against the held-out targets.
# NOTE(review): a score this close to 1 is what one would expect if Total is a
# fixed combination of the crime-count feature columns - confirm before reading
# it as evidence of model quality.
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.9938216783302731