In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import joblib

In [3]:
# Load the raw crime records; the path is resolved relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="crime_dataset_india.csv")

In [10]:
# Preview the first ten raw records.
df.head(n=10)

Unnamed: 0,Report Number,Date Reported,Date of Occurrence,Time of Occurrence,City,Crime Code,Crime Description,Victim Age,Victim Gender,Weapon Used,Crime Domain,Police Deployed,Case Closed,Date Case Closed
0,1,02-01-2020 00:00,01-01-2020 00:00,01-01-2020 01:11,Ahmedabad,576,IDENTITY THEFT,16,M,Blunt Object,Violent Crime,13,No,
1,2,01-01-2020 19:00,01-01-2020 01:00,01-01-2020 06:26,Chennai,128,HOMICIDE,37,M,Poison,Other Crime,9,No,
2,3,02-01-2020 05:00,01-01-2020 02:00,01-01-2020 14:30,Ludhiana,271,KIDNAPPING,48,F,Blunt Object,Other Crime,15,No,
3,4,01-01-2020 05:00,01-01-2020 03:00,01-01-2020 14:46,Pune,170,BURGLARY,49,F,Firearm,Other Crime,1,Yes,29-04-2020 05:00
4,5,01-01-2020 21:00,01-01-2020 04:00,01-01-2020 16:51,Pune,421,VANDALISM,30,F,Other,Other Crime,18,Yes,08-01-2020 21:00
5,6,02-01-2020 03:00,01-01-2020 05:00,01-01-2020 17:09,Delhi,442,ASSAULT,16,M,Firearm,Violent Crime,18,Yes,30-03-2020 03:00
6,7,01-01-2020 16:00,01-01-2020 06:00,01-01-2020 14:08,Chennai,172,VEHICLE - STOLEN,64,F,Knife,Violent Crime,13,Yes,24-03-2020 16:00
7,8,02-01-2020 10:00,01-01-2020 07:00,02-01-2020 06:33,Chennai,169,COUNTERFEITING,78,X,Knife,Other Crime,8,No,
8,9,04-01-2020 03:00,01-01-2020 08:00,02-01-2020 06:34,Mumbai,338,EXTORTION,41,X,Blunt Object,Other Crime,1,No,
9,10,03-01-2020 07:00,01-01-2020 09:00,01-01-2020 17:50,Chennai,497,PUBLIC INTOXICATION,29,M,Knife,Other Crime,4,No,


In [5]:
# (row count, column count) of the loaded frame.
df.shape

(40160, 14)

In [6]:
# Per-column dtype and non-null count; useful for spotting columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40160 entries, 0 to 40159
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Report Number       40160 non-null  int64 
 1   Date Reported       40160 non-null  object
 2   Date of Occurrence  40160 non-null  object
 3   Time of Occurrence  40160 non-null  object
 4   City                40160 non-null  object
 5   Crime Code          40160 non-null  int64 
 6   Crime Description   40160 non-null  object
 7   Victim Age          40160 non-null  int64 
 8   Victim Gender       40160 non-null  object
 9   Weapon Used         34370 non-null  object
 10  Crime Domain        40160 non-null  object
 11  Police Deployed     40160 non-null  int64 
 12  Case Closed         40160 non-null  object
 13  Date Case Closed    20062 non-null  object
dtypes: int64(4), object(10)
memory usage: 4.3+ MB


In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Report Number,Crime Code,Victim Age,Police Deployed
count,40160.0,40160.0,40160.0,40160.0
mean,20080.5,349.360259,44.49126,10.00625
std,11593.337742,144.169205,20.22555,5.467951
min,1.0,100.0,10.0,1.0
25%,10040.75,225.0,27.0,5.0
50%,20080.5,349.0,44.0,10.0
75%,30120.25,474.0,62.0,15.0
max,40160.0,599.0,79.0,19.0


In [13]:
# List the column labels exactly as they appear in the frame.
df.columns

Index(['Report Number', 'Date Reported', 'Date of Occurrence',
       'Time of Occurrence', 'City', 'Crime Code', 'Crime Description',
       'Victim Age', 'Victim Gender', 'Weapon Used', 'Crime Domain',
       'Police Deployed', 'Case Closed', 'Date Case Closed'],
      dtype='object')

In [15]:
# Convert the occurrence timestamp from text to datetime once, then derive the
# calendar features used for the monthly aggregation.
# NOTE(review): no explicit `format`/`dayfirst` is given, so pandas infers the
# layout. Raw values look like "01-01-2020 00:00", which is ambiguous between
# day-first and month-first -- confirm the column's layout and pin it with `format=`.
occurrence_ts = pd.to_datetime(df['Date of Occurrence'])
df['Date of Occurrence'] = occurrence_ts
df['Year'] = occurrence_ts.dt.year
df['Month'] = occurrence_ts.dt.month

In [16]:
# Preview the first five rows now that the date-derived columns exist.
df.head(n=5)

Unnamed: 0,Report Number,Date Reported,Date of Occurrence,Time of Occurrence,City,Crime Code,Crime Description,Victim Age,Victim Gender,Weapon Used,Crime Domain,Police Deployed,Case Closed,Date Case Closed,Date of Occurence,Year,Month
0,1,02-01-2020 00:00,2020-01-01 00:00:00,01-01-2020 01:11,Ahmedabad,576,IDENTITY THEFT,16,M,Blunt Object,Violent Crime,13,No,,2020-01-01 00:00:00,2020,1
1,2,01-01-2020 19:00,2020-01-01 01:00:00,01-01-2020 06:26,Chennai,128,HOMICIDE,37,M,Poison,Other Crime,9,No,,2020-01-01 01:00:00,2020,1
2,3,02-01-2020 05:00,2020-01-01 02:00:00,01-01-2020 14:30,Ludhiana,271,KIDNAPPING,48,F,Blunt Object,Other Crime,15,No,,2020-01-01 02:00:00,2020,1
3,4,01-01-2020 05:00,2020-01-01 03:00:00,01-01-2020 14:46,Pune,170,BURGLARY,49,F,Firearm,Other Crime,1,Yes,29-04-2020 05:00,2020-01-01 03:00:00,2020,1
4,5,01-01-2020 21:00,2020-01-01 04:00:00,01-01-2020 16:51,Pune,421,VANDALISM,30,F,Other,Other Crime,18,Yes,08-01-2020 21:00,2020-01-01 04:00:00,2020,1


In [17]:
# Spot-check that Year and Month agree with the parsed occurrence timestamp.
df.loc[:, ['Date of Occurrence', 'Year', 'Month']].head()

Unnamed: 0,Date of Occurrence,Year,Month
0,2020-01-01 00:00:00,2020,1
1,2020-01-01 01:00:00,2020,1
2,2020-01-01 02:00:00,2020,1
3,2020-01-01 03:00:00,2020,1
4,2020-01-01 04:00:00,2020,1


In [18]:
# Count records per (City, Year, Month). Note that 'Crime_Rate' holds a raw
# row count for the group, not a per-capita rate.
monthly_counts = df.groupby(['City', 'Year', 'Month']).size()
crime_rate_df = monthly_counts.reset_index(name='Crime_Rate')

crime_rate_df.head()

Unnamed: 0,City,Year,Month,Crime_Rate
0,Agra,2020,1,11
1,Agra,2020,2,7
2,Agra,2020,3,19
3,Agra,2020,4,9
4,Agra,2020,5,20


In [19]:
# Predictors are the grouping keys; the target is the per-group record count.
feature_columns = ['City', 'Year', 'Month']
x = crime_rate_df[feature_columns]
y = crime_rate_df['Crime_Rate']

In [21]:
# One-hot encode the city labels. drop_first=True omits the first city's
# indicator column, so that city is represented by all City_* columns being False.
x = pd.get_dummies(data=x, drop_first=True)
x.head(n=5)

Unnamed: 0,Year,Month,City_Ahmedabad,City_Bangalore,City_Bhopal,City_Chennai,City_Delhi,City_Faridabad,City_Ghaziabad,City_Hyderabad,...,City_Nashik,City_Patna,City_Pune,City_Rajkot,City_Srinagar,City_Surat,City_Thane,City_Varanasi,City_Vasai,City_Visakhapatnam
0,2020,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2020,2,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2020,3,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,2020,4,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,2020,5,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# Random 80/20 hold-out split; the fixed random_state keeps the partition reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [23]:
# Fit a linear regression on the training partition only.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [24]:
# Predict the held-out rows so they can be scored against y_test.
y_pred = model.predict(X=x_test)

In [28]:
# Score the hold-out predictions. MSE is computed once and reused for RMSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", np.sqrt(mse))
print("R2 Score: ", r2)

MAE:  3.466365691026128
MSE:  22.539233564547565
RMSE:  4.747550269828384
R2 Score:  0.9614936772082815


In [31]:
# Forecast the monthly record count for one (city, year, month) combination.
target_city, target_year, target_month = 'Agra', 2025, 4

# Fail loudly on a city the model was never trained on. Without this check the
# reindex below would discard the unknown city's dummy column and zero-fill
# every City_* feature, silently scoring the row as the dropped baseline city.
known_cities = set(crime_rate_df['City'])
if target_city not in known_cities:
    raise ValueError(
        f"Unknown city {target_city!r}; expected one of {sorted(known_cities)}"
    )

future_data = pd.DataFrame({
    'City': [target_city],
    'Year': [target_year],
    'Month': [target_month]
})

# One-hot encode the single row, then align it to the exact column layout the
# model was trained on (missing city indicators become 0).
future_data = pd.get_dummies(future_data)
future_data = future_data.reindex(columns=x.columns, fill_value=0)

future_prediction = model.predict(future_data)
# Round to the nearest whole count; a bare int() truncates toward zero and
# would under-report (e.g. 13.9 -> 13).
print("Predicted Crime Rate:", int(round(future_prediction[0])))

Predicted Crime Rate: 13


In [32]:
# Persist the fitted estimator to the working directory.
# NOTE(review): only the estimator is saved; whoever loads it must build inputs
# with the same one-hot column layout as `x`.
joblib.dump(value=model, filename="model.pkl")

['model.pkl']