In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [6]:
data = 'crime_dataset_india.csv'
data = pd.read_csv(data)


In [20]:
# Convert 'Date of Occurrence' to datetime, infer the format automatically
data['Date of Occurrence'] = pd.to_datetime(data['Date of Occurrence'], errors='coerce')

# Check for any rows where conversion failed (NaT indicates a failure)
print(data['Date of Occurrence'].isna().sum())


# Extract year and month for further analysis
data['Year'] = data['Date of Occurrence'].dt.year
data['Month'] = data['Date of Occurrence'].dt.month


# Group by City and Year to find crime counts per city per year
city_year_data = data.groupby(['City', 'Year']).size().reset_index(name='Crime Count')

# Display the aggregated data
city_year_data.head()

0


Unnamed: 0,City,Year,Crime Count
0,Agra,2020,178
1,Agra,2021,155
2,Agra,2022,166
3,Agra,2023,162
4,Agra,2024,103


In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Encode city names using LabelEncoder
le_city = LabelEncoder()
city_year_data['City'] = le_city.fit_transform(city_year_data['City'])

# Define features (City, Year) and target (Crime Count)
X = city_year_data[['City', 'Year']]
y = city_year_data['Crime Count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train, y_train)

# Predict crime count for the test set
y_pred = linear_reg_model.predict(X_test)



In [18]:
# Create a dataframe with future years (e.g., 2025) for each city
future_years = pd.DataFrame({
    'City': city_year_data['City'].unique(),
    'Year': [2025] * len(city_year_data['City'].unique())  # Predict for the year 2025
})

# Predict crime count for future years
future_predictions = linear_reg_model.predict(future_years)

# Map back to original city names
future_years['City'] = le_city.inverse_transform(future_years['City'])
future_years['Predicted Crime Count'] = future_predictions

# Display future crime rate predictions city-wise
future_years


Unnamed: 0,City,Year,Predicted Crime Count
0,Agra,2025,326.209539
1,Ahmedabad,2025,316.321392
2,Bangalore,2025,306.433244
3,Bhopal,2025,296.545097
4,Chennai,2025,286.656949
5,Delhi,2025,276.768802
6,Faridabad,2025,266.880654
7,Ghaziabad,2025,256.992507
8,Hyderabad,2025,247.10436
9,Indore,2025,237.216212


In [38]:
import pandas as pd

# Group data by City and Year, and count the number of crimes
city_year_data = data.groupby(['City', 'Year']).size().reset_index(name='Crime Count')

# Convert categorical variable 'City' to numerical values using one-hot encoding
city_year_data = pd.get_dummies(city_year_data, columns=['City'], drop_first=True)

# Features (City, Year) and target (Crime Count)
X = city_year_data.drop('Crime Count', axis=1)  # Features
y = city_year_data['Crime Count']  # Target

# Train-test split (80% train, 20% test)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Train the Linear Regression model
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

# Predict
y_pred_lr = linear_reg.predict(X_test)

# Evaluate the model
print("Linear Regression:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_lr)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_lr)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_lr))}")
print(f"R-squared: {r2_score(y_test, y_pred_lr)}")


Linear Regression:
MAE: 52.61107544073984
MSE: 4299.893596093998
RMSE: 65.57357391582373
R-squared: 0.942102951445407


In [40]:
from sklearn.ensemble import RandomForestRegressor

# Train the Random Forest Regressor model
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predict
y_pred_rf = rf_regressor.predict(X_test)

# Evaluate the model
print("Random Forest Regressor:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_rf)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_rf)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_rf))}")
print(f"R-squared: {r2_score(y_test, y_pred_rf)}")


Random Forest Regressor:
MAE: 32.71379310344827
MSE: 2475.8196896551713
RMSE: 49.7576093643492
R-squared: 0.9666636744419457


In [41]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Standardize the data (SVR requires scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVR model
svr = SVR(kernel='rbf')  # Radial basis function kernel
svr.fit(X_train_scaled, y_train)

# Predict
y_pred_svr = svr.predict(X_test_scaled)

# Evaluate the model
print("Support Vector Regressor (SVR):")
print(f"MAE: {mean_absolute_error(y_test, y_pred_svr)}")
print(f"MSE: {mean_squared_error(y_test, y_pred_svr)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_svr))}")
print(f"R-squared: {r2_score(y_test, y_pred_svr)}")


Support Vector Regressor (SVR):
MAE: 167.5838409488424
MSE: 85573.9804833384
RMSE: 292.53030694842266
R-squared: -0.15223337329887876


In [45]:
# Group by City and Year to count the number of crimes
city_year_data = data.groupby(['City', 'Year']).size().reset_index(name='Crime Count')

# One-hot encode the 'City' column
city_year_data_encoded = pd.get_dummies(city_year_data, columns=['City'], drop_first=True)

# Features and target variable
X = city_year_data_encoded.drop('Crime Count', axis=1)  # Features
y = city_year_data_encoded['Crime Count']  # Target variable


In [46]:
# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [47]:
# Train the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)


In [48]:
# Make predictions on the test data
y_pred_rf = rf_regressor.predict(X_test)

# Evaluate the model
print("Random Forest Regressor Performance:")
print(f"MAE: {mean_absolute_error(y_test, y_pred_rf)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_rf))}")
print(f"R-squared: {r2_score(y_test, y_pred_rf)}")


Random Forest Regressor Performance:
MAE: 32.71379310344827
RMSE: 49.7576093643492
R-squared: 0.9666636744419457


In [49]:
# Create a DataFrame for future predictions (e.g., year 2025)
unique_cities = city_year_data['City'].unique()
future_years = pd.DataFrame({
    'City': unique_cities,
    'Year': [2025] * len(unique_cities)  
})

# One-hot encode the 'City' column for future predictions
future_years_encoded = pd.get_dummies(future_years, columns=['City'], drop_first=True)

# Ensure the future data has the same columns as the training data
missing_cols = set(X.columns) - set(future_years_encoded.columns)
for col in missing_cols:
    future_years_encoded[col] = 0  # Add missing dummy columns with 0

# Ensure column order matches training data
future_years_encoded = future_years_encoded[X.columns]

# Predict crime count for future years using the Random Forest model
future_predictions = rf_regressor.predict(future_years_encoded)

# Add predictions to the DataFrame
future_years['Predicted Crime Count'] = future_predictions

# Display future crime rate predictions city-wise
print(future_years)


             City  Year  Predicted Crime Count
0            Agra  2025                  89.96
1       Ahmedabad  2025                 270.06
2       Bangalore  2025                 540.49
3          Bhopal  2025                  91.13
4         Chennai  2025                 382.60
5           Delhi  2025                 833.87
6       Faridabad  2025                  66.59
7       Ghaziabad  2025                  88.41
8       Hyderabad  2025                 410.99
9          Indore  2025                  91.59
10         Jaipur  2025                 217.38
11         Kalyan  2025                  54.85
12         Kanpur  2025                 174.77
13        Kolkata  2025                 525.56
14        Lucknow  2025                 201.55
15       Ludhiana  2025                  97.98
16         Meerut  2025                  88.43
17         Mumbai  2025                 696.13
18         Nagpur  2025                 197.38
19         Nashik  2025                  63.18
20          P