# Crime Statistics Resampling Techniques

### Import Dependencies

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

### Read the CSV and Perform Basic Data Cleaning

In [3]:
# Load the data
df = pd.read_csv('../Resources/CSV/crime_data.csv')

In [4]:
# Drop the year column to not skew data with year count and averages
df = df.drop(columns =['Year'])
df.head()

Unnamed: 0,County,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Anderson County,3,9,38,195,319,749,75,242,1143,1407,57560
1,Anderson County,3,2,44,237,259,485,94,283,838,1145,57250
2,Anderson County,3,2,25,153,278,536,64,180,878,1078,57569
3,Anderson County,3,1,8,96,198,531,62,105,791,907,57491
4,Anderson County,3,2,23,100,249,514,82,125,845,984,57657


In [5]:
# Convert the target column values to violent and nonviolent based on their crime
x = dict.fromkeys(['Murder', 'Rape', 'Assault'], 'violent_crimes')    
df = df.replace(x)

x = dict.fromkeys(['Burglary', 'Larceny', 'Auto_Theft'], 'nonviolent_crimes')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,County,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Anderson County,3,9,38,195,319,749,75,242,1143,1407,57560
1,Anderson County,3,2,44,237,259,485,94,283,838,1145,57250
2,Anderson County,3,2,25,153,278,536,64,180,878,1078,57569
3,Anderson County,3,1,8,96,198,531,62,105,791,907,57491
4,Anderson County,3,2,23,100,249,514,82,125,845,984,57657


### Split the Data into Training and Testing

In [6]:
# Create dummies
df = pd.get_dummies(df.drop(columns='County'))
df.head()

Unnamed: 0,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,3,9,38,195,319,749,75,242,1143,1407,57560
1,3,2,44,237,259,485,94,283,838,1145,57250
2,3,2,25,153,278,536,64,180,878,1078,57569
3,3,1,8,96,198,531,62,105,791,907,57491
4,3,2,23,100,249,514,82,125,845,984,57657


In [7]:
# Create target
y = pd.DataFrame(df["Population"])

# Create features
X = pd.DataFrame(df[['Agency_Count','Total_Crime']])

In [8]:
X.describe()

Unnamed: 0,Agency_Count,Total_Crime
count,1512.0,1512.0
mean,4.108466,3320.835317
std,5.483518,15661.906562
min,1.0,0.0
25%,2.0,65.75
50%,3.0,313.0
75%,4.0,1068.75
max,45.0,197686.0


In [9]:
y.head()

Unnamed: 0,Population
0,57560
1,57250
2,57569
3,57491
4,57657


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(1134, 2)

## Oversampling

### Naive Random Oversampling

In [11]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)

# resampling
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [12]:
# Resample the training data with the RandomOversampler
Counter(y_resampled)

Counter({'Population': 1})

In [13]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [14]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [15]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.0

In [16]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [17]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         88       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        121       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        257       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        329       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        402       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        404       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        442       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        631       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        653       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        711       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        754       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   

### SMOTE Oversampling

In [18]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)

Counter(y_resampled)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6

In [None]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

In [None]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [None]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

## Undersampling

In [24]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)

X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Population': 1})

In [25]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=78)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [26]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [27]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.0

In [28]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         88       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        121       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        257       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        329       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        402       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        404       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        442       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        631       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        653       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        711       0.00      0.00      1.00      0.00      0.00      0.00       1.0
        754       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   

## Combination (Over and Under) Sampling

In [30]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)

X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6

In [None]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

In [None]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))