# Crime Statistics Resampling Techniques

### Import Dependencies

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

### Read the CSV and Perform Basic Data Cleaning

In [3]:
# Load the data
df = pd.read_csv('../Resources/CSV/crime_data.csv')

In [4]:
# Drop the year column to not skew data with year count and averages
df = df.drop(columns =['Year'])
df.head()

Unnamed: 0,County,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Austin County,5,0,8,53,121,283,13,61,417,482,29354
1,Austin County,5,0,6,43,136,239,31,49,406,466,29718
2,Austin County,5,0,11,34,98,183,45,45,326,376,29963
3,Austin County,5,0,11,32,82,138,29,43,249,295,29912
4,Austin County,5,0,5,38,80,147,29,43,256,302,30009


In [5]:
# Convert the target column values to violent and nonviolent based on their crime
x = dict.fromkeys(['Murder', 'Rape', 'Assault'], 'violent_crimes')    
df = df.replace(x)

x = dict.fromkeys(['Burglary', 'Larceny', 'Auto_Theft'], 'nonviolent_crimes')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,County,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,Austin County,5,0,8,53,121,283,13,61,417,482,29354
1,Austin County,5,0,6,43,136,239,31,49,406,466,29718
2,Austin County,5,0,11,34,98,183,45,45,326,376,29963
3,Austin County,5,0,11,32,82,138,29,43,249,295,29912
4,Austin County,5,0,5,38,80,147,29,43,256,302,30009


### Split the Data into Training and Testing

In [6]:
# Create dummies
df = pd.get_dummies(df.drop(columns='County'))
df.head()

Unnamed: 0,Agency_Count,Murder,Rape,Assault,Burglary,Larceny,Auto_Theft,Violent_Offenses,NonViolent_Offenses,Total_Crime,Population
0,5,0,8,53,121,283,13,61,417,482,29354
1,5,0,6,43,136,239,31,49,406,466,29718
2,5,0,11,34,98,183,45,45,326,376,29963
3,5,0,11,32,82,138,29,43,249,295,29912
4,5,0,5,38,80,147,29,43,256,302,30009


In [7]:
# Create target
y = pd.DataFrame(df["Population"])

# Create features
X = pd.DataFrame(df[['Agency_Count','Total_Crime']])

In [8]:
X.describe()

Unnamed: 0,Agency_Count,Total_Crime
count,102.0,102.0
mean,8.617647,14348.470588
std,10.573687,45215.295287
min,1.0,281.0
25%,3.0,820.75
50%,4.0,1166.0
75%,11.0,6549.5
max,45.0,197686.0


In [9]:
y.head()

Unnamed: 0,Population
0,29354
1,29718
2,29963
3,29912
4,30009


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(76, 2)

## Oversampling

### Naive Random Oversampling

In [11]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)

# resampling
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [12]:
# Resample the training data with the RandomOversampler
Counter(y_resampled)

Counter({'Population': 1})

In [13]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [14]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [15]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.0

In [16]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 

In [17]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      27379       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      27822       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      28219       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      28406       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      35066       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      36010       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      36674       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      41221       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      41445       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      46340       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      50195       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   

### SMOTE Oversampling

In [18]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Population': 1})

In [19]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [20]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [21]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.0

In [22]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 

In [23]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      27379       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      27822       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      28219       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      28406       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      35066       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      36010       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      36674       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      41221       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      41445       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      46340       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      50195       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   

## Undersampling

In [24]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)

X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Population': 1})

In [25]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=78)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [26]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [27]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.0

In [28]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0],
       [0, 0, 0, 0, 

In [29]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

      27379       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      27822       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      28219       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      28406       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      35066       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      36010       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      36674       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      41221       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      41445       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      46340       0.00      0.00      1.00      0.00      0.00      0.00       1.0
      50195       0.00      0.00      1.00      0.00      0.00      0.00       1.0
   

## Combination (Over and Under) Sampling

In [30]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)

X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled)

Counter({'Population': 1})

In [31]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required.

In [None]:
# Create Model Prediction
y_pred = model.predict(X_test)

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))