In [1]:
import pandas as pd
import numpy as np

# Load the dataset
data = pd.read_csv('crime.csv')

# Sample 1% of the dataset
data_sampled = data.sample(frac=0.01, random_state=42)

In [2]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text.split()

# Preprocess the Descript column
data_sampled['Descript'] = data_sampled['Descript'].apply(preprocess_text)

In [3]:
# Split the data into train and test sets
train_size = int(0.8 * len(data_sampled))
train_data = data_sampled[:train_size]
test_data = data_sampled[train_size:]

In [4]:
# Calculate prior probabilities
prior_probs = train_data['Category'].value_counts(normalize=True)

# Calculate likelihoods with Laplace smoothing
word_counts = {category: {} for category in train_data['Category'].unique()}
total_words = {category: 0 for category in train_data['Category'].unique()}

In [5]:
for index, row in train_data.iterrows():
    category = row['Category']
    for word in row['Descript']:
        word_counts[category][word] = word_counts[category].get(word, 0) + 1
        total_words[category] += 1

In [6]:
# Train the Naive Bayes model
class NaiveBayesClassifier:
    def __init__(self, prior_probs, word_counts, total_words):
        self.prior_probs = prior_probs
        self.word_counts = word_counts
        self.total_words = total_words

    def predict(self, description):
        log_probs = {category: np.log(self.prior_probs[category]) for category in self.prior_probs.index}
        for word in description:
            for category in self.word_counts.keys():
                log_probs[category] += np.log((self.word_counts[category].get(word, 0) + 1) / (self.total_words[category] + len(self.word_counts[category])))
        return max(log_probs, key=log_probs.get)

In [7]:
# Instantiate the Naive Bayes model
nb_model = NaiveBayesClassifier(prior_probs, word_counts, total_words)

In [8]:
# Predict on the test set
predictions = test_data['Descript'].apply(nb_model.predict)

In [9]:
# Calculate accuracy
accuracy = (predictions == test_data['Category']).mean()
print("Accuracy:", accuracy)

Accuracy: 0.791002277904328


In [16]:
# Predict the type of crime associated with the example description
example_description = "Printing fraudelent cash"
example_tokens = preprocess_text(example_description)
predicted_crime = nb_model.predict(example_tokens)
print("Predicted crime:", predicted_crime)

Predicted crime: EXTORTION
