In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the dataset (assuming the dataset is stored in a DataFrame called 'data')
# Replace 'data.csv' with the actual file path
data = pd.read_csv('crime.csv')

In [3]:
# Sample 1% of the dataset
sampled_data = data.sample(frac=0.01, random_state=42)

In [4]:
# Split the data into features (X) and target variable (y)
X = sampled_data['Descript']
y = sampled_data['Category']

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Vectorize the text data using TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [7]:
# Train a decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train_vec, y_train)

DecisionTreeClassifier()

In [8]:
# Evaluate the model
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

                             precision    recall  f1-score   support

                    ASSAULT       1.00      0.99      0.99       147
                 BAD CHECKS       0.75      1.00      0.86         3
                   BURGLARY       1.00      1.00      1.00        76
         DISORDERLY CONDUCT       1.00      1.00      1.00        10
DRIVING UNDER THE INFLUENCE       1.00      1.00      1.00         7
              DRUG/NARCOTIC       0.98      0.99      0.99       111
                DRUNKENNESS       1.00      1.00      1.00        16
            FAMILY OFFENSES       0.00      0.00      0.00         2
     FORGERY/COUNTERFEITING       1.00      0.95      0.98        21
                      FRAUD       1.00      1.00      1.00        31
                 KIDNAPPING       0.83      1.00      0.91         5
              LARCENY/THEFT       1.00      1.00      1.00       334
                LIQUOR LAWS       1.00      1.00      1.00         1
                  LOITERING      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Example prediction
new_description = ["Robbery at main street"]
new_description_vec = vectorizer.transform(new_description)
predicted_category = clf.predict(new_description_vec)
print("Predicted category:", predicted_category)

Predicted category: ['ROBBERY']
