In [None]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora that the preprocessing step relies on
# (English stop words and the WordNet data used by the lemmatizer).
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Read the train and test splits from the mounted Drive folder
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

# The free-text field lives in 'crimeaditionalinfo' (not in a 'description' column)
description_column = 'crimeaditionalinfo'

# Preprocessing Function
# The lemmatizer and the stop-word set are built once and reused across calls;
# rebuilding them for every row of a DataFrame.apply is needlessly slow.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(lemmatizer, stop_words)``, creating both on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one free-text value for TF-IDF vectorisation.

    Steps: lowercase, strip every character that is neither a word character
    nor whitespace, drop English stop words, and lemmatize the remaining
    tokens with WordNet.

    Args:
        text: The raw value. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        The remaining lemmatized tokens joined by single spaces, or ``''``
        when the input is missing or contains no tokens.
    """
    # A missing value must not be stringified: str(float('nan')) is 'nan',
    # which would otherwise be fed to the vectorizer as a real token.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Check if text is a string, if not, convert to string
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^\w\s]', '', text.lower())  # Lowercase & Remove punctuation
    words = text.split()
    if not words:
        # Nothing to filter or lemmatize, so the NLTK resources are not needed.
        return ''
    lemmatizer, stop_words = _get_preprocess_resources()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)

# Apply preprocessing: add a cleaned copy of the free-text column to both splits
train_data['processed_text'] = train_data[description_column].apply(preprocess_text)
test_data['processed_text'] = test_data[description_column].apply(preprocess_text)

# Combine train and test data to fit LabelEncoder on all categories, so that
# transforming the test labels does not fail on a category absent from train.
all_categories = pd.concat([train_data['category'], test_data['category']])
label_encoder = LabelEncoder()
label_encoder.fit(all_categories)

# Encode Labels
y_train = label_encoder.transform(train_data['category'])
y_test = label_encoder.transform(test_data['category'])

# Vectorization and Model Pipeline: TF-IDF features (vocabulary capped at 5000
# terms) feeding a logistic-regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Train Model
pipeline.fit(train_data['processed_text'], y_train)

# NOTE(review): neither `pipeline` nor `label_encoder` is persisted in this cell.
# The serving code loads 'crime_classifier_model.pkl' and 'label_encoder.pkl',
# so those files have to be written separately (e.g. with joblib.dump).

# Predict and Evaluate
y_pred = pipeline.predict(test_data['processed_text'])

# Get unique labels in predictions and true values. The encoder was fitted on
# train + test categories, so some encoded classes may not occur here at all.
unique_labels = sorted(set(y_test) | set(y_pred))

# Filter target names to match unique labels
target_names = [label_encoder.classes_[i] for i in unique_labels]

# Generate classification report with filtered target names.
# zero_division=0 reports 0.0 for classes with no predicted samples without
# emitting an undefined-metric warning for each such class.
print(classification_report(
    y_test,
    y_pred,
    labels=unique_labels,
    target_names=target_names,
    zero_division=0,
))

# Save Predictions: map the encoded predictions back to category names
test_data['predicted_category'] = label_encoder.inverse_transform(y_pred)

# Check if 'id' column exists, if not, print available columns
if 'id' not in test_data.columns:
    print(f"'id' column not found. Available columns: {test_data.columns.tolist()}")
    # If 'id' is not present, you can use the index as 'id'
    test_data['id'] = test_data.index  # Assign index to 'id' column

# Now save the predictions with 'id'
test_data[['id', 'predicted_category']].to_csv('/content/predictions.csv', index=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.43      0.23      0.30      3670
Child Pornography CPChild Sexual Abuse Material CSAM       0.65      0.25      0.36       123
                      Crime Against Women & Children       0.00      0.00      0.00         4
                                Cryptocurrency Crime       0.68      0.44      0.53       166
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        52
      Hacking  Damage to computercomputer system etc       0.42      0.22      0.29       592
                            Online Cyber Trafficking       0.00      0.00      0.00        61
                              Online Financial Fraud       0.81      0.95      0.87     18896
                            Online Gambling  Betting       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the data-loading cell reads its CSV files from
# /content/drive/MyDrive, so on a fresh runtime this cell has to be executed
# before that one.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Optional
import joblib
import re

# FastAPI application object; the /classify/ route below is registered on it.
app = FastAPI()

# Load pre-trained model and label encoder
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code, so only load artifacts that come from a trusted source.
# NOTE(review): the training cell in this notebook does not write these two
# files; they have to be produced separately before the app is started.
# NOTE(review): when run in the same kernel as the training cell, this rebinds
# the `label_encoder` variable created there.
model = joblib.load('crime_classifier_model.pkl')  # Replace with actual model path
label_encoder = joblib.load('label_encoder.pkl')

# Define Request Schema
# Request body accepted by POST /classify/: an object with one string field.
class ReportRequest(BaseModel):
    # Free-text report description; classify_report preprocesses and classifies it.
    description: str

# Endpoint for Classifying Report
@app.post("/classify/")
def classify_report(report: ReportRequest):
    """Classify a single report description.

    The description is cleaned with ``preprocess_text``, passed to the loaded
    ``model`` as a one-element batch, and the predicted label is mapped back
    to its category name through ``label_encoder``.

    Args:
        report: Request body carrying the ``description`` string.

    Returns:
        A dict with the original ``description`` and the ``predicted_category``.
    """
    raw_description = report.description

    # Preprocessing here is expected to mirror what was done at training time.
    cleaned_description = preprocess_text(raw_description)

    # predict() works on a batch, so wrap the single document in a list.
    encoded_predictions = model.predict([cleaned_description])
    decoded_categories = label_encoder.inverse_transform(encoded_predictions)

    return {
        "description": raw_description,
        "predicted_category": decoded_categories[0],
    }

# Helper function to preprocess text
# The lemmatizer and stop-word set are created once and shared by all requests.
_SERVING_TEXT_RESOURCES = {}


def _get_serving_text_resources():
    """Return ``(lemmatizer, stop_words)``, creating both on first use."""
    if not _SERVING_TEXT_RESOURCES:
        # Imported here so this helper brings its own NLTK dependency into
        # scope. The 'stopwords' and 'wordnet' corpora must already be
        # installed (nltk.download) in the serving environment.
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer

        _SERVING_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _SERVING_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _SERVING_TEXT_RESOURCES['lemmatizer'], _SERVING_TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Clean a description the same way the training data was cleaned.

    The training preprocessing lowercases the text, removes punctuation, drops
    English stop words and lemmatizes each remaining token. It does not remove
    digits. This helper previously stripped digits and skipped the stop-word
    and lemmatization steps, so the model received text in a different form
    from the one it was fitted on.

    Args:
        text: Raw description string.

    Returns:
        The remaining lemmatized tokens joined by single spaces, or ``''``
        when the text contains no tokens.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()
    if not words:
        # Nothing to filter or lemmatize, so the NLTK resources are not needed.
        return ''
    lemmatizer, stop_words = _get_serving_text_resources()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)

# Run the app: uvicorn main:app --reload