In [None]:
pip install scikit-learn pandas joblib



In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Create the stemmer, then fetch the NLTK stop-word corpus and load the English list
stemmer = PorterStemmer()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the training data from CSV. The cells below read the 'crimeaditionalinfo',
# 'category' and 'sub_category' columns of this frame.
data = pd.read_csv('/content/train.csv')  # Hard-coded path; change it if your copy of train.csv lives elsewhere

In [None]:
# Step 1: Data Cleaning
# Keep only rows that have complaint text, then discard rows that are exact duplicates.
data = (
    data
    .dropna(subset=['crimeaditionalinfo'])
    .drop_duplicates()
)

In [None]:
# Print the number of rows per distinct value for each of the two label columns
for label_column in ('category', 'sub_category'):
    print(data[label_column].value_counts())

category
Online Financial Fraud                                  52496
Online and Social Media Related Crime                   12076
Any Other Cyber Crime                                   10811
Cyber Attack/ Dependent Crimes                           3608
Sexually Obscene material                                1764
Hacking  Damage to computercomputer system etc           1709
Sexually Explicit Act                                    1489
Cryptocurrency Crime                                      473
Online Gambling  Betting                                  444
Child Pornography CPChild Sexual Abuse Material CSAM      357
RapeGang Rape RGRSexually Abusive Content                 248
Online Cyber Trafficking                                  183
Cyber Terrorism                                           161
Ransomware                                                 56
Report Unlawful Content                                     1
Name: count, dtype: int64
sub_category
UPI Related Frauds    

In [None]:
# Step 2: Text Preprocessing
# Define a function to clean text
def clean_text(text):
    """Lowercase `text` and reduce it to space-separated alphabetic words.

    Every run of punctuation, digits, underscores or whitespace is replaced by a
    single space instead of being deleted, so words that were separated only by
    punctuation or numbers (e.g. "fraud,money") remain separate tokens.

    Args:
        text: The raw text. Any non-string value (None, NaN, ...) is treated as empty.

    Returns:
        The cleaned string without leading or trailing whitespace; '' if nothing is left.
    """
    if not isinstance(text, str):
        # Missing values carry no text; return an empty string rather than raising.
        return ''
    # \W covers everything that is not a letter, digit or underscore;
    # digits and underscores are listed explicitly so they are dropped as well.
    return re.sub(r'[\W\d_]+', ' ', text.lower()).strip()

In [None]:
# Run clean_text over every 'crimeaditionalinfo' entry and store the result in 'cleaned_text'
data['cleaned_text'] = data['crimeaditionalinfo'].map(clean_text)

In [None]:
# Tokenization, Stopword Removal, and Stemming
nltk.download('stopwords')
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)
# Punctuation is removed from the text before stop words are filtered, so a contraction
# such as "don't" may reach the filter without its apostrophe ("dont"). Add the
# punctuation-free spelling of every stop word so those forms are filtered as well.
# NOTE(review): depending on the NLTK list version, a stripped contraction may coincide
# with an ordinary word (e.g. "we'll" -> "well") -- confirm this is acceptable.
stop_words |= {re.sub(r"[^\w\s]", '', word) for word in english_stop_words}
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocess_text(text):
    """Drop stop words from whitespace-separated `text` and stem what remains.

    The text is tokenised with `str.split()`. Tokens found in the module-level
    `stop_words` collection are skipped, every other token is passed through
    `stemmer.stem`, and the stems are joined back together with single spaces.
    """
    kept_stems = []
    for word in text.split():
        if word in stop_words:
            continue
        kept_stems.append(stemmer.stem(word))
    return ' '.join(kept_stems)

In [None]:
# Stop-word removal and stemming for every cleaned document, stored in 'processed_text'
data['processed_text'] = [preprocess_text(document) for document in data['cleaned_text']]

In [None]:
# Split the processed text BEFORE fitting the vectorizer, so that the TF-IDF vocabulary
# and IDF weights are learned from the training rows only. Fitting on all rows would leak
# test-set statistics into the features and make the evaluation optimistic.
# test_size and random_state are the same as before.
y = data['category']  # Assuming we want to predict 'category'
text_train, text_test, y_train, y_test = train_test_split(
    data['processed_text'], y, test_size=0.2, random_state=42
)

# Vectorize the processed text: fit on the training split, then reuse the fitted
# vocabulary/IDF to transform the held-out split.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.9)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Feature matrix for all rows, kept under its original name for any cell that still uses X
X = vectorizer.transform(data['processed_text'])

In [None]:
# Train a Random Forest Classifier
# n_jobs=-1 builds the trees on all available CPU cores instead of one; the seed is unchanged.
classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
classifier.fit(X_train, y_train)

In [None]:
# Predict on the test set
y_pred = classifier.predict(X_test)

# Calculate accuracy and print classification report
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
# Some rare classes are never predicted, which leaves their precision undefined (0/0).
# zero_division=0 reports those scores as 0 explicitly instead of emitting
# UndefinedMetricWarning for each of them.
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.7412
Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.62      0.11      0.19      2116
Child Pornography CPChild Sexual Abuse Material CSAM       0.57      0.12      0.20        68
                                Cryptocurrency Crime       0.78      0.07      0.13        95
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       718
                                     Cyber Terrorism       0.00      0.00      0.00        44
      Hacking  Damage to computercomputer system etc       0.69      0.08      0.14       375
                            Online Cyber Trafficking       0.00      0.00      0.00        36
                              Online Financial Fraud       0.76      0.98      0.86     10520
                            Online Gambling  Betting       0.50      0.01      0.02        91
               Online and Social Media Related Crime       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
