In [None]:
pip install scikit-learn nltk



In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils import class_weight

In [None]:
# Fetch the NLTK stopword corpus required by stopwords.words()
nltk.download('stopwords')

# Shared helpers for preprocess_text: Porter stemmer and the English stopword set
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

# Load the training CSV. The columns used by this notebook are
# 'crimeaditionalinfo' (complaint text), 'category' and 'sub_category'.
# Step 1: Data Cleaning - drop rows without complaint text, then drop duplicate rows.
data = (
    pd.read_csv('/content/train.csv')
    .dropna(subset=['crimeaditionalinfo'])
    .drop_duplicates()
)

# Inspect the class distribution of both label columns
print(data['category'].value_counts())
print(data['sub_category'].value_counts())

category
Online Financial Fraud                                  18584
Online and Social Media Related Crime                    4219
Any Other Cyber Crime                                    3779
Cyber Attack/ Dependent Crimes                           1279
Sexually Obscene material                                 624
Hacking  Damage to computercomputer system etc            606
Sexually Explicit Act                                     555
Cryptocurrency Crime                                      174
Online Gambling  Betting                                  158
Child Pornography CPChild Sexual Abuse Material CSAM      121
RapeGang Rape RGRSexually Abusive Content                 101
Online Cyber Trafficking                                   67
Cyber Terrorism                                            56
Ransomware                                                 15
Name: count, dtype: int64
sub_category
UPI Related Frauds                                                      8335
Other  

In [None]:
# Step 2: Text Preprocessing
def clean_text(text):
    """Return ``text`` lowercased with punctuation and digits removed.

    Every character that is neither a word character nor whitespace is
    deleted, and so is every digit. Nothing is inserted in place of a
    removed character, so whitespace is left exactly as it was and tokens
    separated only by punctuation end up joined. Underscores count as word
    characters and are kept.
    """
    # A single pass suffices: each character is dropped independently of its neighbours.
    return re.sub(r'[^\w\s]|\d', '', text.lower())

# Clean every complaint text and store the result in a new 'cleaned_text' column
data['cleaned_text'] = data['crimeaditionalinfo'].map(clean_text)

In [None]:
def preprocess_text(text):
    """Remove stopwords from ``text`` and stem the remaining tokens.

    ``text`` is split on whitespace; tokens found in the notebook-level
    ``stop_words`` set are discarded and the rest are passed through the
    notebook-level ``stemmer``. The stemmed tokens are returned joined by
    single spaces.
    """
    return ' '.join(
        stemmer.stem(word) for word in text.split() if word not in stop_words
    )

In [None]:
# Stopword removal + stemming on the cleaned text, stored in 'processed_text'
data['processed_text'] = data['cleaned_text'].map(preprocess_text)

In [None]:
# Vectorize the processed text
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=2, max_df=0.9)
X = vectorizer.fit_transform(data['processed_text'])
y = data['category']  # Assuming we want to predict 'category'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Balanced per-class weights computed from the training labels to handle imbalance
train_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)
# Map each class label to its weight
class_weights_dict = {label: weight for label, weight in zip(train_classes, class_weights)}

In [None]:
# Check computed class weights (label -> weight mapping built from y_train)
print("Computed Class Weights:", class_weights_dict)

Computed Class Weights: {'Any Other Cyber Crime': 0.569317382125264, 'Child Pornography CPChild Sexual Abuse Material CSAM': 18.058035714285715, 'Cryptocurrency Crime': 12.562111801242237, 'Cyber Attack/ Dependent Crimes': 1.6765681127383254, 'Cyber Terrorism': 40.3156146179402, 'Hacking  Damage to computercomputer system etc': 3.567019400352734, 'Online Cyber Trafficking': 35.37900874635569, 'Online Financial Fraud': 0.11710155556413325, 'Online Gambling  Betting': 13.233369683751363, 'Online and Social Media Related Crime': 0.5077830780818479, 'Ransomware': 144.46428571428572, 'RapeGang Rape RGRSexually Abusive Content': 22.5139146567718, 'Sexually Explicit Act': 3.939935064935065, 'Sexually Obscene material': 3.460222412318221}


In [None]:
# SVM Model: linear-kernel SVC weighted with the per-class weights computed from y_train
svm_classifier = SVC(
    kernel='linear',
    class_weight=class_weights_dict,
    random_state=42,
)
svm_classifier.fit(X_train, y_train)

In [None]:
# Logistic Regression Model: class_weight='balanced' lets the estimator derive
# the class weights itself from the labels passed to fit()
log_reg_classifier = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)
log_reg_classifier.fit(X_train, y_train)

In [None]:
# Predictions of both fitted models on the test split
y_pred_svm, y_pred_log_reg = (
    model.predict(X_test) for model in (svm_classifier, log_reg_classifier)
)

In [None]:
# Evaluate SVM Model: overall accuracy plus per-class precision/recall/F1
print("SVM Model Evaluation:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}')
print('Classification Report:')
# zero_division=0 still reports 0.00 for classes the model never predicts, but
# suppresses the repeated UndefinedMetricWarning that the default ("warn") emits.
print(classification_report(y_test, y_pred_svm, zero_division=0))

SVM Model Evaluation:
Accuracy: 0.6157
Classification Report:
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.28      0.55      0.37       734
Child Pornography CPChild Sexual Abuse Material CSAM       0.32      0.24      0.27        25
                                Cryptocurrency Crime       0.27      0.61      0.37        36
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       245
                                     Cyber Terrorism       0.00      0.00      0.00        13
      Hacking  Damage to computercomputer system etc       0.23      0.55      0.32       120
                            Online Cyber Trafficking       0.00      0.00      0.00        18
                              Online Financial Fraud       0.95      0.68      0.79      3780
                            Online Gambling  Betting       0.04      0.07      0.05        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Evaluate Logistic Regression Model: overall accuracy plus per-class precision/recall/F1
print("Logistic Regression Model Evaluation:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_log_reg):.4f}')
print('Classification Report:')
# zero_division=0 still reports 0.00 for classes the model never predicts, but
# suppresses the repeated UndefinedMetricWarning that the default ("warn") emits.
print(classification_report(y_test, y_pred_log_reg, zero_division=0))

Logistic Regression Model Evaluation:
Accuracy: 0.6112
Classification Report:
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.29      0.49      0.37       734
Child Pornography CPChild Sexual Abuse Material CSAM       0.16      0.28      0.20        25
                                Cryptocurrency Crime       0.21      0.72      0.33        36
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00       245
                                     Cyber Terrorism       0.00      0.00      0.00        13
      Hacking  Damage to computercomputer system etc       0.21      0.60      0.31       120
                            Online Cyber Trafficking       0.00      0.00      0.00        18
                              Online Financial Fraud       0.95      0.69      0.80      3780
                            Online Gambling  Betting       0.05      0.26  