In [24]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [25]:
# Load datasets
train_data = pd.read_csv('train.csv')  
test_data = pd.read_csv('test.csv') 

In [26]:
train_data.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...


In [27]:
test_data.head()

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,RapeGang Rape RGRSexually Abusive Content,,Sir namaskar mein Ranjit Kumar PatraPaise neh...
1,Online Financial Fraud,DebitCredit Card FraudSim Swap Fraud,KOTAK MAHINDRA BANK FRAUD\r\nFRAUD AMOUNT
2,Cyber Attack/ Dependent Crimes,SQL Injection,The issue actually started when I got this ema...
3,Online Financial Fraud,Fraud CallVishing,I am amit kumar from karwi chitrakoot I am tot...
4,Any Other Cyber Crime,Other,I have ordered saree and blouse from rinki s...


In [28]:
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)  
        return text
    else:
        return ''

In [29]:
train_data['crimeaditionalinfo'] = train_data['crimeaditionalinfo'].apply(clean_text)
test_data['crimeaditionalinfo'] = test_data['crimeaditionalinfo'].apply(clean_text)

In [31]:
# Fill NaN values
train_data['sub_category'] = train_data['sub_category'].fillna('Unknown')  # Replace 'Unknown' with a suitable category
train_data['crimeaditionalinfo'] = train_data['crimeaditionalinfo'].fillna('')  # Replace with empty string or any suitable default text


In [33]:
print(train_data['sub_category'].value_counts())

sub_category
UPI Related Frauds                                                      26856
Other                                                                   10878
DebitCredit Card FraudSim Swap Fraud                                    10805
Internet Banking Related Fraud                                           8872
Unknown                                                                  6591
Fraud CallVishing                                                        5803
Cyber Bullying  Stalking  Sexting                                        4089
EWallet Related Fraud                                                    4047
FakeImpersonating Profile                                                2299
Profile Hacking Identity Theft                                           2073
Cheating by Impersonation                                                1988
Unauthorised AccessData Breach                                           1114
Online Job Fraud                                   

In [34]:
min_samples = 2

In [35]:
class_counts = train_data['sub_category'].value_counts()
classes_to_keep = class_counts[class_counts >= min_samples].index
train_data = train_data[train_data['sub_category'].isin(classes_to_keep)]

In [36]:
print(train_data['sub_category'].value_counts())

sub_category
UPI Related Frauds                                                      26856
Other                                                                   10878
DebitCredit Card FraudSim Swap Fraud                                    10805
Internet Banking Related Fraud                                           8872
Unknown                                                                  6591
Fraud CallVishing                                                        5803
Cyber Bullying  Stalking  Sexting                                        4089
EWallet Related Fraud                                                    4047
FakeImpersonating Profile                                                2299
Profile Hacking Identity Theft                                           2073
Cheating by Impersonation                                                1988
Unauthorised AccessData Breach                                           1114
Online Job Fraud                                   

In [37]:
train_data, val_data = train_test_split(train_data, test_size=0.33, random_state=42, stratify=train_data['sub_category'])

In [38]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [39]:
# Fit and transform the training data
X_train = tfidf_vectorizer.fit_transform(train_data['crimeaditionalinfo'])
X_val = tfidf_vectorizer.transform(val_data['crimeaditionalinfo'])
X_test = tfidf_vectorizer.transform(test_data['crimeaditionalinfo'])

In [40]:
# Target variables
y_train = train_data['sub_category']
y_val = val_data['sub_category']
y_test = test_data['sub_category']

In [42]:
# Function to train and evaluate model
def train_and_evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    
    print(f"Classification report for {model_name}:\n")
    print(classification_report(y_val, y_pred))

In [43]:
# Naive Bayes
naive_bayes_model = MultinomialNB()
train_and_evaluate_model(naive_bayes_model, X_train, y_train, X_val, y_val, "Naive Bayes")

Classification report for Naive Bayes:



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                      precision    recall  f1-score   support

                             Business Email CompromiseEmail Takeover       0.00      0.00      0.00        96
                                           Cheating by Impersonation       0.00      0.00      0.00       656
                                                Cryptocurrency Fraud       1.00      0.03      0.05       158
                                   Cyber Bullying  Stalking  Sexting       0.39      0.65      0.49      1349
                                                     Cyber Terrorism       0.00      0.00      0.00        53
                             Damage to computer computer systems etc       0.00      0.00      0.00        36
                                                   Data Breach/Theft       0.00      0.00      0.00       160
                                DebitCredit Card FraudSim Swap Fraud       0.61      0.54      0.57      3566
         

  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
decision_tree_model = DecisionTreeClassifier(random_state=42)
train_and_evaluate_model(decision_tree_model, X_train, y_train, X_val, y_val, "Decision Tree")

Classification report for Decision Tree:

                                                                      precision    recall  f1-score   support

                             Business Email CompromiseEmail Takeover       0.00      0.00      0.00        96
                                           Cheating by Impersonation       0.06      0.05      0.06       656
                                                Cryptocurrency Fraud       0.16      0.13      0.14       158
                                   Cyber Bullying  Stalking  Sexting       0.34      0.35      0.35      1349
                                                     Cyber Terrorism       0.00      0.00      0.00        53
                             Damage to computer computer systems etc       0.00      0.00      0.00        36
                                                   Data Breach/Theft       0.14      0.16      0.15       160
                                DebitCredit Card FraudSim Swap Fraud       0.

In [45]:
random_forest_model = RandomForestClassifier(random_state=42)
train_and_evaluate_model(random_forest_model, X_train, y_train, X_val, y_val, "Random Forest")

Classification report for Random Forest:



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                      precision    recall  f1-score   support

                             Business Email CompromiseEmail Takeover       0.00      0.00      0.00        96
                                           Cheating by Impersonation       0.44      0.01      0.01       656
                                                Cryptocurrency Fraud       0.92      0.07      0.13       158
                                   Cyber Bullying  Stalking  Sexting       0.48      0.54      0.51      1349
                                                     Cyber Terrorism       0.00      0.00      0.00        53
                             Damage to computer computer systems etc       0.00      0.00      0.00        36
                                                   Data Breach/Theft       0.13      0.15      0.14       160
                                DebitCredit Card FraudSim Swap Fraud       0.69      0.68      0.68      3566
         

  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
def evaluate_final_model(model, X_test, y_test, model_name):
    y_test = y_test.dropna()
    y_test_pred = model.predict(X_test[y_test.index])  
    print(f"Unique classes in y_test: {set(y_test)}")
    print(f"Unique classes in y_test_pred: {set(y_test_pred)}")
    unknown_classes = set(y_test_pred) - set(y_test)
    if unknown_classes:
        print(f"Warning: Unknown classes predicted: {unknown_classes}")
    valid_classes = set(y_test)  
    predicted_classes = set(y_test_pred)  
    all_classes = valid_classes.intersection(predicted_classes)
    if 'Unknown' in all_classes:
        all_classes.remove('Unknown')
    print(f"Final Classification report for {model_name} on Test Data:\n")
    print(classification_report(y_test, y_test_pred, labels=list(all_classes)))

In [55]:
evaluate_final_model(naive_bayes_model, X_test, y_test, "Naive Bayes")

Unique classes in y_test: {'Website DefacementHacking', 'Unauthorised AccessData Breach', 'Fraud CallVishing', 'UPI Related Frauds', 'Internet Banking Related Fraud', 'Online Matrimonial Fraud', 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks', 'Cyber Bullying  Stalking  Sexting', 'DematDepository Fraud', 'Provocative Speech for unlawful acts', 'EMail Phishing', 'Damage to computer computer systems etc', 'Other', 'Profile Hacking Identity Theft', 'Business Email CompromiseEmail Takeover', 'Cheating by Impersonation', 'Online Job Fraud', 'Online Trafficking', 'Tampering with computer source documents', 'FakeImpersonating Profile', 'Cyber Terrorism', 'Malware Attack', 'Impersonating Email', 'Computer Generated CSAM/CSEM', 'Hacking/Defacement', 'Online Gambling  Betting', 'Cryptocurrency Fraud', 'Intimidating Email', 'EWallet Related Fraud', 'SQL Injection', 'Ransomware Attack', 'DebitCredit Card FraudSim Swap Fraud', 'Sexual Harassment', 'Ransomware', 'Data Breach/T

In [56]:
evaluate_final_model(decision_tree_model, X_test, y_test, "Decision Tree")

Unique classes in y_test: {'Website DefacementHacking', 'Unauthorised AccessData Breach', 'Fraud CallVishing', 'UPI Related Frauds', 'Internet Banking Related Fraud', 'Online Matrimonial Fraud', 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks', 'Cyber Bullying  Stalking  Sexting', 'DematDepository Fraud', 'Provocative Speech for unlawful acts', 'EMail Phishing', 'Damage to computer computer systems etc', 'Other', 'Profile Hacking Identity Theft', 'Business Email CompromiseEmail Takeover', 'Cheating by Impersonation', 'Online Job Fraud', 'Online Trafficking', 'Tampering with computer source documents', 'FakeImpersonating Profile', 'Cyber Terrorism', 'Malware Attack', 'Impersonating Email', 'Computer Generated CSAM/CSEM', 'Hacking/Defacement', 'Online Gambling  Betting', 'Cryptocurrency Fraud', 'Intimidating Email', 'EWallet Related Fraud', 'SQL Injection', 'Ransomware Attack', 'DebitCredit Card FraudSim Swap Fraud', 'Sexual Harassment', 'Ransomware', 'Data Breach/T

In [57]:
evaluate_final_model(random_forest_model, X_test, y_test, "Random Forest")

Unique classes in y_test: {'Website DefacementHacking', 'Unauthorised AccessData Breach', 'Fraud CallVishing', 'UPI Related Frauds', 'Internet Banking Related Fraud', 'Online Matrimonial Fraud', 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks', 'Cyber Bullying  Stalking  Sexting', 'DematDepository Fraud', 'Provocative Speech for unlawful acts', 'EMail Phishing', 'Damage to computer computer systems etc', 'Other', 'Profile Hacking Identity Theft', 'Business Email CompromiseEmail Takeover', 'Cheating by Impersonation', 'Online Job Fraud', 'Online Trafficking', 'Tampering with computer source documents', 'FakeImpersonating Profile', 'Cyber Terrorism', 'Malware Attack', 'Impersonating Email', 'Computer Generated CSAM/CSEM', 'Hacking/Defacement', 'Online Gambling  Betting', 'Cryptocurrency Fraud', 'Intimidating Email', 'EWallet Related Fraud', 'SQL Injection', 'Ransomware Attack', 'DebitCredit Card FraudSim Swap Fraud', 'Sexual Harassment', 'Ransomware', 'Data Breach/T

In [60]:
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [61]:
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

In [62]:
train_and_evaluate_model(svm_model, X_train, y_train, X_val, y_val, "Support Vector Machine")

Classification report for Support Vector Machine:



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                                      precision    recall  f1-score   support

                             Business Email CompromiseEmail Takeover       0.00      0.00      0.00        96
                                           Cheating by Impersonation       0.15      0.01      0.01       656
                                                Cryptocurrency Fraud       0.59      0.44      0.51       158
                                   Cyber Bullying  Stalking  Sexting       0.48      0.54      0.50      1349
                                                     Cyber Terrorism       0.00      0.00      0.00        53
                             Damage to computer computer systems etc       0.00      0.00      0.00        36
                                                   Data Breach/Theft       0.21      0.03      0.04       160
                                DebitCredit Card FraudSim Swap Fraud       0.70      0.67      0.69      3566
         

  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
evaluate_final_model(svm_model, X_test, y_test, "Support Vector Machine")

Unique classes in y_test: {'Website DefacementHacking', 'Unauthorised AccessData Breach', 'Fraud CallVishing', 'UPI Related Frauds', 'Internet Banking Related Fraud', 'Online Matrimonial Fraud', 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks', 'Cyber Bullying  Stalking  Sexting', 'DematDepository Fraud', 'Provocative Speech for unlawful acts', 'EMail Phishing', 'Damage to computer computer systems etc', 'Other', 'Profile Hacking Identity Theft', 'Business Email CompromiseEmail Takeover', 'Cheating by Impersonation', 'Online Job Fraud', 'Online Trafficking', 'Tampering with computer source documents', 'FakeImpersonating Profile', 'Cyber Terrorism', 'Malware Attack', 'Impersonating Email', 'Computer Generated CSAM/CSEM', 'Hacking/Defacement', 'Online Gambling  Betting', 'Cryptocurrency Fraud', 'Intimidating Email', 'EWallet Related Fraud', 'SQL Injection', 'Ransomware Attack', 'DebitCredit Card FraudSim Swap Fraud', 'Sexual Harassment', 'Ransomware', 'Data Breach/T