In [1]:
# Data analysis tools
import re
import os
import joblib
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix

os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("final_train_dataset.csv")
df

Unnamed: 0,category,sub_category,sub_category_names,category_names,retagged_sub_category,retagged_category,content_processed
0,cyber attack/ dependent crimes,denial of service (dos)/distributed denial of ...,email_hacking,other_cyber_crime,email_hacking,hacking_damage_to_computercomputer_system_etc,i never imagined something like this could hap...
1,online financial fraud,ewallet related fraud,other,other_cyber_crime,other,any_other_cyber_crime,dear sir please refer the above information th...
2,cyber attack/ dependent crimes,tampering with computer source documents,unauthorised_accessdata_breach,other_cyber_crime,unauthorised_accessdata_breach,hacking_damage_to_computercomputer_system_etc,i never imagined something like this could hap...
3,online and social media related crime,profile hacking identity theft,other,other_cyber_crime,other,any_other_cyber_crime,ravi kant yadav
4,online and social media related crime,cyber bullying stalking sexting,other,other_cyber_crime,other,any_other_cyber_crime,whatsapp you tube please help me please you tube
...,...,...,...,...,...,...,...
86821,,,hacking_defacement,other_cyber_crime,hacking_defacement,cyber_attack_dependent_crimes,password kaliwell publisherfra slack ho c tiny...
86822,,,cyber_terrorism,other_cyber_crime,cyber_terrorism,cyber_terrorism,police officer xyz blogging deputywyonal062 to...
86823,,,ransomware_attack,other_cyber_crime,ransomware_attack,cyber_attack_dependent_crimes,badunicationtheta emails am receiving a report...
86824,,,email_phishing,other_cyber_crime,email_phishing,online_and_social_media_related_crime,nown others miliondoconnect g ng g i l nh l sm...


In [3]:
rows_to_remove = [
  "cyber_blackmailing_threatening",
  "backdoor_attacks",
  "dematdepository_fraud",
  "impersonating_email",
  "password_attacks",
  "attacks_on_applications_eg_e_governance_e_commerce_",
  "website_defacementhacking"
]

def filter_dataset(df, categories_to_remove):
    """
    Remove rows from a DataFrame where the retagged_sub_category matches any value in the given list
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame containing retagged_sub_category column
    categories_to_remove (list): List of categories to remove
    
    Returns:
    pandas.DataFrame: Filtered DataFrame with specified categories removed
    """
    # Create a mask for rows to keep (where category is not in the removal list)
    mask = ~df['retagged_sub_category'].isin(categories_to_remove)
    
    # Apply the mask and return the filtered DataFrame
    return df[mask]

df = filter_dataset(df, rows_to_remove)

In [4]:
df["retagged_sub_category"].value_counts()

retagged_sub_category
upi_related_frauds                                                  23930
debitcredit_card_fraudsim_swap_fraud                                 9184
internet_banking_related_fraud                                       8295
cyber_bullying_stalking_sexting                                      7001
unauthorised_accessdata_breach                                       5375
against_interest_of_sovereignty_or_integrity_of_india                4823
ewallet_related_fraud                                                4662
cheating_by_impersonation                                            3455
profile_hacking_identity_theft                                       2884
fraud_callvishing                                                    2566
online_job_fraud                                                     1711
hacking_defacement                                                   1069
other                                                                 997
denial_of_servic

In [5]:
master_mapper = {
    "any other cyber crime": {
        "other": [
            "other",
            "supply chain attacks"
        ]
    },
    "child pornography cpchild sexual abuse material csam": {
        "child pornography cpchild sexual abuse material csam": [
            "child pornography cpchild sexual abuse material csam"
        ]
    },
    "crime against women & children": {
        "sexual harassment": [
            "sexual harassment"
        ],
        "computer generated csam/csem": [
            "computer generated csam/csem"
        ]
    },
    "cryptocurrency crime": {
        "cryptocurrency fraud": [
            "cryptocurrency fraud"
        ]
    },
    "cyber attack/ dependent crimes": {
        "sql injection": [
            "sql injection"
        ],
        "ransomware attack": [
            "ransomware attack"
        ],
        "malware attack": [
            "malware attack",
            "malicious code attacks (specifically mentioning virus, worm, trojan, bots, spyware, cryptominers)"
        ],
        "data breach/theft": [
            "data breach/theft",
            "data leaks"
        ],
        "hacking/defacement": [
            "hacking/defacement",
            "zero-day exploits",
            "malicious mobile app attacks"
        ],
        "denial of service (dos)/distributed denial of service (ddos) attacks": [
            "denial of service (dos)/distributed denial of service (ddos) attacks"
        ],
        "tampering with computer source documents": [
            "tampering with computer source documents"
        ]
    },
    "cyber terrorism": {
        "cyber terrorism": [
            "cyber terrorism",
            "cyber espionage"
        ]
    },
    "hacking  damage to computercomputer system etc": {
        "email hacking": [
            "email hacking"
        ],
        "unauthorised accessdata breach": [
            "unauthorised accessdata breach",
            "compromise of critical systems/information",
            "targeted scanning/probing of critical networks/systems",
            "attacks on servers (database mail dns) and network devices (routers)",
            "attacks on critical infrastructure, scada, operational technology systems, and wireless networks",
            "attacks or suspicious activities affecting cloud computing systems servers software and applications",
            "attacks or malicious suspicious activities affecting systems related to big data blockchain virtual assets and robotics",
            "attacks on internet of things (iot) devices and associated systems, networks, and servers",
            "attacks on systems related to artificial intelligence (ai) and machine learning (ml)"
        ],
        "damage to computer computer systems etc": [
            "damage to computer computer systems etc"
        ],
        "website defacementhacking": [
            "web application vulnerabilities",
        ]
    },
    "online cyber trafficking": {
        "online trafficking": [
            "online trafficking"
        ]
    },
    "online financial fraud": {
        "upi related frauds": [
            "upi related frauds",
            "aadhar enabled payment system (aeps) fraud"
        ],
        "business email compromiseemail takeover": [
            "business email compromiseemail takeover"
        ],
        "debitcredit card fraudsim swap fraud": [
            "debitcredit card fraudsim swap fraud"
        ],
        "ewallet related fraud": [
            "ewallet related fraud"
        ],
        "fraud callvishing": [
            "fraud callvishing"
        ],
        "internet banking related fraud": [
            "internet banking related fraud",
            "attacks or incidents affecting digital payment systems"
        ]
    },
    "online gambling  betting": {
        "online gambling  betting": [
            "online gambling  betting"
        ]
    },
    "online and social media related crime": {
        "intimidating email": [
            "intimidating email"
        ],
        "provocative speech for unlawful acts": [
            "provocative speech for unlawful acts"
        ],
        "email phishing": [
            "email phishing"
        ],
        "online job fraud": [
            "online job fraud"
        ],
        "profile hacking identity theft": [
            "profile hacking identity theft",
            "identity theft, spoofing, and phishing attacks",
            "unauthorized social media access"
        ],
        "cheating by impersonation": [
            "cheating by impersonation",
            "fake mobile apps"
        ],
        "online matrimonial fraud": [
            "online matrimonial fraud"
        ],
        "cyber bullying  stalking  sexting": [
            "cyber bullying  stalking  sexting"
        ],
        "fakeimpersonating profile": [
            "fakeimpersonating profile"
        ]
    },
    "rapegang rape rgrsexually abusive content": {
        "rapegang rape rgrsexually abusive content": [
            "rapegang rape rgrsexually abusive content"
        ]
    },
    "report unlawful content": {
        "against interest of sovereignty or integrity of india": [
            "against interest of sovereignty or integrity of india",
            "disinformation or misinformation campaigns"
        ]
    },
    "sexually explicit act": {
        "sexually explicit act": [
            "sexually explicit act"
        ]
    },
    "sexually obscene material": {
        "sexually obscene material": [
            "sale publishing and transmitting obscene material/sexually explicit material",
            "sexually obscene material"
        ]
    }
}

def clean_json_mapping(json_mapping):
    """
    Cleans a JSON mapping by replacing special characters and spaces with * or _,
    and removing consecutive special characters.
    
    Args:
        json_mapping (dict): Input JSON mapping to clean
        
    Returns:
        dict: Cleaned JSON mapping
    """
    def clean_string(s):
        if not isinstance(s, str):
            return s
        
        # Replace spaces with underscore
        s = s.replace(' ', '_')
        
        # Replace special characters with asterisk
        s = re.sub(r'[^a-zA-Z0-9_.]', '*', s)
        s = s.replace(".","")
        
        # Remove consecutive special characters
        s = re.sub(r'[*_]+', lambda m: '_' if '_' in m.group() else '_', s)
        
        return s
    
    def process_value(value):
        if isinstance(value, dict):
            return {clean_string(k): process_value(v) for k, v in value.items()}
        elif isinstance(value, list):
            return [clean_string(item) for item in value]
        else:
            return clean_string(value)
    
    return process_value(json_mapping)

# Example usage with your mapping
master_mapper = clean_json_mapping(master_mapper)

In [6]:
class CyberCrimeClassifier:
    def __init__(self, df):
        # Load data
        self.df = df
        
        # Mappings
        self.category_names_to_category = clean_json_mapping({
            "women/child related crime": [
                "child pornography cpchild sexual abuse material csam",
                "crime against women & children",
                "online cyber trafficking",
                "rapegang rape rgrsexually abusive content",
                "sexually explicit act",
                "sexually obscene material"
            ],
            "financial fraud crimes": [
                "cryptocurrency crime",
                "online financial fraud",
                "online gambling  betting"
            ],
            "other cyber crime": [
                "any other cyber crime",
                "cyber attack/ dependent crimes",
                "cyber terrorism",
                "hacking  damage to computercomputer system etc",
                "online and social media related crime",
                "report unlawful content"
            ]
        })

        self.category_to_sub_category = clean_json_mapping({
            "any other cyber crime": [
                "other",
                "supply chain attacks"
            ],
            "child pornography cpchild sexual abuse material csam": [
                "child pornography cpchild sexual abuse material csam"
            ],
            "crime against women & children": [
                "sexual harassment",
                "computer generated csam/csem"
            ],
            "cryptocurrency crime": [
                "cryptocurrency fraud"
            ],
            "cyber attack/ dependent crimes": [
                "sql injection",
                "ransomware attack",
                "malware attack",
                "malicious code attacks (specifically mentioning virus, worm, trojan, bots, spyware, cryptominers)",
                "data breach/theft",
                "data leaks",
                "hacking/defacement",
                "zero-day exploits",
                "malicious mobile app attacks",
                "denial of service (dos)/distributed denial of service (ddos) attacks",
                "tampering with computer source documents"
            ],
            "cyber terrorism": [
                "cyber terrorism",
                "cyber espionage"
            ],
            "hacking  damage to computercomputer system etc": [
                "email hacking",
                "unauthorised accessdata breach",
                "compromise of critical systems/information",
                "targeted scanning/probing of critical networks/systems",
                "attacks on servers (database mail dns) and network devices (routers)",
                "attacks on critical infrastructure, scada, operational technology systems, and wireless networks",
                "attacks or suspicious activities affecting cloud computing systems servers software and applications",
                "attacks or malicious suspicious activities affecting systems related to big data blockchain virtual assets and robotics",
                "attacks on internet of things (iot) devices and associated systems, networks, and servers",
                "attacks on systems related to artificial intelligence (ai) and machine learning (ml)",
                "damage to computer computer systems etc",
                "web application vulnerabilities",
            ],
            "online cyber trafficking": [
                "online trafficking"
            ],
            "online financial fraud": [
                "upi related frauds",
                "aadhar enabled payment system (aeps) fraud",
                "business email compromiseemail takeover",
                "debitcredit card fraudsim swap fraud",
                "ewallet related fraud",
                "fraud callvishing",
                "internet banking related fraud",
                "attacks or incidents affecting digital payment systems"
            ],
            "online gambling  betting": [
                "online gambling  betting"
            ],
            "online and social media related crime": [
                "intimidating email",
                "provocative speech for unlawful acts",
                "email phishing",
                "online job fraud",
                "profile hacking identity theft",
                "identity theft, spoofing, and phishing attacks",
                "unauthorized social media access",
                "cheating by impersonation",
                "fake mobile apps",
                "online matrimonial fraud",
                "cyber bullying  stalking  sexting",
                "fakeimpersonating profile"
            ],
            "rapegang rape rgrsexually abusive content": [
                "rapegang rape rgrsexually abusive content"
            ],
            "report unlawful content": [
                "against interest of sovereignty or integrity of india",
                "disinformation or misinformation campaigns"
            ],
            "sexually explicit act": [
                "sexually explicit act"
            ],
            "sexually obscene material": [
                "sale publishing and transmitting obscene material/sexually explicit material"
            ]
        })
            

        self.encoder = SentenceTransformer('paraphrase-MiniLM-L3-v2')
        self.models = {}
        self.label_encoders = {}
        self.selectors = {}
        self.n_features = 100  # Set constant number of features

    def preprocess_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text

    def train_classification_model(self, X_balanced, y):
        """
        Enhanced training function with undersampling and fixed feature count
        
        Parameters:
        -----------
        X_resampled : array-like
            Feature matrix
        y : array-like
            Target labels
        model_name : str
            Name of the model
        model_type : str
            Type of model
        sampling_strategy : str or dict, default='auto'
            Sampling strategy for undersampling:
            - 'auto': All classes are sampled to match minority class
            - 'majority': Only majority class is undersampled
            - dict: Specific ratio for each class
        sampling_threshold : int, optional
            If set, only classes with more than this many samples will be undersampled
        """
        # Encode labels
        le = LabelEncoder()
        y_balanced = le.fit_transform(y)

        # Print initial class distribution
        unique, counts = np.unique(y, return_counts=True)
        class_distribution = dict(zip(unique, counts))
        print("\nInitial Class Distribution:")
        for cls, count in class_distribution.items():
            print(f"Class {cls}: {count} samples ({count/len(y)*100:.2f}%)")

        # Feature selection with fixed number of features
        selector = SelectKBest(score_func=f_classif, k=300)
        X_selected = selector.fit_transform(X_balanced, y_balanced)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_selected, y_balanced, 
            test_size=0.2,
            stratify=y_balanced,
            random_state=42
        )

        # Define models
        models = {
            'rf': RandomForestClassifier(
                n_estimators=200,
                max_depth=30,
                min_samples_split=5,
                min_samples_leaf=2,
                max_features='log2',
                class_weight='balanced',
                n_jobs=-1,
                random_state=42
            )
        }

        # Train and evaluate models
        best_score = 0
        best_model = None

        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            
            # Convert predictions back to original labels for reporting
            y_test_original = le.inverse_transform(y_test)
            y_pred_original = le.inverse_transform(y_pred)
            
            report = classification_report(y_test_original, y_pred_original, output_dict=True)
            weighted_f1 = report['weighted avg']['f1-score']
            
            if weighted_f1 > best_score:
                best_score = weighted_f1
                best_model = model

            print(f"Weighted F1-Score: {weighted_f1:.4f}")
            print("\nClassification Report:")
            print(classification_report(y_test_original, y_pred_original))
            
            cm = confusion_matrix(y_test_original, y_pred_original)
            print("\nConfusion Matrix:")
            print(cm)

        return best_model, le, selector

    def train_all_models(self, save_path='models/'):
        os.makedirs(save_path, exist_ok=True)
        
        # Vectorize all texts once
        print("Vectorizing all texts...")
        # all_vectors = self.encoder.encode(
        #     self.df['content_processed'].tolist(), 
        #     show_progress_bar=True
        # )

        #  Saving
        # np.save('embeddings.npy', all_vectors)

        all_vectors = np.load('embeddings.npy')
        
        # Train main category model
        print("\nTraining main category model...")
        main_model, main_le, main_selector = self.train_classification_model(
            all_vectors,
            self.df['category_names']
        )

        # Store and save main category model
        self.models['category_names'] = main_model
        self.label_encoders['category_names'] = main_le
        self.selectors['category_names'] = main_selector
        
        joblib.dump(main_model, f'{save_path}category_names_model.joblib')
        joblib.dump(main_le, f'{save_path}category_names_encoder.joblib')
        joblib.dump(main_selector, f'{save_path}category_names_selector.joblib')

        # Train category models for each main category
        for category_names in self.category_names_to_category:
            mask = self.df['category_names'] == category_names
            if mask.sum() > 0:
                print(f"\nTraining category model for {category_names}...")
                
                category_model, category_le, category_selector = self.train_classification_model(
                    all_vectors[mask],
                    self.df[mask]['retagged_category']
                )
                
                model_key = f'category_{category_names.replace(" ", "_").replace("/", "_").replace("&", "and")}'
                self.models[model_key] = category_model
                self.label_encoders[model_key] = category_le
                self.selectors[model_key] = category_selector
                
                joblib.dump(category_model, f'{save_path}{model_key}_model.joblib')
                joblib.dump(category_le, f'{save_path}{model_key}_encoder.joblib')
                joblib.dump(category_selector, f'{save_path}{model_key}_selector.joblib')

        # Train sub-category models
        for category in self.category_to_sub_category:
            mask = self.df['retagged_category'] == category
            if mask.sum() > 0 and len(self.df[mask]['sub_category_names'].unique()) > 1:
                print(f"\nTraining sub-category model for {category}...")
                
                sub_category_model, sub_category_le, sub_category_selector = self.train_classification_model(
                    all_vectors[mask],
                    self.df[mask]['sub_category_names']
                )
                
                model_key = f'sub_category_names_{category.replace(" ", "_").replace("/", "_").replace("&", "and")}'
                self.models[model_key] = sub_category_model
                self.label_encoders[model_key] = sub_category_le
                self.selectors[model_key] = sub_category_selector
                
                joblib.dump(sub_category_model, f'{save_path}{model_key}_model.joblib')
                joblib.dump(sub_category_le, f'{save_path}{model_key}_encoder.joblib')
                joblib.dump(sub_category_selector, f'{save_path}{model_key}_selector.joblib')

In [7]:
classifier = CyberCrimeClassifier(df)
classifier.train_all_models()

Vectorizing all texts...

Training main category model...

Initial Class Distribution:
Class financial_fraud_crimes: 50430 samples (58.39%)
Class other_cyber_crime: 33360 samples (38.63%)
Class women_child_related_crime: 2575 samples (2.98%)

Training rf...
Weighted F1-Score: 0.8095

Classification Report:
                           precision    recall  f1-score   support

   financial_fraud_crimes       0.84      0.91      0.88     10086
        other_cyber_crime       0.79      0.74      0.76      6672
women_child_related_crime       0.62      0.06      0.11       515

                 accuracy                           0.82     17273
                macro avg       0.75      0.57      0.58     17273
             weighted avg       0.81      0.82      0.81     17273


Confusion Matrix:
[[9186  899    1]
 [1690 4965   17]
 [  34  451   30]]

Training category model for women_child_related_crime...

Initial Class Distribution:
Class child_pornography_cpchild_sexual_abuse_material_csam: