In [1]:
import pandas as pd

# Locations of the raw CSV splits (adjust if the data lives elsewhere)
train_path = "datasets/train.csv"
test_path = "datasets/test.csv"

# Read each split into its own DataFrame
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview every split: first rows, then its (rows, columns) shape
print("Train Dataset Head:", train_df.head(), sep="\n")
print("\nTrain Dataset Shape:", train_df.shape)

print("\nTest Dataset Head:", test_df.head(), sep="\n")
print("\nTest Dataset Shape:", test_df.shape)

Train Dataset Head:
                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  
0  I had continue received random calls and abusi...  
1  The above fraudster is continuously messaging ...  
2  He is acting like a police and demanding for m...  
3  In apna Job I have applied for job interview f...  
4  I received a call from lady stating that she w...  

Train Dataset Shape: (93686, 3)

Test Dataset Head:
                                    category  \
0  RapeGang Rape RGRSexually Abusive Content   
1                     Onli

In [2]:
# Manual mapping of existing (raw) category labels to new, coarser categories.
# Every value is one of three groups: 'Women/Child Related Crime',
# 'Financial Fraud Crimes' or 'Other Cyber Crime'.
# Keys are looked up verbatim with dict.get, so they must match the raw label
# text exactly, including doubled spaces and run-together words.
# Labels that are not listed here fall back to 'Other Cyber Crime' where the
# mapping is applied.
category_mapping = {
    'Online and Social Media Related Crime': 'Women/Child Related Crime',
    'Online Financial Fraud': 'Financial Fraud Crimes',
    'Online Gambling  Betting': 'Other Cyber Crime',
    'RapeGang Rape RGRSexually Abusive Content': 'Women/Child Related Crime',
    'Any Other Cyber Crime': 'Other Cyber Crime',
    'Cyber Attack/ Dependent Crimes': 'Other Cyber Crime',
    'Cryptocurrency Crime': 'Other Cyber Crime',
    'Sexually Explicit Act': 'Women/Child Related Crime',
    'Sexually Obscene material': 'Women/Child Related Crime',
    'Hacking  Damage to computercomputer system etc': 'Other Cyber Crime',
    'Cyber Terrorism': 'Other Cyber Crime',
    'Child Pornography CPChild Sexual Abuse Material CSAM': 'Women/Child Related Crime',
    'Online Cyber Trafficking': 'Women/Child Related Crime',
    'Ransomware': 'Other Cyber Crime',
    'Report Unlawful Content': 'Other Cyber Crime'
}

# Derive 'new_category' for both splits; labels missing from the mapping
# (including NaN) become 'Other Cyber Crime'. A split without a 'category'
# column is left untouched.
for frame in (train_df, test_df):
    if 'category' not in frame.columns:
        continue
    frame['new_category'] = frame['category'].map(
        lambda label: category_mapping.get(label, 'Other Cyber Crime')
    )

# Show the raw and mapped category side by side for the first rows of each split
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\n{split_name} Dataset with New Categories:")
    print(frame[['category', 'new_category']].head())


Train Dataset with New Categories:
                                category               new_category
0  Online and Social Media Related Crime  Women/Child Related Crime
1                 Online Financial Fraud     Financial Fraud Crimes
2               Online Gambling  Betting          Other Cyber Crime
3  Online and Social Media Related Crime  Women/Child Related Crime
4                 Online Financial Fraud     Financial Fraud Crimes

Test Dataset with New Categories:
                                    category               new_category
0  RapeGang Rape RGRSexually Abusive Content  Women/Child Related Crime
1                     Online Financial Fraud     Financial Fraud Crimes
2             Cyber Attack/ Dependent Crimes          Other Cyber Crime
3                     Online Financial Fraud     Financial Fraud Crimes
4                      Any Other Cyber Crime          Other Cyber Crime


In [3]:
# Manual mapping of existing (raw) sub_category labels to new subcategories.
# Each value is a list of one or more new labels, so a single raw
# sub-category can map to several new ones.
# Keys are looked up verbatim with dict.get, so they must match the raw label
# text exactly, including doubled spaces and run-together words.
# Some value lists also carry category-level labels such as
# 'Financial Fraud Crimes' or 'Any Other Cyber Crime'.
# Raw labels that are not listed here (for example 'Other' or missing values)
# are resolved by the category-based fallback in map_to_subcategory.
subcategory_mapping = {
    'Cyber Bullying  Stalking  Sexting': ['Cyber Bullying/Stalking/Sexting'],
    'Fraud CallVishing': ['Fraud Call/Vishing', 'Email Phishing'],
    'Online Gambling  Betting': ['Online Gambling/Betting Fraud'],
    'Online Job Fraud': ['Online Job Fraud', 'Fraud Call/Vishing'],
    'UPI Related Frauds': ['UPI-Related Frauds', 'Financial Fraud Crimes'],
    'Internet Banking Related Fraud': ['Internet Banking-Related Fraud', 'Financial Fraud Crimes'],
    'Profile Hacking Identity Theft': ['Profile Hacking/Identity Theft', 'Cyber Bullying/Stalking/Sexting'],
    'DebitCredit Card FraudSim Swap Fraud': ['Debit/Credit Card Fraud', 'SIM Swap Fraud'],
    'EWallet Related Fraud': ['E-Wallet Related Frauds', 'Financial Fraud Crimes'],
    'Data Breach/Theft': ['Unauthorized Access/Data Breach', 'Data Breaches'],
    'Cheating by Impersonation': ['Cheating by Impersonation', 'Impersonating Email'],
    'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks': ['Denial of Service (DoS) and Distributed Denial of Service (DDoS) attacks'],
    'FakeImpersonating Profile': ['Fake/Impersonating Profile', 'Fraud Call/Vishing'],
    'Cryptocurrency Fraud': ['Cryptocurrency Crime'],
    'Malware Attack': ['Malware attacks', 'Hacking'],
    'Business Email CompromiseEmail Takeover': ['Business Email Compromise/Email Takeover'],
    'Email Hacking': ['Email Hacking', 'Profile Hacking/Identity Theft'],
    'Hacking/Defacement': ['Defacement/Hacking', 'Hacking'],
    'Unauthorized AccessData Breach': ['Unauthorized Access/Data Breach', 'Data Breaches'],
    'SQL Injection': ['Web application vulnerabilities'],
    'Provocative Speech for unlawful acts': ['Provocative Speech of Unlawful Acts'],
    'Ransomware Attack': ['Ransomware'],
    'Cyber Terrorism': ['Cyber Terrorism', 'Any Other Cyber Crime'],
    'Tampering with computer source documents': ['Tampering with Computer Source Documents'],
    'DematDepository Fraud': ['Demat/Depository Fraud'],
    'Online Trafficking': ['Online Cyber Trafficking'],
    'Online Matrimonial Fraud': ['Online Matrimonial Fraud'],
    'Website DefacementHacking': ['Defacement/Hacking'],
    'Damage to computer computer systems etc': ['Damage to Computer Systems'],
    'Impersonating Email': ['Impersonating Email', 'Email Phishing'],
    'EMail Phishing': ['Email Phishing'],
    'Ransomware': ['Ransomware'],
    'Intimidating Email': ['Intimidating Email'],
    'Against Interest of sovereignty or integrity of India': ['Cyber Terrorism']
}

# Function to map subcategories based on both existing subcategory and category
def map_to_subcategory(existing_subcategory, category=None, mapping=None):
    """Return the list of new sub-category labels for one record.

    Parameters
    ----------
    existing_subcategory
        Raw sub-category label; looked up verbatim in ``mapping``. Values that
        are not keys of the mapping (unknown labels, NaN, None) yield no
        direct match.
    category : str, optional
        Category label used only as a fallback when the sub-category lookup
        finds nothing. Non-string values (None, NaN, ...) disable the fallback.
    mapping : dict, optional
        Lookup table from raw sub-category label to a list of new labels.
        Defaults to the module-level ``subcategory_mapping``.

    Returns
    -------
    list
        A fresh list on every call: the mapped labels, else a single
        category-derived fallback label, else an empty list.
    """
    if mapping is None:
        mapping = subcategory_mapping

    # Copy the stored labels so rows never share (or mutate) the list objects
    # held inside the lookup table.
    subcategories = list(mapping.get(existing_subcategory, ()))

    # If no mapping found, fall back to a coarse label derived from the
    # category. The isinstance check keeps missing categories such as NaN
    # from raising on the substring tests below.
    if not subcategories and isinstance(category, str):
        if 'Financial' in category:
            subcategories.append('Financial Fraud Crimes')
        elif 'Women' in category or 'Child' in category:
            subcategories.append('Women/Child Related Crime')
        elif 'Cyber' in category:
            subcategories.append('Other Cyber Crime')

    return subcategories

# Build 'new_sub_category' row by row from the raw sub-category and the
# already-mapped category; a split lacking either input column is skipped.
for frame in (train_df, test_df):
    if {'sub_category', 'new_category'}.issubset(frame.columns):
        frame['new_sub_category'] = frame.apply(
            lambda record: map_to_subcategory(record['sub_category'], record['new_category']),
            axis=1,
        )

# Show the raw sub-category next to its mapped label list for each split
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\n{split_name} Dataset with New Subcategories:")
    print(frame[['sub_category', 'new_sub_category']].head())


Train Dataset with New Subcategories:
                        sub_category                        new_sub_category
0  Cyber Bullying  Stalking  Sexting       [Cyber Bullying/Stalking/Sexting]
1                  Fraud CallVishing    [Fraud Call/Vishing, Email Phishing]
2           Online Gambling  Betting         [Online Gambling/Betting Fraud]
3                   Online Job Fraud  [Online Job Fraud, Fraud Call/Vishing]
4                  Fraud CallVishing    [Fraud Call/Vishing, Email Phishing]

Test Dataset with New Subcategories:
                           sub_category  \
0                                   NaN   
1  DebitCredit Card FraudSim Swap Fraud   
2                         SQL Injection   
3                     Fraud CallVishing   
4                                 Other   

                            new_sub_category  
0                [Women/Child Related Crime]  
1  [Debit/Credit Card Fraud, SIM Swap Fraud]  
2          [Web application vulnerabilities]  
3       [Fraud

In [4]:
# Keep only the rows whose 'new_sub_category' list holds at least one label
train_df, test_df = (
    frame[frame['new_sub_category'].apply(lambda labels: len(labels) > 0)]
    for frame in (train_df, test_df)
)

# Report the (rows, columns) shape of each split after filtering
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\n{split_name} Dataset Shape after Cleaning:", frame.shape)


Train Dataset Shape after Cleaning: (93686, 5)

Test Dataset Shape after Cleaning: (31229, 5)


In [5]:
# Write both mapped splits to CSV in the working directory, omitting the index.
# NOTE(review): 'new_sub_category' holds Python lists, which CSV stores as
# their text form; readers presumably have to parse them back - confirm
# against whatever consumes these files.
for frame, out_name in ((train_df, "new_train.csv"), (test_df, "new_test.csv")):
    frame.to_csv(out_name, index=False)

print("Final datasets saved!")

Final datasets saved!
