In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [2]:
# Fetch the NLTK data used by the text preprocessing below: tokenizer models,
# the stop-word lists and the WordNet corpus.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on recent NLTK
# releases (3.9 and later); 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /home/rapidorc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rapidorc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rapidorc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the training CSV (path is relative to the current working directory).
# NOTE(review): `df_train` is not referenced by any later cell visible here; the
# processing section re-reads the same file into `df` — confirm whether this
# load is still needed.
df_train=pd.read_csv("train.csv")

# FUNCTION DEFINITION

In [7]:

# Lazily-built resources shared by every preprocess_text call, so the stop-word
# list is read and the lemmatizer constructed once instead of once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one piece of free text into a space-separated token string.

    Steps, in order: coerce to ``str``, lowercase, delete every character that
    is not an ASCII letter or whitespace, tokenize with ``word_tokenize``, drop
    English stop words, lemmatize each remaining token with
    ``WordNetLemmatizer`` (default settings), and join the tokens with single
    spaces.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned text as a ``str``. Missing input yields ``''`` rather than
        the literal tokens ``"none"`` / ``"nan"``.
    """
    # Missing values (None, or float NaN as pandas uses for empty cells) would
    # otherwise be stringified into a spurious "none"/"nan" token.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Characters are deleted, not replaced by a space, so "card-fraud" becomes
    # "cardfraud"; digits and punctuation disappear entirely.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

# MAPPING DICTIONARIES

In [8]:
# Category label -> integer code. Codes are assigned by position in this
# sequence (0 for the first label, 14 for the last), so the order is significant.
category_mapping = {
    label: code
    for code, label in enumerate((
        'Online Financial Fraud',
        'Online and Social Media Related Crime',
        'Any Other Cyber Crime',
        'Cyber Attack/ Dependent Crimes',
        'RapeGang Rape RGRSexually Abusive Content',
        'Sexually Obscene material',
        'Hacking  Damage to computercomputer system etc',
        'Sexually Explicit Act',
        'Cryptocurrency Crime',
        'Online Gambling  Betting',
        'Child Pornography CPChild Sexual Abuse Material CSAM',
        'Online Cyber Trafficking',
        'Cyber Terrorism',
        'Ransomware',
        'Report Unlawful Content',
    ))
}

# Sub-category label -> integer code. Codes are assigned by position in this
# sequence (0 for the first label, 34 for the last), so the order is significant.
subcategory_mapping = {
    label: code
    for code, label in enumerate((
        'UPI Related Frauds',
        'Other',
        'DebitCredit Card FraudSim Swap Fraud',
        'Internet Banking Related Fraud',
        'Fraud CallVishing',
        'Cyber Bullying  Stalking  Sexting',
        'EWallet Related Fraud',
        'FakeImpersonating Profile',
        'Profile Hacking Identity Theft',
        'Cheating by Impersonation',
        'Unauthorised AccessData Breach',
        'Online Job Fraud',
        'DematDepository Fraud',
        'Tampering with computer source documents',
        'Hacking/Defacement',
        'Ransomware Attack',
        'Malware Attack',
        'SQL Injection',
        'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks',
        'Data Breach/Theft',
        'Cryptocurrency Fraud',
        'Online Gambling  Betting',
        'Provocative Speech for unlawful acts',
        'Email Hacking',
        'Business Email CompromiseEmail Takeover',
        'Online Trafficking',
        'Cyber Terrorism',
        'EMail Phishing',
        'Online Matrimonial Fraud',
        'Damage to computer computer systems etc',
        'Website DefacementHacking',
        'Ransomware',
        'Impersonating Email',
        'Intimidating Email',
        'Against Interest of sovereignty or integrity of India',
    ))
}


# PROCESSING

In [9]:
# Read the training CSV into `df`; the next cell adds the processed-text and
# encoded-label columns to this frame and writes it back out.
df=pd.read_csv("train.csv")

In [10]:
# Clean the free-text description column with preprocess_text.
df['processed_text'] = df['crimeaditionalinfo'].apply(preprocess_text)

# Encode both label columns through their lookup dictionaries.
for label_col, encoded_col, lookup in (
    ('category', 'category_encoded', category_mapping),
    ('sub_category', 'subcategory_encoded', subcategory_mapping),
):
    df[encoded_col] = df[label_col].map(lookup)

# Save the augmented frame without the index column.
df.to_csv('Modified_Data.csv', index=False)