In [1]:
# Install required libraries
!pip install pandas numpy nltk indic-nlp-library



In [2]:
# Import required libraries
import re
import unicodedata

import nltk
import pandas as pd
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used later for tokenization, stopword
# filtering and lemmatization (already-present packages are left as-is).
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

# Locations of the input CSV files
train_path = "new_train.csv"  # Update path
test_path = "new_test.csv"    # Update path

# Read each split into its own DataFrame
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview the first rows of both splits
print("Train Dataset:")
print(train_df.head())

print("\nTest Dataset:")
print(test_df.head())

[nltk_data] Downloading package punkt to /home/ailab3/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ailab3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ailab3/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/ailab3/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ailab3/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Train Dataset:
                                category                       sub_category  \
0  Online and Social Media Related Crime  Cyber Bullying  Stalking  Sexting   
1                 Online Financial Fraud                  Fraud CallVishing   
2               Online Gambling  Betting           Online Gambling  Betting   
3  Online and Social Media Related Crime                   Online Job Fraud   
4                 Online Financial Fraud                  Fraud CallVishing   

                                  crimeaditionalinfo  \
0  I had continue received random calls and abusi...   
1  The above fraudster is continuously messaging ...   
2  He is acting like a police and demanding for m...   
3  In apna Job I have applied for job interview f...   
4  I received a call from lady stating that she w...   

                new_category                            new_sub_category  
0  Women/Child Related Crime         ['Cyber Bullying/Stalking/Sexting']  
1     Financial Fraud C

In [3]:
# Remove the 'category' and 'sub_category' columns from both splits in place.
# errors='ignore' makes this a no-op for a frame that lacks them, so the cell
# can be re-run safely.
columns_to_remove = ['category', 'sub_category']
for df in (train_df, test_df):
    df.drop(columns=columns_to_remove, errors='ignore', inplace=True)

print("\nTrain Data After Dropping Categories:")
print(train_df.head())

print("\nTest Data After Dropping Categories:")
print(test_df.head())


Train Data After Dropping Categories:
                                  crimeaditionalinfo  \
0  I had continue received random calls and abusi...   
1  The above fraudster is continuously messaging ...   
2  He is acting like a police and demanding for m...   
3  In apna Job I have applied for job interview f...   
4  I received a call from lady stating that she w...   

                new_category                            new_sub_category  
0  Women/Child Related Crime         ['Cyber Bullying/Stalking/Sexting']  
1     Financial Fraud Crimes    ['Fraud Call/Vishing', 'Email Phishing']  
2          Other Cyber Crime           ['Online Gambling/Betting Fraud']  
3  Women/Child Related Crime  ['Online Job Fraud', 'Fraud Call/Vishing']  
4     Financial Fraud Crimes    ['Fraud Call/Vishing', 'Email Phishing']  

Test Data After Dropping Categories:
                                  crimeaditionalinfo  \
0  Sir namaskar  mein Ranjit Kumar PatraPaise neh...   
1          KOTAK MAHINDR

In [4]:
def transliterate_to_english(text):
    """Best-effort transliteration of ``text`` from 'hi' to 'en'.

    Non-string values are handed back untouched. If the transliterator raises
    for any reason, the original ``text`` is returned rather than propagating
    the error.

    NOTE(review): UnicodeIndicTransliterator appears to target Indic-to-Indic
    script conversion; confirm that 'en' is a supported target code, otherwise
    this step may return the text unchanged.
    """
    if not isinstance(text, str):
        return text
    try:
        return UnicodeIndicTransliterator.transliterate(text, 'hi', 'en')
    except Exception:
        # Deliberate fallback: keep the raw text when transliteration fails
        return text

In [5]:
def clean_text(text):
    """Normalize a free-text complaint into lowercase, space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. delete URLs (anything starting with ``http`` or ``www`` up to the
         next whitespace);
      3. delete every run of digits (this also removes phone numbers);
      4. replace punctuation / special characters (including ``_``) with a
         space, so neighbouring words are not fused together
         (``call/vishing`` -> ``call vishing``);
      5. collapse whitespace runs and trim the ends.

    Unicode combining marks (e.g. Devanagari vowel signs) are preserved in
    step 4; a plain ``[^\\w\\s]`` filter would strip them and mangle words
    written in Indic scripts.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()

    # `http\S+` already matches https URLs, so no separate https branch is needed
    text = re.sub(r'http\S+|www\S+', '', text)

    # Removes all digits, embedded ones included; a dedicated 10-digit
    # phone-number pass would be redundant with this.
    text = re.sub(r'\d+', '', text)

    def _is_word_char(ch):
        # Letters/numerics, whitespace, and combining marks (category M*) stay
        return ch.isalnum() or ch.isspace() or unicodedata.category(ch)[0] == 'M'

    # Everything else becomes a separator instead of being deleted outright
    text = ''.join(ch if _is_word_char(ch) else ' ' for ch in text)

    return re.sub(r'\s+', ' ', text).strip()

In [6]:
# Shared resources for preprocess_text: one WordNet lemmatizer instance and
# the English stopword list held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Drop stopwords from ``text`` and lemmatize what remains.

    The text is tokenized with ``word_tokenize``; tokens found in
    ``stop_words`` are skipped and the others are passed through
    ``lemmatizer.lemmatize``. The surviving tokens are joined with single
    spaces.

    If any step raises, the input is returned unchanged.
    """
    try:
        kept = []
        for token in word_tokenize(text):
            if token in stop_words:
                continue
            kept.append(lemmatizer.lemmatize(token))
        return " ".join(kept)
    except Exception:
        # Best-effort: fall back to the unprocessed input
        return text

In [7]:
# Build 'processed_text' from 'crimeaditionalinfo' for both splits.
# Stages run in order: transliteration -> cleaning -> stopword removal and
# lemmatization. The column is written after every stage.
for df in (train_df, test_df):
    if 'crimeaditionalinfo' not in df.columns:
        continue
    staged = df['crimeaditionalinfo']
    for stage in (transliterate_to_english, clean_text, preprocess_text):
        staged = staged.apply(stage)
        df['processed_text'] = staged

In [8]:
def tokenize_text(text):
    """Return the ``word_tokenize`` tokens of ``text``.

    The value is converted with ``str()`` first, so non-string inputs are
    tokenized by their string form.
    """
    as_string = str(text)
    return word_tokenize(as_string)

# Add a 'tokenized_text' column (list of tokens per row) wherever
# 'processed_text' is available.
for df in (train_df, test_df):
    if 'processed_text' not in df.columns:
        continue
    df['tokenized_text'] = df['processed_text'].apply(tokenize_text)

# Show the processed text next to its tokens for the first rows of each split
preview_columns = ['processed_text', 'tokenized_text']

print("\nTokenized Train Dataset:")
print(train_df[preview_columns].head())

print("\nTokenized Test Dataset:")
print(test_df[preview_columns].head())


Tokenized Train Dataset:
                                      processed_text  \
0  continue received random call abusive message ...   
1  fraudster continuously messaging asking pay mo...   
2  acting like police demanding money adding sect...   
3  apna job applied job interview telecalling res...   
4  received call lady stating send new phone vivo...   

                                      tokenized_text  
0  [continue, received, random, call, abusive, me...  
1  [fraudster, continuously, messaging, asking, p...  
2  [acting, like, police, demanding, money, addin...  
3  [apna, job, applied, job, interview, telecalli...  
4  [received, call, lady, stating, send, new, pho...  

Tokenized Test Dataset:
                                      processed_text  \
0  sir namaskar mein ranjit kumar patrapaise nehi...   
1             kotak mahindra bank fraud fraud amount   
2  issue actually started got email first glance ...   
3  amit kumar karwi chitrakoot totally depressed ...   
4 

In [9]:
# Write both processed splits to CSV without the index column.
# Note: 'tokenized_text' holds Python lists, which CSV stores as their string
# representation; they come back as plain strings when the file is re-read.
for _frame, _out_path in (
    (train_df, "processed_train.csv"),
    (test_df, "processed_test.csv"),
):
    _frame.to_csv(_out_path, index=False)

print("\nProcessed datasets saved as 'processed_train.csv' and 'processed_test.csv'")


Processed datasets saved as 'processed_train.csv' and 'processed_test.csv'
