In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,category,sub_category,crimeaditionalinfo
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...
...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,Identity theft Smishing SMS Fraud CreditDeb...
93682,Online Financial Fraud,EWallet Related Fraud,RECEIVED CALL FROM NUMBER ASKING ABOUT phone ...
93683,Online Financial Fraud,UPI Related Frauds,Cyber Stalking Blackmailing PhoneSMSVOIP C...
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,Call karke bola ki aapka lotary laga ha aru AC...


In [3]:
df = df.dropna(subset=['crimeaditionalinfo'])


In [4]:
print(df.shape)

(93665, 3)


In [5]:
df['sub_category'].isnull().sum()

np.int64(6591)

In [6]:
# Fill null values in the 'sub_category' column with 'Unknown'
df['sub_category'].fillna('Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sub_category'].fillna('Unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sub_category'].fillna('Unknown', inplace=True)


In [7]:
df['clean_text'] = df['crimeaditionalinfo'].str.lower()
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['crimeaditionalinfo'].str.lower()


Unnamed: 0,category,sub_category,crimeaditionalinfo,clean_text
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,i had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,the above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,he is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,in apna job i have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,i received a call from lady stating that she w...


In [8]:
def remove_punctuation(text):
    punctions = string.punctuation
    return text.translate(str.maketrans('','',punctions))

In [9]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuation(x))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuation(x))


Unnamed: 0,category,sub_category,crimeaditionalinfo,clean_text
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,i had continue received random calls and abusi...
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,the above fraudster is continuously messaging ...
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,he is acting like a police and demanding for m...
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,in apna job i have applied for job interview f...
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,i received a call from lady stating that she w...


#Preprocess

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)  # Additional resource for WordNet

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Tokenize, remove stop words, and lemmatize
def preprocess_text(text):
    wordnet_map = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'J': wordnet.ADJ, 'R': wordnet.ADV}
    tokens = word_tokenize(text)
    pos_text = pos_tag(tokens)
    tokens = [lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
              for word, pos in pos_text if word.lower() not in stop_words]
    return tokens

# Apply to the 'crimeaditionalinfo' column
#df['processed_info'] = df['crimeaditionalinfo'].apply(preprocess_text)



In [11]:
df['processed_info'] = df['clean_text'].apply(lambda x: preprocess_text(x))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_info'] = df['clean_text'].apply(lambda x: preprocess_text(x))


Unnamed: 0,category,sub_category,crimeaditionalinfo,clean_text,processed_info
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,i had continue received random calls and abusi...,"[continue, receive, random, call, abusive, mes..."
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,the above fraudster is continuously messaging ...,"[fraudster, continuously, message, ask, pay, m..."
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,he is acting like a police and demanding for m...,"[act, like, police, demanding, money, add, sec..."
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,in apna job i have applied for job interview f...,"[apna, job, apply, job, interview, telecalling..."
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,i received a call from lady stating that she w...,"[receive, call, lady, state, send, new, phone,..."


In [12]:
df

Unnamed: 0,category,sub_category,crimeaditionalinfo,clean_text,processed_info
0,Online and Social Media Related Crime,Cyber Bullying Stalking Sexting,I had continue received random calls and abusi...,i had continue received random calls and abusi...,"[continue, receive, random, call, abusive, mes..."
1,Online Financial Fraud,Fraud CallVishing,The above fraudster is continuously messaging ...,the above fraudster is continuously messaging ...,"[fraudster, continuously, message, ask, pay, m..."
2,Online Gambling Betting,Online Gambling Betting,He is acting like a police and demanding for m...,he is acting like a police and demanding for m...,"[act, like, police, demanding, money, add, sec..."
3,Online and Social Media Related Crime,Online Job Fraud,In apna Job I have applied for job interview f...,in apna job i have applied for job interview f...,"[apna, job, apply, job, interview, telecalling..."
4,Online Financial Fraud,Fraud CallVishing,I received a call from lady stating that she w...,i received a call from lady stating that she w...,"[receive, call, lady, state, send, new, phone,..."
...,...,...,...,...,...
93681,Online Financial Fraud,Internet Banking Related Fraud,Identity theft Smishing SMS Fraud CreditDeb...,identity theft smishing sms fraud creditdeb...,"[identity, theft, smishing, sms, fraud, credit..."
93682,Online Financial Fraud,EWallet Related Fraud,RECEIVED CALL FROM NUMBER ASKING ABOUT phone ...,received call from number asking about phone ...,"[receive, call, number, ask, phone, pay, cash,..."
93683,Online Financial Fraud,UPI Related Frauds,Cyber Stalking Blackmailing PhoneSMSVOIP C...,cyber stalking blackmailing phonesmsvoip c...,"[cyber, stalk, blackmail, phonesmsvoip, call, ..."
93684,Online and Social Media Related Crime,Online Matrimonial Fraud,Call karke bola ki aapka lotary laga ha aru AC...,call karke bola ki aapka lotary laga ha aru ac...,"[call, karke, bola, ki, aapka, lotary, laga, h..."


In [13]:
df = df[df['processed_info'].astype(bool)]

In [21]:
df1=df.drop(columns=['crimeaditionalinfo','clean_text'])

In [22]:
df1.to_csv('D:\Hackathon\docs\processed\pre_train.csv')

  df1.to_csv('D:\Hackathon\docs\processed\pre_train.csv')


category              0
sub_category          0
crimeaditionalinfo    0
clean_text            0
processed_info        0
dtype: int64