## Importing Libraries and Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on:
#   punkt     - tokenizer models
#   wordnet   - lexical database used by WordNetLemmatizer
#   stopwords - stop-word lists
for resource in ("punkt", "wordnet", "stopwords"):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to /home/mmk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mmk/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mmk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
_STEMMER = None  # shared PorterStemmer, created on first use


def stem_token(token):
    """Return the Porter stem of a single token.

    The stemmer is created once and reused. This function is applied to
    every token of every row, so constructing a new ``PorterStemmer`` on
    each call is wasted work.

    Args:
        token: A single word to stem.

    Returns:
        The result of ``PorterStemmer.stem(token)``.
    """
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STEMMER.stem(token)


_LEMMATIZER = None  # shared WordNetLemmatizer, created on first use


def lemmatize_token(token):
    """Return the WordNet lemma of a single token.

    The lemmatizer is created once and reused instead of being rebuilt for
    every token. No part-of-speech is passed, so the lemmatizer's default
    is used.

    Args:
        token: A single word to lemmatize.

    Returns:
        The result of ``WordNetLemmatizer.lemmatize(token)``.
    """
    global _LEMMATIZER
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _LEMMATIZER.lemmatize(token)

In [3]:
# Load the full transaction narration dataset from CSV into memory.
df_original = pd.read_csv("Transaction_Naration_Data_Set.csv")

In [4]:
# Report the dimensions of the full dataset.
print("Shape: ", df_original.shape)
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called as a statement; wrapping it in print() emits a stray "None".
print("Info:")
df_original.info()

Shape:  (22546758, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22546758 entries, 0 to 22546757
Data columns (total 6 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   AC_ID       int64  
 2   AMOUNT      float64
 3   DOC_DATE    object 
 4   NARATION    object 
 5   DR_CR       object 
dtypes: float64(1), int64(2), object(3)
memory usage: 1.0+ GB
Info:
 None


In [5]:
# Randomly select 10000 rows from the DataFrame.
# A fixed random_state makes the sample, and every result derived from it,
# reproducible across kernel restarts.
df = df_original.sample(n=10000, random_state=42)
display(df.head())

Unnamed: 0.1,Unnamed: 0,AC_ID,AMOUNT,DOC_DATE,NARATION,DR_CR
14430320,14430320,261502,50000.0,20-FEB-23,Fund Transfer,Credit
6521172,6521172,1012031,75000.0,26-DEC-22,Cash Withdraw,Debit
7762946,7762946,465981,723.0,01-JAN-23,Cash Deposit to A/C No.:,Debit
14816907,14816907,3354419,20000.0,25-FEB-23,Debit Against Debit Card Transaction,Debit
8353357,8353357,81222,1000.0,04-JAN-23,Cash Deposit to A/C No.:,Debit


In [6]:
# Inspect column dtypes and non-null counts of the sampled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 14430320 to 11772679
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  10000 non-null  int64  
 1   AC_ID       10000 non-null  int64  
 2   AMOUNT      10000 non-null  float64
 3   DOC_DATE    10000 non-null  object 
 4   NARATION    10000 non-null  object 
 5   DR_CR       10000 non-null  object 
dtypes: float64(1), int64(2), object(3)
memory usage: 546.9+ KB


In [7]:
# Drop the redundant index column.
# errors="ignore" keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [8]:
# Convert DOC_DATE to datetime. The values look like "20-FEB-23", so the
# format is given explicitly: parsing is deterministic (no per-value format
# guessing) and a value in any other layout raises instead of being misread.
df["DOC_DATE"] = pd.to_datetime(df["DOC_DATE"], format="%d-%b-%y")

# Convert NARATION and DR_CR to string.
# NOTE(review): astype(str) turns a missing value into the literal "nan";
# the sample shows no nulls, but confirm this holds for the full dataset.
df["NARATION"] = df["NARATION"].astype(str)
df["DR_CR"] = df["DR_CR"].astype(str)

In [9]:
# Re-check dtypes after the type conversions in the previous cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 14430320 to 11772679
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   AC_ID     10000 non-null  int64         
 1   AMOUNT    10000 non-null  float64       
 2   DOC_DATE  10000 non-null  datetime64[ns]
 3   NARATION  10000 non-null  object        
 4   DR_CR     10000 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 468.8+ KB


In [10]:
# Frequency of each distinct narration text in the sample.
df["NARATION"].value_counts()

Credit Against Merchant Cash Withdraw Service From A/C-                                                        1055
Debit Against Palli Bidyut Bill Collection.                                                                    1037
Cash withdraw from Micro Merchant point, MM A/C                                                                1028
Cash Withdrawal From A/C No.:                                                                                  1019
Cash Withdraw                                                                                                   866
                                                                                                               ... 
PARTIAL  LOAN INSTALLMENT REALIZED                                                                                1
Hazera-                                                                                                           1
RLP RD PHASE Loan disbursement from A/C  Title: RLP RD PHASE LOAN FUND T

In [11]:
# Number of debit vs. credit rows in the sample.
df["DR_CR"].value_counts()

Debit     6097
Credit    3903
Name: DR_CR, dtype: int64

## Text Preprocessing

In [12]:
# Convert every narration to lower case.
df["NARATION"] = df.NARATION.str.lower()

In [13]:
# Delete every character that is not an ASCII letter or whitespace.
# Digits and symbols such as "/" and "-" are removed, so "a/c" becomes "ac".
df["NARATION"] = df["NARATION"].str.replace(
    pat=r"[^a-zA-Z\s]", repl="", regex=True
)

In [14]:
# Delete any character that is neither a word character nor whitespace.
# NOTE(review): the previous cell already leaves only ASCII letters and
# whitespace, so this pattern has nothing left to match at this point.
df["NARATION"] = df["NARATION"].str.replace(
    pat=r"[^\w\s]", repl="", regex=True
)

In [15]:
# Split each cleaned narration on whitespace into a list of tokens.
df["NARATION_tokens"] = df.NARATION.str.split()

In [16]:
# Stemming
# Stop words are filtered out *before* stemming: the stop-word list holds
# unstemmed words, so a stemmed stop word (e.g. "this" -> "thi") would no
# longer match it and would survive a filter applied afterwards.
df["stemmed_tokens"] = df["NARATION_tokens"].apply(
    lambda tokens: [
        stem_token(token) for token in tokens if token not in stop_words
    ]
)

In [17]:
# Lemmatization
# Stop words are filtered out *before* lemmatizing: a lemmatized stop word
# (e.g. "was" -> "wa") no longer matches the stop-word list and would
# survive a filter applied afterwards.
df["lemmatized_tokens"] = df["NARATION_tokens"].apply(
    lambda tokens: [
        lemmatize_token(token) for token in tokens if token not in stop_words
    ]
)

In [18]:
# Keep only the stemmed tokens that are not in the stop-word set.
df["stemmed_tokens"] = df["stemmed_tokens"].apply(
    lambda toks: list(filter(lambda tok: tok not in stop_words, toks))
)

In [19]:
# Keep only the lemmatized tokens that are not in the stop-word set.
df["lemmatized_tokens"] = df["lemmatized_tokens"].apply(
    lambda toks: list(filter(lambda tok: tok not in stop_words, toks))
)

In [20]:
# Preview the frame with the raw, stemmed, and lemmatized token columns.
df.head()

Unnamed: 0,AC_ID,AMOUNT,DOC_DATE,NARATION,DR_CR,NARATION_tokens,stemmed_tokens,lemmatized_tokens
14430320,261502,50000.0,2023-02-20,fund transfer,Credit,"[fund, transfer]","[fund, transfer]","[fund, transfer]"
6521172,1012031,75000.0,2022-12-26,cash withdraw,Debit,"[cash, withdraw]","[cash, withdraw]","[cash, withdraw]"
7762946,465981,723.0,2023-01-01,cash deposit to ac no,Debit,"[cash, deposit, to, ac, no]","[cash, deposit, ac]","[cash, deposit, ac]"
14816907,3354419,20000.0,2023-02-25,debit against debit card transaction,Debit,"[debit, against, debit, card, transaction]","[debit, debit, card, transact]","[debit, debit, card, transaction]"
8353357,81222,1000.0,2023-01-04,cash deposit to ac no,Debit,"[cash, deposit, to, ac, no]","[cash, deposit, ac]","[cash, deposit, ac]"
