In [1]:
import re
import string
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.base import TransformerMixin

Load datasets:

In [2]:
train_dataset = pd.read_csv("../aux_data/mapped_dataset.csv")
print(len(train_dataset))
train_dataset.head()

9372


Unnamed: 0,text,date,category,language,class
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,visa_or_mastercard,en,card
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,reverted_card_payment?,en,card
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,extra_charge_on_statement,en,others
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,transfer_timing,en,transfer
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,transfer_timing,en,transfer


In [3]:
train_dataset = train_dataset.dropna()
print(len(train_dataset))

9338


In [4]:
test_dataset = pd.read_csv("../data/test_dataset.csv")
test_dataset.head()

Unnamed: 0,text
0,How do I locate my card?
1,Why won't my card show up on the app?
2,I need to know your exchange rates.
3,I purchased something in a foreign currency bu...
4,My statement has a dollar I have been charged ...


# 1. Filtering 

In [5]:
train_dataset = train_dataset[(train_dataset["language"] == "en")].drop(["category"], axis=1)
print(len(train_dataset))
train_dataset

9038


Unnamed: 0,text,date,language,class
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,en,card
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,en,card
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,en,others
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,en,transfer
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,en,transfer
...,...,...,...,...
9367,"good afternoon, I think someone may be using m...",22-6-2022,en,card
9368,"good morning, Help, I need to top up my accoun...",7-4-2022,en,cash
9369,"hi, I made an international purchasee, but the...",7-12-2022,en,card
9370,"hi, Why is my card not working anymore? Thanks",1-11-2022,en,card


# 2. Text Processing

## Text preprocessing

In [6]:
# Custom transformer to implement sentence cleaning and normalization
class TextCleanerTransformer(TransformerMixin):
    def __init__(self, tokenizer, stopwords, stemmer, regex_list, lower=True):
        self.regex_list = regex_list
        self.tokenizer = tokenizer
        self.stopwords = stopwords
        self.lower = lower
        self.stemmer = stemmer
        
    def transform(self, df, *_):
        X = list(map(self._clean_text, df))
        return X
    
    def _clean_text(self, text):
    
        # lowercase
        if self.lower:
            text = text.lower()

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Replace given regexes
        for regex in regex_list:
            text = re.sub(regex, "", text) 

        # Split sentence into list of words
        words = tokenizer.tokenize(text)

        words_ = []
        for word in words:        
            pattern = re.findall(r"(.)\1{2,}", word)

            # Remove more than 3 equal characters
            if pattern != []:
                for i in pattern:
                    word = re.sub(i+"{2,}", i+i, word)

            # Remove stopwords
            if word not in stopwords:
                # stemming
                words_.append(stemmer.stem(word))

        # Join list elements into string
        text = " ".join(words_)
                
        return text
    
    def fit(self, *_):
        return self

In [7]:
tokenizer = WordPunctTokenizer()
stopwords = stopwords.words('english')
stemmer = SnowballStemmer(language='english')

regex_list = ["hello", "hi", "good morning", "good afternoon", "good evening", "thanks", "best regards", "kind regards"]

def process_text(df, tokenizer, stopwords, stemmer, regex_list):
    cleaner = TextCleanerTransformer(tokenizer, regex_list, stopwords, stemmer)
    
    df_new = df.copy()
    df_new['clean_text'] = cleaner.transform(df_new.text)
    return df_new

In [8]:
clean_train_dataset = process_text(train_dataset, tokenizer, stopwords, stemmer, regex_list)
clean_train_dataset.head()

Unnamed: 0,text,date,language,class,clean_text
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,en,card,matter iff use visa mastercard
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,en,card,got refund purchas two week ago
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,en,others,got bill ann extra pound
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,en,transfer,long take transfer show account
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,en,transfer,use money sent accountt


In [10]:
clean_test_dataset = process_text(test_dataset, tokenizer, stopwords, stemmer, regex_list)
clean_test_dataset.head()

Unnamed: 0,text,clean_text
0,How do I locate my card?,locat card
1,Why won't my card show up on the app?,wont card show app
2,I need to know your exchange rates.,need know exchang rate
3,I purchased something in a foreign currency bu...,purchas sometng foreign currenc rate appli wrong
4,My statement has a dollar I have been charged ...,statement dollar charg show


## Adding extra features

In [12]:
characters_count = lambda text: len(text)
words_count = lambda text: len(tokenizer.tokenize(text))
unique_words_count = lambda text: len(set(tokenizer.tokenize(text)))

# add characters columns
clean_train_dataset["characters_count"] = list(map(characters_count, clean_train_dataset["text"]))
clean_train_dataset["clean_characters_count"] = list(map(characters_count, clean_train_dataset["clean_text"]))

# add word columns
clean_train_dataset["words_count"] = list(map(words_count, clean_train_dataset["text"]))
clean_train_dataset["clean_words_count"] = list(map(words_count, clean_train_dataset["clean_text"]))

clean_train_dataset["unique_words_count"] = list(map(unique_words_count, clean_train_dataset["text"]))
clean_train_dataset["clean_unique_words_count"] = list(map(unique_words_count, clean_train_dataset["clean_text"]))

clean_train_dataset

Unnamed: 0,text,date,language,class,clean_text,characters_count,clean_characters_count,words_count,clean_words_count,unique_words_count,clean_unique_words_count
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,en,card,matter iff use visa mastercard,58,30,13,5,13,5
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,en,card,got refund purchas two week ago,83,31,16,6,16,6
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,en,others,got bill ann extra pound,43,24,10,5,10,5
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,en,transfer,long take transfer show account,73,31,18,5,18,5
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,en,transfer,use money sent accountt,58,23,14,4,14,4
...,...,...,...,...,...,...,...,...,...,...,...
9367,"good afternoon, I think someone may be using m...",22-6-2022,en,card,tnk someon may use card,66,23,14,5,14,5
9368,"good morning, Help, I need to top up my accoun...",7-4-2022,en,cash,help need top account send check,86,32,22,6,20,6
9369,"hi, I made an international purchasee, but the...",7-12-2022,en,card,made intern purchase exchang rate wrong,77,39,15,6,14,6
9370,"hi, Why is my card not working anymore? Thanks",1-11-2022,en,card,card work anymor,46,16,11,3,11,3


In [13]:
def mean(df_train, columns_list):
    df = pd.DataFrame()

    df["class"] = df_train["class"].unique()
    df["rows"] = [count for count in df_train.groupby(['class'])["text"].count()]
    
    for i in columns_list:
        df[i] = [valeu for valeu in df_train.groupby(['class'])[i].sum()]
        df[i+"_mean"] = [int(df[i][l]/df["rows"][l]) for l in range(len(df))]
        
        df = df.drop(i, axis=1)
    
    return df

In [14]:
# new feature mean analysis: check if is better to keep the values based on text or clean_text column
columns_list = ["characters_count",  "words_count", "unique_words_count", "clean_characters_count",
                "clean_words_count", "clean_unique_words_count"]

mean = mean(clean_train_dataset, columns_list)
mean

Unnamed: 0,class,rows,characters_count_mean,words_count_mean,unique_words_count_mean,clean_characters_count_mean,clean_words_count_mean,clean_unique_words_count_mean
0,card,2966,76,16,16,29,5,5
1,others,1164,94,20,19,40,7,6
2,transfer,2982,79,17,16,31,5,5
3,cash,625,68,15,14,25,4,4
4,security,1301,97,20,19,42,7,6


In [15]:
# drop columns based on clean_text column
clean_train_dataset = clean_train_dataset.drop(["clean_words_count", "clean_characters_count", "clean_unique_words_count"], axis=1)
clean_train_dataset

Unnamed: 0,text,date,language,class,clean_text,characters_count,words_count,unique_words_count
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,en,card,matter iff use visa mastercard,58,13,13
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,en,card,got refund purchas two week ago,83,16,16
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,en,others,got bill ann extra pound,43,10,10
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,en,transfer,long take transfer show account,73,18,18
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,en,transfer,use money sent accountt,58,14,14
...,...,...,...,...,...,...,...,...
9367,"good afternoon, I think someone may be using m...",22-6-2022,en,card,tnk someon may use card,66,14,14
9368,"good morning, Help, I need to top up my accoun...",7-4-2022,en,cash,help need top account send check,86,22,20
9369,"hi, I made an international purchasee, but the...",7-12-2022,en,card,made intern purchase exchang rate wrong,77,15,14
9370,"hi, Why is my card not working anymore? Thanks",1-11-2022,en,card,card work anymor,46,11,11


In [17]:
clean_train_dataset.to_csv("../aux_data/train_dataset_preprocessed.csv", index=False)

In [18]:
# add characters columns
clean_test_dataset["characters_count"] = list(map(characters_count, clean_test_dataset["text"]))

# add word columns
clean_test_dataset["words_count"] = list(map(words_count, clean_test_dataset["text"]))
clean_test_dataset["unique_words_count"] = list(map(unique_words_count, clean_test_dataset["text"]))

In [19]:
clean_test_dataset.to_csv("../aux_data/test_dataset_preprocessed.csv", index=False)
clean_test_dataset

Unnamed: 0,text,clean_text,characters_count,words_count,unique_words_count
0,How do I locate my card?,locat card,24,7,7
1,Why won't my card show up on the app?,wont card show app,37,12,12
2,I need to know your exchange rates.,need know exchang rate,35,8,8
3,I purchased something in a foreign currency bu...,purchas sometng foreign currenc rate appli wrong,73,13,13
4,My statement has a dollar I have been charged ...,statement dollar charg show,63,14,14
...,...,...,...,...,...
1037,Can I use my card in Austria?,use card austria,29,8,8
1038,Do you have to be in the UK to get a card?,uk get card,42,13,12
1039,Can I use this all over the world?,use ts world,34,9,9
1040,Can I get a card even though I live in the US,get card even though live us,45,12,11
