In [2]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import re
import hashlib
import numpy as np
import spacy

from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm
from spacy.matcher import Matcher
from sklearn.metrics import accuracy_score, f1_score 
from nltk.tokenize import WordPunctTokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
# Load the raw message dataset from disk and preview its first five rows.
data_set = pd.read_csv(filepath_or_buffer='data/dataset.csv')
data_set.head(n=5)

Unnamed: 0,text,date,category,language
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,visa_or_mastercard,en
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,reverted_card_payment?,en
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,extra_charge_on_statement,en
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,transfer_timing,en
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,transfer_timing,en


In [4]:
# Print column dtypes and non-null counts to spot missing values early.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9375 entries, 0 to 9374
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      9373 non-null   object
 1   date      9373 non-null   object
 2   category  9373 non-null   object
 3   language  9373 non-null   object
dtypes: object(4)
memory usage: 293.1+ KB


In [5]:
# Relative frequency of each language code among the messages.
data_set['language'].value_counts(normalize=True)

en    0.967993
pt    0.032007
Name: language, dtype: float64

In [6]:
# Load the category -> class lookup table and preview its first five rows.
maping = pd.read_csv(filepath_or_buffer='data/mapping.csv')
maping.head(n=5)

Unnamed: 0,category,class
0,card_arrival,card
1,card_linking,card
2,exchange_rate,others
3,card_payment_wrong_exchange_rate,card
4,extra_charge_on_statement,others


In [7]:
# Count how many fine-grained categories fall under each coarse class.
maping['class'].value_counts()

card        27
others      26
transfer    10
cash         8
security     6
Name: class, dtype: int64

In [8]:
# Print column dtypes and non-null counts of the lookup table.
maping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  77 non-null     object
 1   class     77 non-null     object
dtypes: object(2)
memory usage: 1.3+ KB


In [9]:
# Attach the coarse `class` label to every message through its `category`.
# A left join keeps every row of `data_set`; categories that are absent from
# the lookup table end up with a missing `class`.
df = pd.merge(left=data_set, right=maping, on='category', how='left')

# Guard: a left join can only gain rows when the right-hand keys are duplicated,
# which would silently duplicate messages in everything derived from `df`.
assert len(df) == len(data_set), (
    "merge changed the row count; check mapping.csv for duplicated categories"
)
df

Unnamed: 0,text,date,category,language,class
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,visa_or_mastercard,en,card
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,reverted_card_payment?,en,card
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,extra_charge_on_statement,en,others
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,transfer_timing,en,transfer
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,transfer_timing,en,transfer
...,...,...,...,...,...
9370,"good afternoon, I think someone may be using m...",22-6-2022,compromised_card,en,card
9371,"good morning, Help, I need to top up my accoun...",7-4-2022,top_up_by_cash_or_cheque,en,cash
9372,"hi, I made an international purchasee, but the...",7-12-2022,card_payment_wrong_exchange_rate,en,card
9373,"hi, Why is my card not working anymore? Thanks",1-11-2022,card_not_working,en,card


In [10]:
# Print dtypes and non-null counts of the merged frame to see how many rows lack a `class`.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9375 entries, 0 to 9374
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      9373 non-null   object
 1   date      9373 non-null   object
 2   category  9373 non-null   object
 3   language  9373 non-null   object
 4   class     9338 non-null   object
dtypes: object(5)
memory usage: 439.5+ KB


In [11]:
# Number of missing values in each column of the merged frame.
df.isna().sum()

text         2
date         2
category     2
language     2
class       37
dtype: int64

In [12]:
# Inspect the rows that did not receive a `class` from the lookup table.
df.loc[df['class'].isna()]

Unnamed: 0,text,date,category,language,class
24,We have hope. Rebellions are built on hope.,2-2-2022,account_support,en,
429,,,,,
1155,"I'm one with the Force, and the Force will gui...",2-2-2022,account_support,en,
1305,Lesson one from this experience: Realize how m...,2-2-2022,account_support,en,
1459,The belonging you seek is not behind you; it i...,2-2-2022,account_support,en,
2023,I am one with the Force. The Force is with me.,2-2-2022,account_support,en,
2107,Try not. Do or do not. There is no try.,2-2-2022,account_support,en,
2176,That's not how the Force works!,2-2-2022,account_support,en,
2222,No one’s ever really gone.,2-2-2022,account_support,en,
2412,Never tell me the odds!,2-2-2022,account_support,en,


In [13]:
# Discard every row that has no `class` label.
df_clean = df.dropna(subset=['class'])

In [14]:
# Remove the rows whose language code is 'pt'; `.copy()` detaches the result
# from the frame it was sliced from.
df_clean = df_clean.loc[df_clean['language'].ne('pt')].copy()
df_clean

Unnamed: 0,text,date,category,language,class
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,visa_or_mastercard,en,card
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,reverted_card_payment?,en,card
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,extra_charge_on_statement,en,others
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,transfer_timing,en,transfer
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,transfer_timing,en,transfer
...,...,...,...,...,...
9370,"good afternoon, I think someone may be using m...",22-6-2022,compromised_card,en,card
9371,"good morning, Help, I need to top up my accoun...",7-4-2022,top_up_by_cash_or_cheque,en,cash
9372,"hi, I made an international purchasee, but the...",7-12-2022,card_payment_wrong_exchange_rate,en,card
9373,"hi, Why is my card not working anymore? Thanks",1-11-2022,card_not_working,en,card


In [18]:
# Keep only the message text and its class label.
data = df_clean.loc[:, ['text', 'class']]
data

Unnamed: 0,text,class
0,"Hello, Does it matter iff I use Visa or Master...",card
1,"Good afternoon, I just got refunded for my pur...",card
2,"Hello, I got billed ann extra pound! Thanks",others
3,"Hi, How long does it take for a transfer to sh...",transfer
4,"hi, When can I use money sent to my accountt? ...",transfer
...,...,...
9370,"good afternoon, I think someone may be using m...",card
9371,"good morning, Help, I need to top up my accoun...",cash
9372,"hi, I made an international purchasee, but the...",card
9373,"hi, Why is my card not working anymore? Thanks",card


In [19]:
# Persist the cleaned text/class pairs to disk without the index column.
data.to_csv(path_or_buf='data/data.csv', index=False)