In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin, clone

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, precision_score, recall_score

### Load dataset:

In [2]:
# Load the preprocessed training set; the path is relative to the notebook's working directory.
train_dataset = pd.read_csv("../aux_data/train_dataset_preprocessed-1.csv")
# Preview the first rows to check the columns that were read.
train_dataset.head()

Unnamed: 0,text,date,language,class,clean_text,characters_count,words_count,unique_words_count
0,"Hello, Does it matter iff I use Visa or Master...",7-6-2022,en,card,matter iff use visa mastercard,58,13,13
1,"Good afternoon, I just got refunded for my pur...",16-11-2022,en,card,got refund purchas two week ago,83,16,16
2,"Hello, I got billed ann extra pound! Thanks",4-12-2022,en,others,got bill ann extra pound,43,10,10
3,"Hi, How long does it take for a transfer to sh...",23-11-2022,en,transfer,long take transfer show account,73,18,18
4,"hi, When can I use money sent to my accountt? ...",17-4-2022,en,transfer,use money sent accountt,58,14,14


In [3]:
# Number of rows in the frame as currently bound to `train_dataset`.
print(len(train_dataset))

9038


In [4]:
# Number of rows that would remain after dropping rows with any missing value
# (dropna() returns a new frame; `train_dataset` is not modified here).
print(len(train_dataset.dropna()))

9037


In [5]:
# Drop every row that has a missing value in any column.
# NOTE: this rebinds `train_dataset`, so cells that ran earlier now show stale output
# and re-running them afterwards reports the reduced frame.
train_dataset = train_dataset.dropna()

### Splitting data: train/validation

In [6]:
# Train and validation data: hold out 30% of the rows for validation.
# random_state pins the shuffle so the same split is produced on every run.
train, val = train_test_split(train_dataset, test_size=0.3, random_state=42)

In [7]:
# Display the training split (rows keep their index labels from `train_dataset`).
train

Unnamed: 0,text,date,language,class,clean_text,characters_count,words_count,unique_words_count
322,"Hi, Why is the exchange rate so high? This sho...",18-11-2022,en,cash,exchang rate gh ts beenn much gher amount cash...,115,26,26
2032,"good morning, What is the closest ATM? Thanks",19-3-2022,en,others,closest atm,45,10,10
994,"Good afternoon, Tell myy why myy top-up is pen...",8-11-2022,en,others,tell myy myy topup pend,58,14,13
8928,"Good afternoon, I think something is wrong wit...",30-7-2022,en,others,tnk sometng wrong top ive beenn wait littl hou...,171,39,35
7621,"hi, top up withh credit card Kind Regards",18-12-2022,en,card,top withh credit card,41,9,9
...,...,...,...,...,...,...,...,...
5735,"hello, AA seller is stating that they haven't ...",5-3-2022,en,card,aa seller state havent receiv money yet money ...,124,25,24
5192,"Good evening, Can friends transfer money to ea...",2-11-2022,en,others,friend transfer money,68,13,13
5391,"hi, Why hasn't my transaction arrived? Kind Re...",25-12-2022,en,transfer,hasnt transact arriv,51,12,12
861,"Hello, Please check my payment from last Satur...",3-6-2022,en,card,pleas check payment last saturday feel overcha...,131,26,24


### Useful selectors (From BLU09)

In [8]:
class Selector(BaseEstimator, TransformerMixin):
    """
    Base transformer that remembers which dataframe column to pick.

    It only stores the column key and provides a no-op ``fit``; the
    subclasses supply ``transform``.
    """
    def __init__(self, key):
        # Column label that the subclasses' ``transform`` will look up.
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

class TextSelector(Selector):
    """
    Selector for text columns.

    ``transform`` indexes ``X`` with the bare key (``X[key]``), which for a
    dataframe yields the single column rather than a one-column frame.
    """
    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected

class NumberSelector(Selector):
    """
    Selector for numeric columns.

    ``transform`` indexes ``X`` with a one-element list (``X[[key]]``), which
    for a dataframe keeps the result two-dimensional (a one-column frame).
    """
    def transform(self, X):
        """Return ``X[[self.key]]``."""
        wanted = [self.key]
        return X[wanted]

# 1. Multiclass model

## 1.1. Classifier

### Text baseline

In [9]:
# Build the pipeline
# Feature branch: pick the "clean_text" column, then turn it into TF-IDF features.
text = Pipeline(steps=[
    ('selector', TextSelector("clean_text")),
    ('tfidf', TfidfVectorizer()),
])

# Full model: the TF-IDF features feed a random forest with a fixed seed.
text_pipeline = Pipeline(steps=[
    ('features', text),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [10]:
# Train the classifier
# clone() gives a fresh, unfitted copy, so the `text_pipeline` template itself is never fitted.
text_classifier = clone(text_pipeline)
# The whole frame is passed in; the pipeline's selector step picks the column it needs.
text_classifier.fit(train, train['class'].values)

In [11]:
# Predict a class label for every row of the validation split.
text_predicted = text_classifier.predict(val)

### Combining text and numerical columns

In [12]:
# Build the pipeline
numerical_columns = ["characters_count", "words_count", "unique_words_count"]

# One branch per numeric column: select the column, then standardise it.
# Each branch is registered under the column's own name.
numeric_steps = []
for feature in numerical_columns:
    pipeline_step = Pipeline(steps=[
        (feature, NumberSelector(feature)),
        (f'scaled_{feature}', StandardScaler()),
    ])
    numeric_steps.append((feature, pipeline_step))

# The TF-IDF text branch goes after the numeric branches in the union.
combined_steps = [*numeric_steps, ('text', text)]
combined_features = FeatureUnion(transformer_list=combined_steps)

combined_pipeline = Pipeline(steps=[
    ('features', combined_features),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [13]:
# Train the classifier
# clone() gives a fresh, unfitted copy, so the `combined_pipeline` template itself is never fitted.
combined_classifier = clone(combined_pipeline)
# The whole frame is passed in; each branch's selector picks its own column.
combined_classifier.fit(train, train["class"].values)

In [14]:
# Predict a class label for every row of the validation split.
combined_predicted = combined_classifier.predict(val)

In [15]:
# Display the predicted labels.
combined_predicted

array(['card', 'others', 'others', ..., 'card', 'card', 'card'],
      dtype=object)

## 1.2. Metrics

#### Calculate metrics for each label and find their unweighted mean

In [16]:
def get_and_print_macro_metrics(true_labels, predictions, average):
    """
    Print precision, recall and F1 of ``predictions`` against ``true_labels``.

    ``average`` is forwarded unchanged to the three scikit-learn scorers, so
    despite the function name any averaging mode they accept can be used.
    Nothing is returned; the scores are only printed.
    """
    scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )

    # Compute every score before printing, so a failing scorer leaves no partial report.
    scores = [
        (title, scorer(true_labels, predictions, average=average))
        for title, scorer in scorers
    ]

    for title, value in scores:
        print(f"{title}: {value}")

In [17]:
print("---- Text based classifier metrics ----")
# Macro average: per-class scores on the validation split, averaged without class weights.
get_and_print_macro_metrics(val['class'], text_predicted, "macro")

---- Text based classifier metrics ----
Precision: 0.9251317935502008
Recall: 0.9074828045092908
F1-Score: 0.9158778068900697


In [18]:
print("---- Combined classifier metrics ----")
# Macro average: per-class scores on the validation split, averaged without class weights.
get_and_print_macro_metrics(val['class'], combined_predicted, "macro")

---- Combined classifier metrics ----
Precision: 0.9174532648883245
Recall: 0.9014810739407982
F1-Score: 0.9090632920214665


#### Calculate metrics for each class

In [19]:
def get_and_print_macro_metrics_by_class(true_labels, predictions, average, class_):
    """
    Print and return precision, recall and F1 restricted to one class.

    The scorers are called with ``labels=[class_]``, so the average is taken
    over that single label only and the result is that class's own score.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    predictions : array-like
        Predicted labels, aligned with ``true_labels``.
    average : str
        Averaging mode forwarded to the scikit-learn scorers.
    class_ : label
        The class to report on.

    Returns
    -------
    dict
        ``{"class", "precision", "recall", "f1"}``. Previously the function
        returned ``None``, so callers collecting its result got no data.
    """
    scorer_kwargs = {"average": average, "labels": [class_]}
    precision = precision_score(true_labels, predictions, **scorer_kwargs)
    recall = recall_score(true_labels, predictions, **scorer_kwargs)
    f1 = f1_score(true_labels, predictions, **scorer_kwargs)

    print(f"---- {class_} ----")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

    return {"class": class_, "precision": precision, "recall": recall, "f1": f1}

In [20]:
def metrics(class_, predictions=None):
    """
    Report validation metrics for one class.

    Parameters
    ----------
    class_ : label
        The class to report on.
    predictions : array-like, optional
        Predicted labels for ``val``. When omitted, the notebook-level name
        ``predicted`` is used, which is the original behaviour. Passing the
        predictions explicitly avoids depending on which cell last rebound
        ``predicted``.

    Returns
    -------
    Whatever ``get_and_print_macro_metrics_by_class`` returns, so that
    ``list(map(metrics, ...))`` collects that value per class.
    """
    if predictions is None:
        # Backward-compatible fallback to the notebook-level variable.
        predictions = predicted
    return get_and_print_macro_metrics_by_class(val['class'], predictions, "macro", class_)

In [21]:
print("---- Text based classifier metrics ----")
# `metrics` reads the notebook-level name `predicted`, so bind it before calling.
predicted = text_predicted
# One report per distinct class label; the list keeps what `metrics` returns for each.
text_classifier_metrics_by_class = [
    metrics(label) for label in train_dataset["class"].unique()
]

---- Text based classifier metrics ----
---- card ----
Precision: 0.9153094462540716
Recall: 0.934589800443459
F1-Score: 0.9248491497531541
---- others ----
Precision: 0.8784722222222222
Recall: 0.8939929328621908
F1-Score: 0.8861646234676007
---- transfer ----
Precision: 0.9220183486238532
Recall: 0.919908466819222
F1-Score: 0.9209621993127147
---- cash ----
Precision: 0.9558359621451105
Recall: 0.8964497041420119
F1-Score: 0.9251908396946565
---- security ----
Precision: 0.9540229885057471
Recall: 0.8924731182795699
F1-Score: 0.9222222222222222


In [22]:
print("---- Combined classifier metrics ----")
# `metrics` reads the notebook-level name `predicted`, so bind it before calling.
predicted = combined_predicted
# One report per distinct class label; the list keeps what `metrics` returns for each.
combined_classifier_metrics_by_class_combined = [
    metrics(label) for label in train_dataset["class"].unique()
]

---- Combined classifier metrics ----
---- card ----
Precision: 0.9142857142857143
Recall: 0.9223946784922394
F1-Score: 0.9183222958057397
---- others ----
Precision: 0.8697142857142857
Recall: 0.8963486454652533
F1-Score: 0.8828306264501159
---- transfer ----
Precision: 0.9220183486238532
Recall: 0.919908466819222
F1-Score: 0.9209621993127147
---- cash ----
Precision: 0.9490445859872612
Recall: 0.8816568047337278
F1-Score: 0.9141104294478528
---- security ----
Precision: 0.9322033898305084
Recall: 0.8870967741935484
F1-Score: 0.9090909090909092


# 2. Optional: Binary models per class (one vs. all)

## 2.1. Classifier

In [23]:
def binary_classifier(train, val, class_dict):
    """
    Fit a TF-IDF + random-forest model on remapped labels and predict on ``val``.

    Parameters
    ----------
    train, val : DataFrame
        Frames with a "clean_text" column and a "class" column. Neither frame
        is modified.
    class_dict : dict
        Mapping applied to the "class" column of both frames via ``Series.map``
        (labels missing from the mapping therefore become NaN).

    Returns
    -------
    tuple
        ``(remapped validation labels as a Series, predicted labels)``.
    """
    # Change the class map (Series.map returns new Series; the inputs stay untouched)
    train_labels = train["class"].map(class_dict)
    val_labels = val["class"].map(class_dict)

    # Every text is coerced to str before it reaches the vectorizer
    train_texts = (str(text) for text in train["clean_text"].values)
    val_texts = (str(text) for text in val["clean_text"].values)

    # Build the pipeline
    model = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

    # Train the classifier
    model.fit(train_texts, train_labels.values)

    # Predict labels
    return val_labels, model.predict(val_texts)

## 2.2. Metrics

In [24]:
def get_and_print_binary_metrics(train, val, class_dict):
    """
    Train the binary classifier for one label mapping, then print and return its scores.

    Parameters
    ----------
    train, val : DataFrame
        Training and validation frames, forwarded to ``binary_classifier``.
    class_dict : dict
        Label mapping forwarded to ``binary_classifier``.

    Returns
    -------
    dict
        ``{"precision", "recall", "f1"}`` computed with the scorers' default
        settings. Previously the function returned ``None``, so the scores
        could only be read from the printed output.
    """
    true_labels, predictions = binary_classifier(train, val, class_dict)

    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")

    return {"precision": precision, "recall": recall, "f1": f1}

In [25]:
def class_dic(class_):
    """
    Run the one-vs-all evaluation for ``class_``.

    Builds a mapping in which ``class_`` is 1 and the other known labels are 0,
    prints a header, and evaluates a binary classifier on the notebook-level
    ``train`` and ``val`` frames.

    Returns
    -------
    Whatever ``get_and_print_binary_metrics`` returns. The result was
    previously discarded, so ``list(map(class_dic, ...))`` could only ever
    contain ``None``.
    """
    print(f"---- {class_} ----")

    # Every known label starts as the negative class; the requested one is flipped to positive.
    class_dict = dict.fromkeys(("others", "card", "cash", "transfer", "security"), 0)
    class_dict[class_] = 1

    return get_and_print_binary_metrics(train, val, class_dict)

In [26]:
# Distinct class labels, in order of first appearance in `train_dataset`.
classes = train_dataset["class"].unique()
classes

array(['card', 'others', 'transfer', 'cash', 'security'], dtype=object)

In [27]:
print("---- Metrics ----")
# Train and report one one-vs-all model per class; the list keeps what `class_dic` returns for each.
binary_classifier_metrics = [class_dic(label) for label in classes]

---- Metrics ----
---- card ----
Precision: 0.9364161849710982
Recall: 0.8980044345898004
F1-Score: 0.9168081494057724
---- others ----
Precision: 0.951885565669701
Recall: 0.8621908127208481
F1-Score: 0.9048207663782448
---- transfer ----
Precision: 0.9493670886075949
Recall: 0.8581235697940504
F1-Score: 0.9014423076923077
---- cash ----
Precision: 0.9732441471571907
Recall: 0.8609467455621301
F1-Score: 0.913657770800628
---- security ----
Precision: 0.9813664596273292
Recall: 0.8494623655913979
F1-Score: 0.9106628242074929


Conclusions:
    - The experiment combining text and numeric columns showed that the numeric columns do not add much, so we will use just the text column.
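    - Based on the F1 metric, the single multiclass model was the best approach overall: it scored higher than the one-vs-all binary models for every class except `others` (0.886 vs. 0.905).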