# ULMFiT test

In [15]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

In [16]:
data = pd.read_csv('intents_en.csv')
data['text'] = data['text_en']
data.drop(['text_en', 'marker'],axis=1, inplace=True)
print(data.head())

                         label  \
0  Как заблокировать сим-карту   
1  Как заблокировать сим-карту   
2  Как заблокировать сим-карту   
3  Как заблокировать сим-карту   
4  Как заблокировать сим-карту   

                                                text  
0                                Block this sim card  
1  Hello, I want to block this sim card, how to d...  
2                            Can I block a sim card?  
3                Hello. I want to block the sim card  
4                            how to block a sim card  


In [22]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kolsha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
print(data.head())

                         label  \
0  Как заблокировать сим-карту   
1  Как заблокировать сим-карту   
2  Как заблокировать сим-карту   
3  Как заблокировать сим-карту   
4  Как заблокировать сим-карту   

                                                text  
0                                Block this sim card  
1  Hello, I want to block this sim card, how to d...  
2                            Can I block a sim card?  
3                Hello. I want to block the sim card  
4                            how to block a sim card  


In [24]:
df = data.copy()
df['text'] = df['text'].str.replace("[^a-zA-Zа-яА-Я]", " ")

le = preprocessing.LabelEncoder()
le.fit(df['label'].values)
df['label'] = le.transform(df['label'].values)

In [25]:
print(df.head())

   label                                               text
0      1                                Block this sim card
1      1  Hello  I want to block this sim card  how to d...
2      1                            Can I block a sim card 
3      1                Hello  I want to block the sim card
4      1                            how to block a sim card


In [26]:
print(stop_words)
# tokenization 
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words 
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization 
detokenized_doc = [] 
for i in range(len(df)): 
    t = ' '.join(tokenized_doc[i]) 
    detokenized_doc.append(t) 

df['text'] = detokenized_doc

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:
print(df.head())

   label                         text
0      1               Block sim card
1      1  Hello I want block sim card
2      1         Can I block sim card
3      1  Hello I want block sim card
4      1               block sim card


In [43]:
from sklearn.model_selection import train_test_split

# split data into training and validation set
df_trn, df_val = train_test_split(df, stratify = df['label'], test_size = 0.1, random_state = 42)

In [44]:
df_trn.shape, df_val.shape

((789, 2), (88, 2))

In [45]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [46]:
learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.9)

In [47]:
# train the learner object with learning rate = 1e-2
learn.fit_one_cycle(10, 1e-3)

epoch,train_loss,valid_loss,accuracy
1,8.525010,7.734663,0.076116
2,8.511082,7.684545,0.082366
3,8.529038,7.641004,0.077232
4,8.496454,7.602447,0.079911
5,8.473711,7.516505,0.076563
6,8.445696,7.511761,0.079911
7,8.412202,7.422843,0.083036
8,8.402211,7.456094,0.079018
9,8.382835,7.415691,0.082366
10,8.367177,7.378826,0.083259


In [48]:
learn.save_encoder('ft_enc')

In [49]:
learn = text_classifier_learner(data_clas, drop_mult=0.9)
learn.load_encoder('ft_enc')

In [50]:
learn.fit_one_cycle(10, 1e-3)

epoch,train_loss,valid_loss,accuracy
1,2.372224,2.277485,0.090909
2,2.274985,1.980194,0.579545
3,2.121575,1.615075,0.659091
4,1.905067,1.369346,0.693182
5,1.732610,1.174521,0.704545
6,1.590267,1.069598,0.761364
7,1.488598,0.976185,0.738636
8,1.491502,0.954593,0.704545
9,1.412291,0.942735,0.750000
10,1.387349,0.950298,0.738636


In [51]:
# get predictions
preds, targets = learn.get_preds()

# predictions = np.argmax(preds, axis = 1)
# pd.crosstab(predictions, targets)

In [52]:
print(np.array(targets))
print(df.loc[1]['text'])

[4 7 0 0 ... 1 0 7 2]
Hello I want block sim card


In [73]:
text = 'I do not want to talk to the bot, give a living person'
pred= learn.predict(text)
print(pred)
print(le.inverse_transform([int(pred[0])]))


(Category 1, tensor(1), tensor([0.1164, 0.1751, 0.0683, 0.0476, 0.0448, 0.1212, 0.1367, 0.1213, 0.1220,
        0.0466]))
['Как заблокировать сим-карту']


In [41]:
cv = StratifiedShuffleSplit(n_splits=10, random_state=42) #, shuffle=True

In [42]:
f1_mac_best = 0
f1_mic_best = 0
best_ulm = None
f1_mac_all = []
for (train, test) in cv.split(df['text'], df['label']):
    print(f1_mac_all)
    # Language model data
    data_lm = TextLMDataBunch.from_df(train_df = df.loc[train], valid_df = df.loc[test], path = "")

    # Classifier model data
    data_clas = TextClasDataBunch.from_df(path = "",
                                          train_df = df.loc[train],
                                          valid_df = df.loc[test],
                                          vocab=data_lm.train_ds.vocab, bs=32)
    
    learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.9)
    
    
    # train the learner object with learning rate = 1e-2
    learn.fit_one_cycle(1, 1e-3)
    
    learn.save_encoder('ft_enc')
    
    learn = text_classifier_learner(data_clas, drop_mult=0.9)
    learn.load_encoder('ft_enc')
    
    learn.fit_one_cycle(1, 1e-3)

    
    y_pred = np.array(learn.get_preds()[1])
    
    f1_mac = f1_score(df.loc[test]['label'], y_pred, average='macro')
    f1_mic = f1_score(df.loc[test]['label'], y_pred, average='micro')
    
    f1_mac_all.append(f1_mac)
    print("F1 Macro: {}".format(f1_mac) )
    print("F1 Micro: {}".format(f1_mic) )
    
#     break
    
    if f1_mac > f1_mac_best:
#         best_ulm = copy.copy(learn)
        f1_mac_best = f1_mac
        f1_mic_best = f1_mic
    
print("BEST F1 Macro: {}".format(f1_mac_best) )
print("BEST F1 Micro: {}".format(f1_mic_best) )

f1_mac_all = np.array(f1_mac_all)
f1_mac_avg = f1_mac_all.mean()
print("AVG  F1 Macro: {}".format(f1_mac_avg) )
print(f1_mac_all.std())

F1 Macro: 0.1194732411837675
F1 Micro: 0.1590909090909091
BEST F1 Macro: 0.18349219007113743
BEST F1 Micro: 0.2159090909090909
AVG  F1 Macro: 0.11472399432925748
0.03156019808651784


## 