# Language Classification with Naive Bayes in Python

In [1]:
import pickle as pkl
import string
from collections import defaultdict

import joblib
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
# from yellowbrick.classifier import ConfusionMatrix

<a id='task1'></a>
# Data Exploration and Visualization

In [2]:
def open_file(filename):
    """Read a UTF-8 text file and return its lines as a list.

    Each returned line keeps its trailing newline character, exactly as
    ``readlines`` delivers it.
    """
    with open(filename, 'r', encoding="utf8") as handle:
        return handle.readlines()

In [3]:
# Raw training lines per language code; each value is the list returned by open_file.
data_raw = {
    'sk': open_file('content/train_sentences.sk'),
    'cs': open_file('content/train_sentences.cs'),
    'en': open_file('content/train_sentences.en'),
}

In [4]:
# Peek at one raw English training line (index 50); raw lines still end with a newline.
data_raw['en'][50]

'The European Union is back on its feet.\n'

In [5]:
def show_statistics(data):
    """Print basic corpus statistics for every language in ``data``.

    Parameters
    ----------
    data : dict
        Maps a language code to a list of sentence strings.

    For each language this prints the number of sentences, the number of
    whitespace-separated words, the number of distinct words and a short
    preview made of the first seven space-separated chunks of the first
    sentence. Nothing is returned.
    """
    for language, sentences in data.items():

        word_list = ' '.join(sentences).split()

        number_of_sentences = len(sentences)
        number_of_words = len(word_list)
        number_of_unique_words = len(set(word_list))
        # Join the preview with spaces so the words stay readable; a language
        # without any sentence gets an empty preview instead of an IndexError.
        sample_extract = ' '.join(sentences[0].split(' ')[:7]) if sentences else ''

        print(f'Language: {language}')
        print('-----------------------')
        print(f'Number of sentences\t:\t {number_of_sentences}')
        print(f'Number of words\t\t:\t {number_of_words}')
        print(f'Number of unique words\t:\t {number_of_unique_words}')
        print(f'Sample extract\t\t:\t {sample_extract}...\n')

In [6]:
# Print the per-language statistics of the raw (unprocessed) training data.
show_statistics(data_raw)

Language: sk
-----------------------
Number of sentences	:	 100
Number of words		:	 2016
Number of unique words	:	 1322
Sample extract		:	 PándeGrandesPascualjasnevysvetlil,aká...

Language: cs
-----------------------
Number of sentences	:	 10
Number of words		:	 158
Number of unique words	:	 141
Sample extract		:	 Upozorňujeme,žejejímcílemješetřitpenězi...

Language: en
-----------------------
Number of sentences	:	 100
Number of words		:	 2381
Number of unique words	:	 1037
Sample extract		:	 Icanunderstandyourapproachalittle...



<a id='task2'></a>
# Data Cleaning and Preprocessing

In [7]:
def text_process(text):
    """Normalise one sentence for vectorisation.

    The text is lower-cased, hyphens become spaces, newlines become spaces,
    and every ASCII punctuation character and digit is removed.
    """
    # Hyphens are turned into spaces first so hyphenated words split in two
    # instead of being glued together when punctuation is stripped.
    lowered = text.lower().replace('-', ' ')
    removable = string.punctuation + string.digits
    table = str.maketrans('\n', ' ', removable)
    return lowered.translate(table)

In [8]:
# Same structure as data_raw, with every sentence passed through text_process.
data_preprocessed = {
    language: list(map(text_process, sentences))
    for language, sentences in data_raw.items()
}

In [9]:
# Show the statistics before and after preprocessing so the effect of
# lower-casing and punctuation/digit removal on the word counts is visible.
print('ORIGINAL STATISTICS')
show_statistics(data_raw)
print('PREPROCESSED STATISTICS :')
show_statistics(data_preprocessed)

ORIGINAL STATISTICS
Language: sk
-----------------------
Number of sentences	:	 100
Number of words		:	 2016
Number of unique words	:	 1322
Sample extract		:	 PándeGrandesPascualjasnevysvetlil,aká...

Language: cs
-----------------------
Number of sentences	:	 10
Number of words		:	 158
Number of unique words	:	 141
Sample extract		:	 Upozorňujeme,žejejímcílemješetřitpenězi...

Language: en
-----------------------
Number of sentences	:	 100
Number of words		:	 2381
Number of unique words	:	 1037
Sample extract		:	 Icanunderstandyourapproachalittle...

PREPROCESSED STATISTICS :
Language: sk
-----------------------
Number of sentences	:	 100
Number of words		:	 1996
Number of unique words	:	 1207
Sample extract		:	 pándegrandespascualjasnevysvetlilaká...

Language: cs
-----------------------
Number of sentences	:	 10
Number of words		:	 155
Number of unique words	:	 133
Sample extract		:	 upozorňujemežejejímcílemješetřitpenězi...

Language: en
-----------------------
Number of sentences	

<a id='task3'></a>
# The Naive Bayes Model

**Bayes' Theorem**

\begin{equation}
P(A | B)=\frac{P(B | A) \times P(A)}{P(B)}
\end{equation}

Now, let's translate this theory into our specific problem. In our case, where we want to categorise a sentence `my name is Sudarshan` into one of `sk`, `cs`, or `en`, the following are the probabilities we want to determine.

\begin{equation}
P(\text {sk} | \text {my name is Sudarshan})=\frac{P(\text {my name is Sudarshan} | \text {sk}) \times P(\text {sk})}{P(\text {my name is Sudarshan})}
\end{equation}

\begin{equation}
P(\text {cs} | \text {my name is Sudarshan})=\frac{P(\text {my name is Sudarshan} | \text {cs}) \times P(\text {cs})}{P(\text {my name is Sudarshan})}
\end{equation}

\begin{equation}
P(\text {en} | \text {my name is Sudarshan})=\frac{P(\text {my name is Sudarshan} | \text {en}) \times P(\text {en})}{P(\text {my name is Sudarshan})}
\end{equation}

## Unseen Data

Since we assume conditional independence across our features, our numerator term for any of the above equations can be broken into the following.

\begin{equation}
P(\text {my name is Sudarshan} | \text {en}) = P(\text {my} | \text {en}) \times P(\text {name} | \text {en}) \times P(\text {is} | \text {en}) \times P(\text {Sudarshan} | \text {en})
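\end{equation}

If even one word (for example `Sudarshan`) never occurs in a language's training sentences, its conditional probability is zero and the whole product collapses to zero. Additive smoothing — the `alpha` parameter of `MultinomialNB` — adds a small count to every word so that unseen words only lower the probability instead of eliminating the class.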

## Vectorizing Training Data

| Sentence | my | is | I | love | name | it | Sudarshan |
|----------|:--:|:--:|:-:|:----:|:----:|:--:|:---------:|
| my name is Sudarshan | 1 | 1 | 0 | 0 | 1 | 0 | 1 |
| I love it | 0 | 0 | 1 | 1 | 0 | 1 | 0 |

In [10]:
# Flatten the per-language dict into two parallel lists:
# sentences_train[i] is a sentence and y_train[i] is its language code.
sentences_train, y_train = [], []
for k, v in data_preprocessed.items():
    sentences_train.extend(v)
    y_train.extend([k] * len(v))
        

In [11]:
# Bag-of-words vectorizer with default settings.
vectorizer = CountVectorizer()

In [12]:
# Learn the vocabulary from the training sentences and encode them as a
# sentence-by-word count matrix; the trailing expression displays its summary.
x_train = vectorizer.fit_transform(sentences_train)
x_train

<210x2208 sparse matrix of type '<class 'numpy.int64'>'
	with 3867 stored elements in Compressed Sparse Row format>

## Initializing Model Parameters and Training

In [13]:
# Baseline model with scikit-learn's defaults (alpha=1.0, fit_prior=True).
# alpha is the additive smoothing constant: it keeps words that were not seen
# for a class during training (e.g. a name like 'Sudarshan') from getting
# zero probability.
naive_bayes = MultinomialNB()
naive_bayes.fit(x_train,y_train)

## Vectorizing Validation Data and Evaluating Model

In [14]:
# Validation lines per language code.
# NOTE(review): unlike the training files, these paths have no 'content/'
# prefix — confirm that the validation files really live in the working directory.
data_val = {
    'sk': open_file('val_sentences.sk'),
    'cs': open_file('val_sentences.cs'),
    'en': open_file('val_sentences.en'),
}

# Apply the same preprocessing that was used for the training sentences.
data_val_preprocessed = {
    language: [text_process(line) for line in lines]
    for language, lines in data_val.items()
}

In [15]:
# Flatten the validation dict into parallel lists of sentences and language codes.
sentences_val, y_val = [], []
for k, v in data_val_preprocessed.items():
    sentences_val.extend(v)
    y_val.extend([k] * len(v))

In [16]:
# Encode the validation sentences with the already-fitted vectorizer
# (transform only, so the vocabulary stays the one learned from training data).
x_val = vectorizer.transform(sentences_val)

In [17]:
# Predict a language code for every validation sentence and display the result.
predictions = naive_bayes.predict(x_val)
predictions

array(['sk', 'sk', 'sk', ..., 'en', 'en', 'en'], dtype='<U2')

In [18]:
# confusion_matrix comes from the notebook's import cell.
# Rows are the true languages and columns the predicted ones, both in the
# order given by `labels` (sk, cs, en).
print(confusion_matrix(y_val, predictions, labels=['sk','cs','en']))

[[4886    0  114]
 [4077  534  389]
 [   0    0 5000]]


In [19]:
# F1 score of the baseline model, averaged over the classes with average='weighted'.
f1_score(y_val, predictions, average='weighted')

0.6149824401040264

<a id='task4'></a>
# Simple Adjustments and Highlighting Model Shortcomings

In [57]:
# alpha is the additive smoothing constant; it handles words that do not occur
# in a language's training vocabulary. A much smaller value than the default is tried here.
# fit_prior=False disables the class prior learned from the data: 'cs' has far
# fewer training sentences than 'sk' and 'en', so a learned prior would bias
# the predictions towards those two languages.
naive_bayes = MultinomialNB(alpha= 0.0001, fit_prior = False) 
naive_bayes.fit(x_train,y_train)
predictions = naive_bayes.predict(x_val)
# Rows: true language, columns: predicted language, in the order sk, cs, en.
confusion_matrix(y_val, predictions, labels=['sk','cs','en'])


array([[4847,  135,   18],
       [3688, 1214,   98],
       [  18,   55, 4927]], dtype=int64)

In [58]:
# Weighted F1 score after adjusting alpha and switching off the learned prior.
f1_score(y_val, predictions, average='weighted')

0.6918616511767245

<a id='task5'></a>
# Using Subwords to Shift Perspective

**Dummy Dataset**

playing ; eating ; play ; reads ; tea

**Step 1**

Break each word into characters

playing > p l a y i n g


**Step 2**

Find common character sequences

ea, ing, play

**Step 3**

Convert dataset using these subwords into

play ing ; ea t ing ; play ; r ea d s ; t ea

## get_vocab Function :
1. Takes `sentences_train` as input and splits each sentence into words.
2. Rewrites each word as its characters separated by spaces (e.g. `play` becomes `p l a y`) and stores these in a dict together with how often each word occurs.


## get_stats Function :
1. Takes the vocab dictionary as input.
2. Splits each space-separated vocabulary entry into its symbols.
3. Builds a dictionary whose keys are pairs of adjacent symbols and whose values are the summed frequencies of the entries containing that pair.
4. Returns this pair-frequency dictionary.


In [59]:
# taken from https://arxiv.org/abs/1508.07909

import re, collections
def get_stats(vocab):
    """Count adjacent symbol pairs over a space-separated vocabulary.

    ``vocab`` maps entries such as ``'l o w'`` to their frequency. The result
    is a ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples whose
    value is the summed frequency of all entries containing that adjacent pair.
    """
    pair_counts = collections.defaultdict(int)
    for entry, count in vocab.items():
        pieces = entry.split()
        # Walk over every neighbouring (left, right) pair of symbols.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Merge every standalone occurrence of ``pair`` in the vocabulary keys.

    ``pair`` is a 2-tuple of symbols; wherever the two symbols appear next to
    each other as whole space-delimited tokens, they are fused into one
    symbol. Returns a new dict with the rewritten keys and unchanged counts.
    """
    fused = ''.join(pair)
    # The look-arounds make sure only whole symbols match, never a fragment
    # of a longer symbol.
    pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)')
    return {pattern.sub(fused, entry): count for entry, count in v_in.items()}

In [60]:
def get_vocab(data):
    """Build the initial character-level vocabulary.

    ``data`` is an iterable of sentences. Every whitespace-separated word is
    rewritten as its characters joined by spaces (``'low'`` -> ``'l o w'``)
    and counted. Returns a ``defaultdict(int)`` mapping those strings to
    their number of occurrences.
    """
    vocab = defaultdict(int)
    for sentence in data:
        for token in sentence.split():
            vocab[' '.join(token)] += 1
    return vocab

In [61]:
# Initial character-level vocabulary built from the preprocessed training sentences.
vocab = get_vocab(sentences_train)

In [62]:
# also taken from original paper
# Run up to 100 merge steps: each step fuses the currently most frequent
# adjacent symbol pair everywhere in the vocabulary.
for i in range(100):
    pairs = get_stats(vocab)
    if not pairs:
        # Every entry is already a single symbol, so nothing is left to merge;
        # calling max() on the empty mapping would raise ValueError.
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
     

In [63]:
# Total frequency of every learned subword that is at least two characters long.
merges = defaultdict(int)
for k, v in vocab.items():
    for subword in filter(lambda piece: len(piece) >= 2, k.split()):
        merges[subword] += v

In [64]:
# Subwords ordered from most to least frequent.
merge_ordered = sorted(merges, key=merges.get, reverse=True)

In [65]:
# Persist the ordered subword list; the context manager makes sure the file
# is flushed and closed once the dump has finished.
with open('merge_ordered.pkl', 'wb') as merges_file:
    pkl.dump(merge_ordered, merges_file)

In [66]:
def split_into_subwords(text, merges=None):
    """Rewrite ``text`` as a space-separated string of known subwords.

    Parameters
    ----------
    text : str
        Sentence to convert; it is split on whitespace into words.
    merges : sequence of str, optional
        Subwords to look for, in priority order. When omitted, the list is
        read from ``merge_ordered.pkl`` in the working directory.

    Returns
    -------
    str
        The subwords found, joined by single spaces. Characters that are not
        part of any listed subword are dropped.
    """
    if merges is None:
        # NOTE: pickle can execute arbitrary code on load; only read a
        # merge_ordered.pkl file that this notebook wrote itself.
        with open('merge_ordered.pkl', 'rb') as merges_file:
            merges = pkl.load(merges_file)
    subwords = []
    for word in text.split():
        for subword in merges:
            subword_count = word.count(subword)
            if subword_count > 0:
                # Blank out the matched part so later subwords cannot match
                # across it or reuse the same characters.
                word = word.replace(subword, ' ')
                subwords.extend([subword] * subword_count)
    return ' '.join(subwords)

In [67]:
# Quick sanity check on a raw sentence. The input is not passed through
# text_process here, so it keeps its capital letters, while the subwords were
# learned from lower-cased training text.
split_into_subwords('Hello My name is Sudarshan')

'lo na me is an ar'

In [68]:
# Per-language training sentences rewritten as sequences of subwords.
data_preprocessed_subwords = {
    language: list(map(split_into_subwords, sentences))
    for language, sentences in data_preprocessed.items()
}

In [69]:
# Statistics of the subword representation; here "words" are subword tokens.
show_statistics(data_preprocessed_subwords)

Language: sk
-----------------------
Number of sentences	:	 100
Number of words		:	 3431
Number of unique words	:	 75
Sample extract		:	 deandealasneas...

Language: cs
-----------------------
Number of sentences	:	 10
Number of words		:	 239
Number of unique words	:	 59
Sample extract		:	 pojemeoržejele...

Language: en
-----------------------
Number of sentences	:	 100
Number of words		:	 3863
Number of unique words	:	 75
Sample extract		:	 anstanerouroch...



In [70]:
# Subword version of every training sentence, in the same order as y_train.
data_train_subwords = [split_into_subwords(sentence) for sentence in sentences_train]

In [71]:
# Subword version of every validation sentence, in the same order as y_val.
data_val_subwords = [split_into_subwords(sentence) for sentence in sentences_val]


In [72]:
# Fresh vectorizer for the subword representation; this rebinds `vectorizer`,
# replacing the word-level one fitted earlier.
vectorizer = CountVectorizer()

In [73]:
# Fit the vocabulary on the training subwords only, then encode both splits.
# x_train and x_val are rebound and now hold subword counts.
x_train = vectorizer.fit_transform(data_train_subwords)
x_val = vectorizer.transform(data_val_subwords)

In [74]:
# Subword model: smoothing constant alpha=1.0 and no class prior learned from the data.
naive_bayes = MultinomialNB(alpha= 1.0, fit_prior = False)

In [75]:
# Train on the subword counts and predict the validation set.
naive_bayes.fit(x_train,y_train)
predictions = naive_bayes.predict(x_val)

In [76]:
# Rows: true language, columns: predicted language, in the order sk, cs, en.
confusion_matrix(y_val,predictions,labels=['sk','cs','en'])

array([[4684,  303,   13],
       [1799, 3134,   67],
       [  14,   61, 4925]], dtype=int64)

In [77]:
# Weighted F1 score of the subword model on the validation set.
f1_score(y_val ,predictions, average='weighted')

0.8456381060126386

In [78]:
joblib.dump(naive_bayes, 'final_model.joblib')
joblib.dump(vectorizer, 'final_model.joblib')

['final_model.joblib']

### EVALUATING OUR LANGUAGE CLASSIFICATION MODEL

In [88]:
model = joblib.load('final_model.joblib')
vectorizer = joblib.load('final_model.joblib')

In [99]:
text = "czec a slovenština jsou velmi podobné jazyky." # roughly: "Czech and Slovak are very similar languages."
# Apply the same pipeline as for training: clean, split into subwords, vectorize.
text = text_process(text)
text = [split_into_subwords(text)]
text_vectorized = vectorizer.transform(text)

# predict() returns one label per input row; take the single result.
predictions = naive_bayes.predict(text_vectorized)[0]
predictions

'cs'

In [100]:
text = "Ahoj, pekný deň!" # roughly: "Hello, have a great day!"
# Same pipeline as for training: clean, split into subwords, vectorize.
text = text_process(text)
text = [split_into_subwords(text)]
text_vectorized = vectorizer.transform(text)

# predict() returns one label per input row; take the single result.
naive_bayes.predict(text_vectorized)[0]

'cs'

In [101]:
# English example, run through the same pipeline: clean, split into subwords, vectorize.
text = "Hello mate! Glad you reached till the end!" 
text = text_process(text)
text = [split_into_subwords(text)]
text_vectorized = vectorizer.transform(text)

# predict() returns one label per input row; take the single result.
naive_bayes.predict(text_vectorized)[0]

'en'