![flow_diagram](https://drive.google.com/uc?export=view&id=1mIm6g1LXoH6c4YSI84xqHlk8QTia5KMS)


This Colab notebook demonstrates code-switch detection with a trained BiGRU-with-attention model that uses pre-trained, non-contextual sub-word embeddings (Skip-gram, 300 dimensions). The RNN model was trained and validated on the Hansard training and validation sets.

Flow diagram: STEP 1 is already complete; this notebook covers STEP 2.

In [2]:
import os
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
import numpy as np
import tensorflow as tf
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
from sklearn.metrics import f1_score
import string

from sklearn.metrics import (
    accuracy_score, 
    f1_score, 
    classification_report
)


In [4]:
# Locations of the saved Keras model (.h5) and the pickled tokenizer that the
# following cells load. Both are relative paths, so they are resolved against
# the notebook's current working directory.
# NOTE(review): the file names say "bilstm" while the introduction describes a
# BiGRU-with-attention model -- confirm these are the intended artefacts.
model_path = "bilstm-Maori-Eng-300SG.h5"
tokenizer_path = "tokenizerbilstm-Maori-Eng-300SG.pickle"

In [5]:
## Restore the trained network from `model_path`, then print its
## layer-by-layer architecture summary.
loaded_model = keras.models.load_model(model_path)
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 300)          25531200  
                                                                 
 dropout (Dropout)           (None, 250, 300)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 128)              186880    
 l)                                                              
                                                                 
 dense (Dense)               (None, 3)                 387       
                                                                 
Total params: 25,718,467
Trainable params: 187,267
Non-trainable params: 25,531,200
_________________________________________________________________


In [6]:
## Restore the fitted tokenizer that was pickled to `tokenizer_path`.
## NOTE(review): pickle.load can execute arbitrary code while unpickling --
## only point `tokenizer_path` at a file from a trusted source.
with open(tokenizer_path, mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)


In [7]:
# Read the RMT sample, discard the metadata columns that the rest of the
# notebook does not use, and give the four remaining columns short names.
rmt_sample_path = "/content/gdrive/Shareddrives/Māori Code-switching project 2020/code-switch-end-to-end-demo/rmt_sample.csv"
unused_columns = [
    'url', 'user.id', 'date', 'content_with_emojis', 'total_words',
    'percent_maori', 'conversation_id', 'in_reply_to_user_id', 'lang',
    'source_label', 'error', 'favourites', 'like_count', 'quote_count',
    'retweet_count', 'reply_count', 'year', 'media', 'outlinks',
]
df = pd.read_csv(rmt_sample_path).drop(columns=unused_columns)
df.columns = ['id','text','maori_words','number_of_words']
df.head()



Unnamed: 0,id,text,maori_words,number_of_words
0,77657422',any tuhoe kei roto i te whare?,"['tuhoe', 'kei', 'roto', 'te', 'whare']",5
1,553111032',Haere mai ki te L:Waitahananui,"['haere', 'mai', 'ki', 'te']",4
2,585594182',Kua hinga te totara nui o te wao tapu. Moe mai...,"['kua', 'hinga', 'te', 'totara', 'nui', 'o', '...",12
3,829531914',"He nui tangata e heke ana ki te Pō, he iti tan...","['he', 'nui', 'tangata', 'e', 'heke', 'ana', '...",18
4,863218149',"<user> oma ika, oma ika, oma oma oma","['oma', 'ika', 'oma', 'ika', 'oma', 'oma', 'oma']",7


In [8]:
def remove_punc(string):
    """Return ``string`` with every character listed in ``punc`` removed.

    The removal set contains the space character but not the comma, so
    commas are left in place while spaces are stripped.
    """
    punc = '''!()-[]{};:'" <>./?@#$%^&*_~'''
    kept_chars = [char for char in string if char not in punc]
    return "".join(kept_chars)
 
# Classify every whitespace-separated token of every tweet and keep the tokens
# the model assigns to class 1 (treated as "Māori" by the rest of this
# notebook), remembering which tweet each one came from.

# Sequence length the model expects; matches the maxlen used for padding.
max_sequence_length = 250

# Flatten the tweets into parallel lists of (tweet id, cleaned token) so the
# model can score all tokens in one batched call instead of one call per word.
token_tweet_ids = []
tokens = []
for _, row in df.iterrows():
    for raw_token in row['text'].split():
        token_tweet_ids.append(row['id'])
        tokens.append(remove_punc(raw_token))

wb = []   # tokens predicted as class 1
idx = []  # id of the tweet each predicted token belongs to

# predict() cannot be called on an empty batch, so skip it when there is
# nothing to classify.
if tokens:
    # Each token is encoded on its own (one "text" per token), exactly as in a
    # per-word call, then padded to the model's input length.
    padded = pad_sequences(tokenizer.texts_to_sequences(tokens),
                           maxlen=max_sequence_length)
    predicted_classes = np.argmax(loaded_model.predict(padded, verbose=0), axis=1)
    for tweet_id, token, predicted_class in zip(token_tweet_ids, tokens, predicted_classes):
        if int(predicted_class) == 1:
            wb.append(token)
            idx.append(tweet_id)

# Build the frame once, after the loop, so it exists even when `df` has no rows.
trial = pd.DataFrame({'words_pred': wb, 'id': idx})

# One row per tweet id, with its predicted words collected into a list.
pred = trial.groupby('id', as_index=False).agg(lambda x: x.tolist())

# NOTE(review): the inner join drops tweets for which no word was predicted as
# class 1, so neither their word counts nor their misses reach the totals
# below -- confirm this is intended; a left join would keep them.
result = pd.merge(df, pred, on="id", how="inner")
total_number_maori_words = result['number_of_words'].sum()


r = result.drop(['text'],axis=1)

# Both columns are turned into sets of lower-cased, comma-separated fragments;
# leftover brackets/quotes/spaces are expected to be stripped later with
# remove_punc. Splitting the stringified prediction list on ',' also discards
# any comma that is still attached to a predicted token.
# (No sort_values() here: the assignment realigns on the index, so sorting
# before assigning would have no effect.)
r['unique_maori_words'] = r['maori_words'].str.lower().str.split(',').apply(set)

r['unique_pred_words'] = r['words_pred'].astype(str).str.lower().str.split(',').apply(set)



In [11]:
# For each tweet, find the gold Māori words that are absent from the model's
# predicted words, then report the totals.
wb1 = []   # per-tweet list of gold words the model did not predict
idx1 = []  # matching tweet ids

for _, row in r.iterrows():
    # Strip the brackets/quotes/spaces left over from the stringified lists.
    gold_words = {remove_punc(word) for word in row['unique_maori_words']}
    predicted_words = {remove_punc(word) for word in row['unique_pred_words']}
    wb1.append(list(gold_words - predicted_words))
    idx1.append(row['id'])

# Build the frame once, after the loop, so it exists even when `r` has no rows.
t = pd.DataFrame({'difference': wb1, 'id': idx1})

# .str.len() gives the number of missed words in each per-tweet list.
t['length'] = t['difference'].str.len()

# Keep only the tweets with at least one missed word.
t = t[t.length > 0]

total_diff = t['length'].sum()

# Derive the reported percentage from the counts instead of hard-coding it.
# NOTE(review): misses are counted over unique words per tweet while the total
# comes from `number_of_words` -- confirm the two are counted the same way.
if total_number_maori_words:
    accuracy = (total_number_maori_words - total_diff) / total_number_maori_words
else:
    accuracy = float('nan')  # no words to evaluate; avoid dividing by zero

print("total number of Maori words in RMT sample:", total_number_maori_words)
print("Wrong label by models:", total_diff)
print(f"Accuracy of the model: {accuracy:.0%}")

total number of Maori words in RMT sample: 179
Wrong label by models: 16
Accuracy of the model: 91%
