In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
import matplotlib.pyplot as plt
import spacy
import string
from multiprocessing import Pool, cpu_count
from string import punctuation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
import nltk

#from gensim import corpora, models
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Load Train Data

In [3]:
# Read the tab-separated training file.
# Only genuinely empty fields are treated as missing: pandas' default NA tokens
# ("NA", "null", "nan", "n/a", ...) are ordinary chat messages in this data and
# would otherwise be parsed as NaN and then blanked out by the fillna below.
path = "bun-data//train.txt"
data = (
    pd.read_csv(path, sep='\t', lineterminator='\r',
                keep_default_na=False, na_values=[''])
    .drop("id", axis=1)      # the row id carries no signal for the model
    .dropna(how="all")       # discard rows where every remaining field is missing
    .fillna(" ")             # missing turns become a single space
)
data.head()

Unnamed: 0,turn1,turn2,turn3,label
0,Don't worry I'm girl,hmm how do I know if you are,What's ur name?,others
1,When did I?,saw many times i think -_-,No. I never saw you,angry
2,By,by Google Chrome,Where you live,others
3,U r ridiculous,I might be ridiculous but I am telling the truth.,U little disgusting whore,angry
4,Just for time pass,wt do u do 4 a living then,Maybe,others


# Preprocess

## Process Emoji

In [4]:
import en_core_web_sm
import re
import spacy
from spacymoji import Emoji

In [5]:
# Load the `en_core_web_sm` spaCy model and register spacymoji's Emoji
# component at the front of the pipeline; merge_emoji reads the `has_emoji`
# and `emoji` extension attributes from the documents `nlp` produces.
nlp = spacy.load('en_core_web_sm')
emoji = Emoji(nlp)
nlp.add_pipe(emoji, first=True)

In [6]:
def multi_scrub_text(reviews):
    '''
    Function to merge text and emoji - utilizes multiprocessing for merge emoji
    INPUT:
        reviews: array-like, pandas DataFrame column containing review texts
    OUTPUT:
        lemmatized: list with one merged, lower-cased text per input item,
                    in the same order as `reviews`
    '''
    # Leave one core free, but never request zero workers: on a single-core
    # machine cpu_count() - 1 is 0 and Pool(processes=0) raises ValueError.
    cpus = max(1, cpu_count() - 1)
    pool = Pool(processes=cpus)
    try:
        lemmatized = pool.map(merge_emoji, reviews)
    finally:
        # Always release the worker processes, even if a worker raised.
        pool.close()
        pool.join()
    return lemmatized


def merge_emoji(text):
    '''
    Function to merge emoji with text
    INPUT:
        text: string, text of review (non-strings are coerced with str())

    OUTPUT:
        lower-cased text; when the text contains emoji, characters in the
        range U+10000-U+10FFFF are removed and the distinct emoji
        descriptions are appended after a space
    '''
    # A non-string cell used to make re.sub raise inside a bare `except`, which
    # only printed the value and left `sen` undefined for the lines below.
    if not isinstance(text, str):
        text = str(text)

    doc = nlp(text)
    if not doc._.has_emoji:
        return doc.text.lower()

    # Only astral-plane characters are stripped; anything else stays in place.
    stripped = re.sub(r"[\U00010000-\U0010ffff]", "", text).strip()

    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # set() made the word order depend on per-process string hashing, so the
    # same input could yield different text in different pool workers.
    descriptions = list(dict.fromkeys(item[2] for item in doc._.emoji))

    # The merged string is returned directly: re-parsing it with nlp() only to
    # read `.text` back gives the same string at the cost of a second parse.
    merged = stripped + " " + " ".join(descriptions)
    return merged.lower()
    

In [7]:
# Emoji-merge each of the three conversation turns into a lem_<turn> column.
for turn_col in ('turn1', 'turn2', 'turn3'):
    data['lem_' + turn_col] = multi_scrub_text(data[turn_col])

# The combined 'turn' text is built from the first and third turns only.
data['turn'] = data['lem_turn1'] + " " + data['lem_turn3']

In [8]:
def clean_text(x):
    """Normalise punctuation in ``x`` (coerced with ``str``) in one pass.

    ``/``, ``-`` and ``'`` become a space, ``&`` is padded with a space on
    each side, and every other listed punctuation mark (including the curly
    quotes) is deleted. All remaining characters are left untouched.
    """
    to_space = "/-'"
    to_pad = '&'
    to_drop = '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'

    table = {ord(ch): None for ch in to_drop}
    # Applied after the deletions so the space mapping wins for the characters
    # present in both lists (they are replaced before anything is deleted).
    table.update({ord(ch): ' ' for ch in to_space})
    table.update({ord(ch): f' {ch} ' for ch in to_pad})

    return str(x).translate(table)

## Process punctuation & misspelled words

In [9]:
from gensim.models import KeyedVectors

# Load the binary GoogleNews word2vec file, capped at 500000 vectors via
# `limit`; check_coverage below looks words up in this object.
embeddings_index = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True,limit=500000)

In [10]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often each word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: show a tqdm progress bar when True
    :return: dictionary of words and their count
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [11]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of `vocab` has a vector in `embeddings_index`.

    :param vocab: dictionary mapping each word to its occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]`` that
        raises KeyError for unknown words
    :return: list of (word, count) pairs for the out-of-vocabulary words,
        ordered from most to least frequent
    """
    covered_words = 0   # distinct words that have a vector
    covered_count = 0   # occurrences of those words
    oov = {}
    oov_count = 0       # occurrences of words without a vector

    for word in tqdm(vocab):
        try:
            # Only the lookup result matters; the vector itself is not kept,
            # which avoids holding a second copy of every matched embedding.
            embeddings_index[word]
        except KeyError:
            # Catch only "word not found" so genuine errors are not hidden.
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            covered_words += 1
            covered_count += vocab[word]

    # An empty vocabulary reports 0% instead of raising ZeroDivisionError.
    total_count = covered_count + oov_count
    vocab_share = covered_words / len(vocab) if vocab else 0.0
    text_share = covered_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [12]:
import re

def clean_numbers(x):
    """Mask runs of digits in ``x`` with ``#`` placeholders.

    Runs of five or more digits collapse to ``#####``; runs of four, three
    and two digits become the same number of ``#``. Single digits are kept.
    """
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    # Longest runs are handled first so shorter patterns never split them.
    for pattern, mask in masks:
        x = re.sub(pattern, mask, x)
    return x

In [13]:
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re


# Misspelling / variant -> replacement table consumed by replace_typical_misspell.
# NOTE(review): the '&apos;' keys only match HTML-escaped text; the sample rows
# shown for this data use plain apostrophes - confirm they are still needed.
mispell_dict = {'colour':'color',
                'centre':'center',
                'didnt':'did not',
                'don&apos;t':'do not',      # was 'does not', which is the expansion of "doesn't"
                "can&apos;t":"can not",
                "i&apos;m":"I am",
                'knw':"know",
                'yess':"yes",
                'texte':"text",
                'humour':"humor",
                "frnd":"friend",
                "haa":"ha",
                "darle":"darling",
                "intreste":"interesting",   # was 'intresting', itself a misspelling
                "sry":"sorry",
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium',
                "plzz":"plz",
                "thanku":"thx",
                "okkk":"ok",
                "iwant":"i want",
                'siri':"Siri",
                "doesnt":"does not",
                'ohk':"ok",
                'okk':"ok",
                }
# Module-level lookup table and compiled pattern used by replace_typical_misspell.
mispellings, mispellings_re = _get_mispell(mispell_dict)


In [14]:
def replace_typical_misspell(text):
    """Return ``text`` with every match of ``mispellings_re`` replaced by the
    corresponding value from ``mispellings``."""
    return mispellings_re.sub(lambda found: mispellings[found.group(0)], text)

In [15]:
## Process the train text
# Misspellings are replaced once on the raw text and once more after the
# punctuation and number clean-up.
for step in (replace_typical_misspell, clean_text, clean_numbers, replace_typical_misspell):
    data['turn'] = data['turn'].progress_apply(step)

# Whitespace-tokenise every cleaned text.
sentences = data['turn'].progress_apply(lambda text: text.split())

# Tokens dropped before the embedding coverage check.
to_remove = ['a', 'to', 'of', 'and', '’']

100%|██████████| 30160/30160 [00:00<00:00, 260922.55it/s]
100%|██████████| 30160/30160 [00:00<00:00, 152017.03it/s]
100%|██████████| 30160/30160 [00:00<00:00, 135982.98it/s]
100%|██████████| 30160/30160 [00:00<00:00, 248762.01it/s]
100%|██████████| 30160/30160 [00:00<00:00, 194447.58it/s]


In [16]:
## Check result
# Drop the tokens listed in to_remove, then count the vocabulary and report
# how much of it has a word2vec vector.
sentences = [
    [token for token in tokens if token not in to_remove]
    for tokens in tqdm(sentences)
]
vocab = build_vocab(sentences)
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 30160/30160 [00:00<00:00, 676211.34it/s]
100%|██████████| 30160/30160 [00:00<00:00, 680730.82it/s]
100%|██████████| 11556/11556 [00:00<00:00, 459329.59it/s]

Found embeddings for 59.88% of vocab
Found embeddings for  97.29% of all text





In [17]:
## Save processed data
# Write the full frame (raw turns, lem_* columns, 'turn' and label) with a
# header row and without the index column.
data.to_csv("ProcessedData.csv",header=True,index=False)

### Build Embedding Matrix

In [18]:
# Reload the processed frame. keep_default_na=False makes the round trip
# faithful for the text columns: without it, saved strings such as "NA",
# "null" or "" come back as NaN floats instead of text.
data = pd.read_csv("ProcessedData.csv", keep_default_na=False)

In [37]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [20]:
# Length every index sequence is padded to, and the tokenizer's word cap.
MAX_SEQUENCE_LENGTH = 70
MAX_NUM_WORDS = 95000
# Fit the word -> integer index mapping on the cleaned training text.
tokenizer = Tokenizer(MAX_NUM_WORDS)
tokenizer.fit_on_texts(data['turn'])

In [21]:
# Convert each training text to a list of word indices, then pad the lists
# to MAX_SEQUENCE_LENGTH columns.
texts_index = tokenizer.texts_to_sequences(data['turn'])
texts_index_pad = pad_sequences(texts_index , maxlen=MAX_SEQUENCE_LENGTH)

In [22]:
# Inspect the padded index matrix (the zeros are padding).
texts_index_pad

array([[  0,   0,   0, ...,  29,  60, 123],
       [  0,   0,   0, ..., 160, 612,   2],
       [  0,   0,   0, ...,  75,   2, 265],
       ...,
       [  0,   0,   0, ..., 179,  21, 782],
       [  0,   0,   0, ...,   2, 486,  91],
       [  0,   0,   0, ..., 193,   9,   3]], dtype=int32)

In [1]:
from gensim.models import KeyedVectors



In [2]:
# Path of the binary GoogleNews word2vec file and the number of vectors to read.
# NOTE(review): this loads the same file with the same limit as the earlier
# embeddings cell, and the cell numbering restarts here - presumably left over
# from a separate kernel session; confirm whether this cell is still needed.
GOOGLENEWS_FILE_GLOVE = 'GoogleNews-vectors-negative300.bin'
EMBEDDING_LIMIT = 500000
embeddings_index = KeyedVectors.load_word2vec_format(GOOGLENEWS_FILE_GLOVE, binary=True,limit=EMBEDDING_LIMIT)

In [23]:
# Configuration read by load_and_generate_matrix_embedding and the model cell.
GOOGLENEWS_FILE_GLOVE = 'GoogleNews-vectors-negative300.bin'  # binary word2vec file
MAX_NUM_WORDS = 95000      # same cap as the tokenizer cell
EMBEDDING_DIM = 300        # width of each embedding-matrix row
EMBEDDING_LIMIT = 500000   # number of vectors read from the file

from gensim.models import KeyedVectors
def load_and_generate_matrix_embedding(path, word_index):
    """
    Build the embedding matrix for a tokenizer vocabulary.

    :param path: path of the binary word2vec file to load
    :param word_index: dictionary mapping each word to its integer index
    :return: array of shape (min(MAX_NUM_WORDS, len(word_index)) + 1,
        EMBEDDING_DIM); row ``i`` holds the vector of the word with index
        ``i``, or a random normal vector when the word has no pretrained
        vector. Rows no word maps to (such as row 0) stay all-zero.
    """
    max_num_words = min(MAX_NUM_WORDS, len(word_index))+1
    matrix_embedding = np.zeros((max_num_words, EMBEDDING_DIM))

    # Load from the `path` argument; previously the parameter was ignored and
    # the module-level file name was always used.
    embeddings_index = KeyedVectors.load_word2vec_format(path, binary=True, limit=EMBEDDING_LIMIT)

    for word, i in word_index.items():
        if i >= max_num_words:
            continue
        # `in` and `[...]` are supported by KeyedVectors in both gensim 3 and
        # gensim 4, unlike the `.vocab` / `.word_vec` accessors.
        if word in embeddings_index:
            matrix_embedding[i] = embeddings_index[word]
        else:
            matrix_embedding[i] = np.random.randn(EMBEDDING_DIM)

    return matrix_embedding


In [24]:
# Build the embedding matrix for the fitted tokenizer's vocabulary.
embedding_matrix = load_and_generate_matrix_embedding(GOOGLENEWS_FILE_GLOVE, tokenizer.word_index)
# Same row-count formula as inside the function; the Embedding layer below
# uses it as its input dimension.
max_num_words = min(MAX_NUM_WORDS, len(tokenizer.word_index))+1

## Model

In [25]:
from keras.models import Sequential
from keras.layers import *
from keras.utils.np_utils import to_categorical
from keras.initializers import Constant
from keras_self_attention  import SeqSelfAttention

In [26]:
# Pretrained-embedding -> stacked BiLSTM -> self-attention -> softmax classifier.
model = Sequential()
# Embedding rows start from the precomputed matrix and are fine-tuned.
model.add(Embedding(max_num_words,
                    EMBEDDING_DIM,
                    embeddings_initializer=Constant(embedding_matrix),
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=True))
model.add(SpatialDropout1D(0.2))
# Both recurrent layers return the full sequence so attention sees every step.
model.add(Bidirectional(LSTM(128, return_sequences=True,dropout=0.2,recurrent_dropout=0.2)))
model.add(Bidirectional(LSTM(64, return_sequences=True,recurrent_dropout=0.2)))
model.add(Dropout(0.25))
model.add(SeqSelfAttention(attention_activation='relu'))
model.add(Flatten())
# One output unit per label column produced by the one-hot encoding.
model.add(Dense(units=4, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 70, 300)           3460800   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 70, 300)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 70, 256)           439296    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 70, 128)           164352    
_________________________________________________________________
dropout_1 (Dropout)          (None, 70, 128)           0         
_________________________________________________________________
seq_self_attention_1 (SeqSel (None, 70, 128)           8257      
_________________________________________________________________
flatten_1 (Flatten)          (None, 8960)             

## Train Model

In [27]:
# One-hot encode the labels; columns follow pd.get_dummies' column order.
y = pd.get_dummies(data['label']).values

# Fix the seed so the 80/20 split is identical on every run of the notebook.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    texts_index_pad, y, test_size=0.2, random_state=RANDOM_STATE
)

In [28]:
# Train for 5 epochs in mini-batches of 128; validation_split=0.2 holds out
# 20% of X_train as validation data.
batch_size = 128
history = model.fit(X_train, y_train, epochs=5, batch_size=batch_size, verbose=1, validation_split=0.2)


Train on 19302 samples, validate on 4826 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Test Data

In [29]:
# Read the tab-separated test file.
# Only genuinely empty fields are treated as missing: pandas' default NA tokens
# ("NA", "null", "nan", "n/a", ...) are ordinary chat messages in this data and
# would otherwise be parsed as NaN and then blanked out by the fillna below.
testpath = "bun-data//test.txt"
testdata = (
    pd.read_csv(testpath, sep='\t', lineterminator='\r',
                keep_default_na=False, na_values=[''])
    .drop("id", axis=1)      # the row id carries no signal for the model
    .dropna(how="all")       # discard rows where every remaining field is missing
    .fillna(" ")             # missing turns become a single space
)
testdata.head()

Unnamed: 0,turn1,turn2,turn3,label
0,Hmm,What does your bio mean?,I don’t have any bio,others
1,What you like,very little things,Ok,others
2,Yes,How so?,I want to fuck babu,others
3,what did you guess,what what,fuck,others
4,We ?,of course we will!,What gender movies you like??,others


In [30]:
# Emoji-merge each test turn, mirroring the training preprocessing.
testdata['lem_turn1'] = multi_scrub_text(testdata['turn1'])
testdata['lem_turn2'] = multi_scrub_text(testdata['turn2'])
testdata['lem_turn3'] = multi_scrub_text(testdata['turn3'])
# Join with a single space, as the training 'turn' column is built. The old
# ". " separator only matched the training text because clean_text strips
# '.' afterwards.
testdata['turn'] = testdata['lem_turn1']+" "+testdata['lem_turn3']

In [31]:
# Same clean-up sequence as the training text: misspellings, punctuation,
# numbers, then misspellings once more.
for step in (replace_typical_misspell, clean_text, clean_numbers, replace_typical_misspell):
    testdata['turn'] = testdata['turn'].progress_apply(step)

100%|██████████| 5509/5509 [00:00<00:00, 253635.20it/s]
100%|██████████| 5509/5509 [00:00<00:00, 145438.65it/s]
100%|██████████| 5509/5509 [00:00<00:00, 138653.22it/s]
100%|██████████| 5509/5509 [00:00<00:00, 248643.29it/s]


In [32]:
# Encode the test text with the tokenizer fitted on the training text and pad
# every sequence to MAX_SEQUENCE_LENGTH.
testX = pad_sequences(
    tokenizer.texts_to_sequences(testdata['turn'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [33]:
# Predicted class probabilities for every test row: one column per unit of
# the model's 4-way softmax output layer.
y_hat = model.predict(testX)

In [34]:
# Inspect the predicted probabilities.
y_hat

array([[9.9506183e-04, 3.5937759e-05, 9.9121606e-01, 7.7529722e-03],
       [5.3257559e-04, 1.3523049e-03, 9.9630642e-01, 1.8087635e-03],
       [8.2888198e-01, 1.4418376e-05, 1.6590360e-01, 5.1999702e-03],
       ...,
       [8.0465275e-04, 2.4661593e-04, 9.9828571e-01, 6.6293636e-04],
       [2.0359247e-03, 5.7735358e-04, 9.8764753e-01, 9.7391680e-03],
       [7.8668920e-05, 1.2653698e-04, 9.9843019e-01, 1.3646433e-03]],
      dtype=float32)

In [35]:
# One-hot encode the test labels on the *training* label columns, so that
# column i means the same class as column i of the model output even when the
# test file is missing a class.
# NOTE(review): a test label that never occurs in the training data is
# dropped by the reindex and yields an all-zero row.
testy = (
    pd.get_dummies(testdata['label'])
    .reindex(columns=pd.get_dummies(data['label']).columns, fill_value=0)
    .values
)

In [36]:
# Compare the true class (position of the 1 in each one-hot row) with the most
# probable predicted class.
accuracy_score(testy.argmax(axis=1), y_hat.argmax(axis=1))

0.8863677618442548