In [None]:
!pip install nltk



In [None]:
import nltk
# Corpora used further down: WordNet for the WordNetLemmatizer and its POS constants,
# and the stopwords corpus for the English stop-word list.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re
import os
import pandas
import pickle
import string
from tqdm.notebook import tqdm
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import plotly.express as px
import plotly.graph_objects as go
import itertools
from gensim.models import KeyedVectors

In [None]:
# Integer id assigned to each gold label.
label_map = dict(neutral=0, entailment=1, contradiction=2)

# Folder that holds the SNLI CSV files.
input_path = '/content/drive/MyDrive/stanford-natural-language-inference-corpus'

# Every single lowercase letter except 'a' and 'i'; tokens equal to one of these are dropped in stage 4.
non_word_chars = {letter for letter in string.ascii_lowercase if letter not in ('a', 'i')}

# First letter of a tag returned by `pos_tag` -> the WordNet POS constant passed to the lemmatizer.
tag_dict = dict(J=wordnet.ADJ, N=wordnet.NOUN, V=wordnet.VERB, R=wordnet.ADV)

lemmatizer = WordNetLemmatizer()
stop_words_list = [word for word in stopwords.words('english')]

In [None]:
def plot_raw_distribution(df, datatype):
    """Show a bar chart with the number of pairs per gold label.

    Args:
        df: DataFrame with `gold_label` and `pairID` columns.
        datatype: Name of the split; appended to the chart title.
    """
    bar_colours = {
        'entailment': 'green',
        'contradiction': 'red',
        'neutral': 'yellow',
        '-': 'blue',
    }

    # Non-null pairID count per label, largest first.
    counts_per_label = df.groupby('gold_label')['pairID'].count()
    label_samples = counts_per_label.sort_values(ascending=False).reset_index()
    # The 'identity' colour map below uses this column's values directly as bar colours.
    label_samples['colour'] = label_samples['gold_label'].apply(bar_colours.get)

    fig = px.bar(
        label_samples,
        x='gold_label',
        y='pairID',
        title=f'Output Distribution - {datatype}',
        color='colour',
        color_discrete_map='identity',
        text='pairID',
        text_auto='.3s',
        category_orders={'gold_label': ['entailment', 'neutral', 'contradiction', '-']},
    )
    fig.update_layout(
        xaxis_title='Gold Label',
        yaxis_title='Freq',
        showlegend=False,
        title_x=0.5,
    )
    fig.update_traces(width=0.5)
    fig.show()


def plot_class_distribution(train_df, test_df):
    """Plot label distributions for the train and test splits.

    First draws the raw per-label counts of each split (rows labelled '-' included),
    then a grouped bar chart comparing each label's share of the rows that remain
    once the '-' rows are dropped.

    Args:
        train_df: Training DataFrame with `gold_label` and `pairID` columns.
        test_df: Test DataFrame with the same columns.
    """
    for split_df, split_name in ((train_df, 'Train Data'), (test_df, 'Test Data')):
        plot_raw_distribution(split_df, split_name)

    shares = pandas.DataFrame(index=['entailment', 'contradiction', 'neutral'])
    for column, split_df in (('Train', train_df), ('Test', test_df)):
        labelled = split_df[split_df.gold_label != '-']
        # Assignment aligns on the label index, so the order of the counts is irrelevant.
        shares[column] = labelled.groupby('gold_label').pairID.count() / len(labelled)
    shares = shares.reset_index()

    fig = px.bar(
        shares,
        x='index',
        y=['Train', 'Test'],
        title='Normalised Output Distribution',
        barmode='group',
        text_auto='.4p',
        category_orders={'index': ['entailment', 'neutral', 'contradiction']},
    )
    fig.update_layout(
        xaxis_title='Gold Label',
        yaxis_title='% age',
        legend_title_text='Data Split',
        title_x=0.5,
    )
    fig.show()


In [None]:
def make_token_steps(sentence, steps):
    """Apply one stage of the token-cleaning pipeline.

    Stage 1 takes the binary-parse string; stages 2-6 each take the token list
    produced by the previous stage.

    Args:
        sentence: Parse string (stage 1) or list of tokens (stages 2-6).
        steps: Stage number, 1 to 6:
            1 strip parser brackets and split into tokens,
            2 lower-case,
            3 drop tokens without any ASCII letter,
            4 drop single letters that are not words,
            5 lemmatize using POS tags,
            6 drop stop words.

    Returns:
        list: Tokens after the requested stage.

    Raises:
        ValueError: If `steps` is not one of 1..6.
    """
    # Validate up front so an unsupported stage never falls through to an unbound result.
    if steps not in (1, 2, 3, 4, 5, 6):
        raise ValueError(f'steps must be an integer from 1 to 6, got {steps!r}')

    if steps == 1:
        # The binary parse is the token sequence wrapped in brackets; drop them and split.
        return sentence.replace('(', '').replace(')', '').split()
    if steps == 2:
        # Case does not affect the inference, so normalise it away.
        return [token.lower() for token in sentence]
    if steps == 3:
        # Keep only tokens containing a letter, i.e. drop pure punctuation / numbers.
        return [token for token in sentence if re.search('[a-zA-Z]', token)]
    if steps == 4:
        # Drop single characters that are not words in their own right.
        return [token for token in sentence if token not in non_word_chars]
    if steps == 5:
        # Map each tag to the POS constant the lemmatizer expects; unknown tags fall back to noun.
        wordnet_tags = [tag_dict.get(tag[0].upper(), wordnet.NOUN) for _, tag in pos_tag(sentence)]
        return [lemmatizer.lemmatize(token, tag) for token, tag in zip(sentence, wordnet_tags)]
    # steps == 6
    return [token for token in sentence if token not in stop_words_list]


# Display name of each preprocessing stage, keyed by its 1-based stage number.
steps = dict(enumerate(
    (
        'Unprocessed',
        'Lower',
        'Punctuations/Numbers',
        'Non-word characters',
        'Lemmatization',
        'Stop Words',
    ),
    start=1,
))


def get_funnel_df(df, datatype):
    """Run the six preprocessing stages and record the vocabulary size after each one.

    Args:
        df: DataFrame with `gold_label`, `sentence1_binary_parse` and
            `sentence2_binary_parse` columns. It is not modified.
        datatype: Name of the split; used in the progress-bar descriptions.

    Returns:
        tuple: `(funnel_df, df)`.
            `funnel_df` is indexed by stage number 1-6 with columns 'Premise' and
            'Hypothesis' (unique-token counts after that stage) and
            'Processing Stage' (stage name).
            `df` is a copy of the input without the '-' labelled rows, with
            `premise_tokens` and `hypothesis_tokens` columns holding the token
            lists after the final stage.
    """
    # Filter first, then copy: the result is an independent frame, so the column
    # assignments below cannot touch the caller's data or trigger SettingWithCopyWarning.
    df = df[df.gold_label != '-'].copy()
    # A missing parse becomes an empty string, which tokenises to an empty list.
    df = df.fillna({'sentence1_binary_parse': '', 'sentence2_binary_parse': ''})

    # (token column, source parse column, funnel column / progress-bar label)
    sentence_kinds = (
        ('premise_tokens', 'sentence1_binary_parse', 'Premise'),
        ('hypothesis_tokens', 'sentence2_binary_parse', 'Hypothesis'),
    )
    for token_column, parse_column, _ in sentence_kinds:
        df[token_column] = df[parse_column]

    funnel_df = pandas.DataFrame(index=range(1, 7), columns=['Premise', 'Hypothesis'])
    funnel_df['Processing Stage'] = list(map(steps.get, funnel_df.index))

    for step in range(1, 7):
        for token_column, _, label in sentence_kinds:
            # Re-register progress_apply so the bar carries this stage's description.
            tqdm.pandas(desc=f'{datatype} - {label} Processing - {steps[step]}')
            # Each stage consumes the output of the previous one, stored in the same column.
            df[token_column] = df[token_column].progress_apply(lambda sent: make_token_steps(sent, step))
            unique_tokens = set(itertools.chain.from_iterable(df[token_column]))
            funnel_df.loc[step, label] = len(unique_tokens)

    return funnel_df, df


def plot_tokens_funnel_graph(funnel_df, datatype):
    """Draw the premise/hypothesis unique-token counts across the processing stages.

    Args:
        funnel_df: DataFrame with 'Premise', 'Hypothesis' and 'Processing Stage' columns.
        datatype: Name of the split; appended to the chart title.
    """
    figure = px.funnel(
        funnel_df,
        x='Processing Stage',
        y=['Premise', 'Hypothesis'],
        title=f'Funnel Graph for {datatype}',
    )
    figure.update_layout(
        yaxis_title='Unique Tokens',
        legend_title_text='Sentence Type',
        title_x=0.5,
    )
    figure.show()


In [None]:
def plot_sentence_length_dist(df, datatype):
    """Plot grouped histograms (with box-plot marginals) of tokens per sentence.

    Args:
        df: DataFrame whose `premise_tokens` and `hypothesis_tokens` columns hold token lists.
        datatype: Name of the split; appended to the chart title.
    """
    # `assign` returns a new frame, so the caller's DataFrame is left untouched.
    lengths_df = df.assign(
        Premise=df['premise_tokens'].apply(len),
        Hypothesis=df['hypothesis_tokens'].apply(len),
    )

    fig = px.histogram(
        lengths_df,
        x=['Premise', 'Hypothesis'],
        marginal='box',
        barmode='group',
        title=f'Sentence Length Distribution - {datatype}',
    )
    fig.update_layout(
        xaxis_title="Sentence Length (Num. of tokens)",
        yaxis_title="Count",
        legend_title_text='Sentence Type',
        title_x=0.5,
    )
    fig.show()

In [None]:
def tokens_word2vec(df, datatype, word2vec_path='/content/drive/MyDrive/GoogleNews-vectors-negative300.bin'):
    """Print the share of unique premise / hypothesis tokens found in the Word2Vec vocabulary.

    Args:
        df: DataFrame whose `premise_tokens` and `hypothesis_tokens` columns hold
            iterables of tokens.
        datatype: Name of the split; used in the printed messages.
        word2vec_path: Binary word2vec file to load. Defaults to the GoogleNews
            vectors on the mounted drive.
    """
    word2vec_pretrained = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    # Only membership is needed, so reuse the model's own key -> index mapping instead
    # of building a second dict that pairs every key with its vector.
    vocabulary = word2vec_pretrained.key_to_index

    def _coverage(token_lists):
        """Fraction of the distinct tokens present in the vocabulary (0.0 if there are none)."""
        unique_tokens = set(itertools.chain.from_iterable(token_lists))
        if not unique_tokens:
            return 0.0
        return sum(token in vocabulary for token in unique_tokens) / len(unique_tokens)

    premise_presence = _coverage(df.premise_tokens)
    hypothesis_presence = _coverage(df.hypothesis_tokens)

    print(f'{round(premise_presence * 100, 2)}% of {datatype} Premise tokens present in Word2Vec!')
    print(f'{round(hypothesis_presence * 100, 2)}% of {datatype} Hypothesis tokens present in Word2Vec!')


In [None]:
# Each frame is read from the file of the split it is named after.
train_df = pandas.read_csv(os.path.join(input_path, 'snli_1.0_train.csv'))
test_df = pandas.read_csv(os.path.join(input_path, 'snli_1.0_test.csv'))

In [None]:
# Raw label counts for each split, followed by the normalised train-vs-test comparison.
plot_class_distribution(train_df, test_df)

In [None]:
# Tagger model for `pos_tag`, which the lemmatization stage of `make_token_steps` calls.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Run all six preprocessing stages on each split, keeping both the per-stage
# unique-token counts and the frame with the final token lists.
train_funnel_df, processed_train_df = get_funnel_df(train_df, 'Train Data')
test_funnel_df, processed_test_df = get_funnel_df(test_df, 'Test Data')

Train Data - Premise Processing - Unprocessed:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Hypothesis Processing - Unprocessed:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Premise Processing - Lower:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Hypothesis Processing - Lower:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Premise Processing - Punctuations/Numbers:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Hypothesis Processing - Punctuations/Numbers:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Premise Processing - Non-word characters:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Hypothesis Processing - Non-word characters:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Premise Processing - Lemmatization:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Hypothesis Processing - Lemmatization:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Premise Processing - Stop Words:   0%|          | 0/9824 [00:00<?, ?it/s]

Train Data - Hypothesis Processing - Stop Words:   0%|          | 0/9824 [00:00<?, ?it/s]

Test Data - Premise Processing - Unprocessed:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Hypothesis Processing - Unprocessed:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Premise Processing - Lower:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Hypothesis Processing - Lower:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Premise Processing - Punctuations/Numbers:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Hypothesis Processing - Punctuations/Numbers:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Premise Processing - Non-word characters:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Hypothesis Processing - Non-word characters:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Premise Processing - Lemmatization:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Hypothesis Processing - Lemmatization:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Premise Processing - Stop Words:   0%|          | 0/549367 [00:00<?, ?it/s]

Test Data - Hypothesis Processing - Stop Words:   0%|          | 0/549367 [00:00<?, ?it/s]

In [None]:
# Unique-token counts after each processing stage, one funnel chart per split.
plot_tokens_funnel_graph(train_funnel_df, 'Train Data')
plot_tokens_funnel_graph(test_funnel_df, 'Test Data')

In [None]:
# Tokens-per-sentence histograms computed on the fully processed token lists.
plot_sentence_length_dist(processed_train_df, 'Train Data')
plot_sentence_length_dist(processed_test_df, 'Test Data')

In [None]:
# Vocabulary coverage of the processed tokens; note that each call loads the Word2Vec file itself.
tokens_word2vec(processed_train_df, 'Train Data')
tokens_word2vec(processed_test_df, 'Test Data')

94.33% of Train Data Premise tokens present in Word2Vec!
95.26% of Train Data Hypothesis tokens present in Word2Vec!
80.41% of Test Data Premise tokens present in Word2Vec!
72.92% of Test Data Hypothesis tokens present in Word2Vec!
