In [16]:
import nltk
import re
import multiprocessing as mp
import plotly.express as px
from string import punctuation
import import_ipynb
import prepro

# This notebook carries out POS tagging on concordance files and plots the most common tagged words in a graph

In [3]:
# Reads the stored concordance file for a query word and runs POS tagging on it.
# Every line of the file is tokenised into words before being passed to posTag.
def concordance_word_type(query, wordtype):
    """Tokenise a stored concordance file and return its POS-tag summary.

    Parameters
    ----------
    query : str
        Concordance query word; selects the file ``concordance_<query>.txt``.
    wordtype : str
        Sub-folder of the concordance directory the file is read from.

    Returns
    -------
    The value returned by ``posTag`` for the tokenised lines.
    """
    concordance_path = (
        '../March-No-Retweets/Concordance/' + wordtype
        + '/concordance_' + query + '.txt'
    )

    with open(concordance_path, 'r', encoding='utf8') as handle:
        # One token list per line of the file, trailing newline removed first.
        tokenised_lines = [
            nltk.word_tokenize(raw_line.rstrip("\n")) for raw_line in handle
        ]

    return posTag(tokenised_lines, query)

In [9]:
def posTag(stripped_string, query):
    """POS-tag every tokenised concordance line and summarise the nouns.

    Parameters
    ----------
    stripped_string : list of list of str
        One list of tokens per concordance line.
    query : str
        Concordance query word, forwarded unchanged to ``getNouns``.

    Returns
    -------
    The value returned by ``getNouns`` for the tagged lines.
    """
    # Tag each line separately so the result stays grouped by line.
    tagged_lines = [nltk.pos_tag(tokens) for tokens in stripped_string]

    # Only the noun summary is produced here. getVerbs, get_adjective and
    # get_all are alternative summaries that are not called from this function.
    return getNouns(tagged_lines, query)

In [86]:
def get_all(token, query):
    """Return the 20 most common words of any tag.

    Parameters
    ----------
    token : list of list of (word, tag)
        POS-tagged lines.
    query : str
        Not used by this function.

    Returns
    -------
    list of (word, count)
        Up to 20 pairs as produced by ``FreqDist.most_common``.
    """
    stop_words = ["hundred", "thousand", "dont", "they", "theyre", "would", "cant"]

    # Keep words that are not excluded and are at least four characters long.
    kept_words = [
        word
        for tagged_line in token
        for word, _tag in tagged_line
        if word not in stop_words and len(word) >= 4
    ]

    return nltk.FreqDist(kept_words).most_common(20)

In [5]:
# Collect every noun in the tagged lines and return the 20 most common.
def getNouns(token, query):
    """Return the 20 most common nouns in the tagged lines.

    A word counts as a noun when its tag contains ``'NN'``. Words in the
    exclusion list and words of four characters or fewer are ignored.

    Parameters
    ----------
    token : list of list of (word, tag)
        POS-tagged lines.
    query : str
        Not used by this function.

    Returns
    -------
    list of (word, count)
        Up to 20 pairs as produced by ``FreqDist.most_common``.
    """
    stop_words = ["hundred", "thousand", "dont", "they", "theyre", "would", "cant"]

    def _noun_words():
        # Yield qualifying nouns in the order they appear.
        for tagged_line in token:
            for word, tag in tagged_line:
                if word in stop_words:
                    continue
                if 'NN' in tag and len(word) > 4:
                    yield word

    return nltk.FreqDist(_noun_words()).most_common(20)

In [52]:
# Collect every verb in the tagged lines and return the 20 most common.
def getVerbs(token, query):
    """Return the 20 most common verbs in the tagged lines.

    A word counts as a verb when its tag contains ``'V'``. The words
    "hundred" and "thousand" and words shorter than four characters are
    ignored.

    Parameters
    ----------
    token : list of list of (word, tag)
        POS-tagged lines.
    query : str
        Not used by this function.

    Returns
    -------
    list of (word, count)
        Up to 20 pairs as produced by ``FreqDist.most_common``.
    """
    stop_words = ["hundred", "thousand"]

    verb_words = []
    for tagged_line in token:
        verb_words.extend(
            word
            for word, tag in tagged_line
            if word not in stop_words and 'V' in tag and len(word) >= 4
        )

    frequencies = nltk.FreqDist(verb_words)
    return frequencies.most_common(20)

In [19]:
def get_adjective(token, query):
    """Return the 20 most common adjectives in one tagged sequence.

    Unlike the noun/verb helpers, ``token`` is iterated as a flat sequence of
    (word, tag) pairs, not a list of tagged lines. A word counts as an
    adjective when its tag contains ``'J'`` and it has at least three
    characters.

    Parameters
    ----------
    token : iterable of (word, tag)
        POS-tagged words.
    query : str
        Not used by this function.

    Returns
    -------
    list of (word, count)
        Up to 20 pairs as produced by ``FreqDist.most_common``.
    """
    adjectives = [word for word, tag in token if 'J' in tag and len(word) >= 3]
    adjective_counts = nltk.FreqDist(adjectives)

    return adjective_counts.most_common(20)

In [6]:
# Function to sort the list of tuples by its second item (ascending, in place)
def Sort_Tuple(tup):
    """Sort a list of tuples in place by each tuple's second item.

    Uses the built-in stable sort instead of a hand-written bubble sort:
    the resulting order is the same (entries with equal second items keep
    their relative order) but the work is O(n log n) rather than O(n^2).

    Parameters
    ----------
    tup : list of tuple
        List whose entries have a comparable item at index 1.

    Returns
    -------
    list of tuple
        The same list object that was passed in, now sorted ascending.
    """
    tup.sort(key=lambda pair: pair[1])
    return tup

# Function to remove the single letter 'е' which cropped up in the Russian dataset.
def remove_single_letter_words(tup):
    """Drop every entry whose first item is the single letter 'е'.

    The previous implementation removed items from the list while iterating
    over it, which skips the element following each removal, so consecutive
    matching entries were not all removed. The list is now rebuilt in one pass.

    NOTE(review): the literal looks like the Cyrillic letter 'е', not the
    Latin 'e' -- confirm against the dataset.

    Parameters
    ----------
    tup : list of tuple
        List of entries whose first item is a word.

    Returns
    -------
    list of tuple
        The same list object that was passed in, with matching entries removed.
    """
    # Slice assignment keeps the caller's list object and updates it in place.
    tup[:] = [entry for entry in tup if entry[0] != 'е']
    return tup

In [None]:
import plotly.express as px
import pandas as pd

# Function which creates a dataframe containing
# the (word, count) tuples and the associated pronoun.
def dataframer(words_tuple, pnoun):
    """Build a DataFrame with one row per (word, count) pair.

    The previous version re-assigned the same three dictionary keys once per
    word and produced a frame with no columns for empty input. The frame is
    now built in a single step and always has the same three columns.

    Parameters
    ----------
    words_tuple : iterable of (word, count)
        Pairs to place in the ``Word`` and ``Count`` columns.
    pnoun : str
        Label repeated in the ``Pronoun`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``Pronoun``, ``Word`` and ``Count`` (in that order), with one
        row per input pair. Empty input gives an empty frame with those columns.
    """
    pairs = list(words_tuple)
    words = [word for word, _count in pairs]
    counts = [count for _word, count in pairs]

    return pd.DataFrame(
        {
            # Explicit repetition so the column exists even when there are no rows.
            "Pronoun": [pnoun] * len(pairs),
            "Word": words,
            "Count": counts,
        }
    )

In [None]:
# Automates concordance loading and POS tagging for a list of query words.
# Each word names the concordance file (in the 'Verbs' folder) to process.
def automatic_concordance_stripping(vocab):
    """Run the concordance/POS pipeline for every word in ``vocab``.

    For each word the result of ``concordance_word_type(word, 'Verbs')`` is
    taken without its first entry, sorted with ``Sort_Tuple`` and paired with
    the word.

    Parameters
    ----------
    vocab : iterable of str
        Query words to process.

    Returns
    -------
    list of (list, str)
        One ``(sorted entries, word)`` tuple per input word, in input order.
    """
    return [
        (Sort_Tuple(concordance_word_type(word, 'Verbs')[1:]), word)
        for word in vocab
    ]

The function below creates a graph from the list of tuples provided.
The end goal is to produce a stacked bar chart displaying the frequency
of commonly occurring words associated with each concordance query word.
For example, the input has the form:

- [a list containing a tuple]
- (a tuple containing a list of words and their frequency, and the number of results the user wants to see)

**TODO: return the result as a percentage, not a count.**


In [None]:
# Function takes a list of (word/count list, concorded word) tuples and the number (x)
# of entries to plot for each concorded word.
# Draws a stacked bar chart of the processed data and saves it as an HTML file.
def bar_stacked(list_of_tuples, x, title, output_path="../Images/pronoun-nouns-file.html"):
    """Plot a stacked bar chart of word counts per concorded word.

    The frames are combined with a single ``pd.concat`` call, because
    ``DataFrame.append`` was removed in pandas 2.0. The output location is a
    parameter so that separate charts need not overwrite the same file.

    Parameters
    ----------
    list_of_tuples : iterable of (list of (word, count), str)
        One entry per concorded word: its (word, count) list and its label.
    x : int
        Number of entries to take from the END of each (word, count) list;
        the sign is ignored. Because of slice semantics, ``x == 0`` selects
        the whole list.
    title : str
        Chart title.
    output_path : str, optional
        Where the HTML version of the chart is written. Defaults to the
        previously hard-coded location.

    Returns
    -------
    None
        The chart is written to ``output_path`` and displayed.
    """
    # A negative bound makes word_counts[x:] keep the last |x| entries.
    x = -abs(x)

    frames = [
        dataframer(word_counts[x:], label)
        for word_counts, label in list_of_tuples
    ]
    # Skip empty frames so they cannot affect the dtypes of the combined frame.
    frames = [frame for frame in frames if not frame.empty]

    if frames:
        new = pd.concat(frames, ignore_index=True)
    else:
        # Nothing to plot: keep the expected schema so sorting/plotting still work.
        new = pd.DataFrame(columns=["Pronoun", "Word", "Count"])

    new = new.sort_values(by=['Count'], ascending=True)

    fig = px.bar(
        new,
        x="Pronoun",
        y="Count",
        color="Pronoun",
        title=title,
        text="Word",
        color_discrete_sequence=px.colors.sequential.Viridis,
    )
    fig.update_layout(hovermode="closest")

    fig.write_html(output_path)

    fig.show()

In [None]:
# Pronouns whose concordance files are tagged and summarised.
pronouns =["I", "They", "We", "My", "Us", "You", "Me"]

# One (sorted word/count list, pronoun) tuple per pronoun.
ls_pronouns = automatic_concordance_stripping(pronouns)

In [None]:
# Copy the per-pronoun results into a fresh list.
ls_of_pros = []

for i in ls_pronouns:
    ls_of_pros.append(i)
    
# Plot the last 10 entries of each pronoun's sorted list.
bar_stacked(ls_of_pros, 10, "10 Most Common Verbs Found Alongside English Pronouns")

In [57]:
# Re-draws the pronoun chart with the same arguments as the preceding cell.
bar_stacked(ls_of_pros, 10, "10 Most Common Verbs Found Alongside English Pronouns")

In [11]:
# Query words whose concordance files are tagged and summarised.
vocab = ["believe", "think", "know", "tell", "need", "want", "read", "spreading", "stop", "give"]

# One (sorted word/count list, query word) tuple per entry in vocab.
ls_vrb_adj = automatic_concordance_stripping(vocab)

In [18]:
# Copy the per-word results into a fresh list.
ls_of_vars = []

for i in ls_vrb_adj:
    ls_of_vars.append(i)
    
# Plot the last 10 entries of each word's sorted list.
# NOTE: bar_stacked is called without an explicit output location here, so the
# HTML written by this call replaces the file written by the earlier chart.
bar_stacked(ls_of_vars, 10, "10 Most Common Nouns Alongside The Most Frequent Verbs From Concorded Pronouns.")