In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
import nltk, os, glob
import pandas as pd
import re
from num2words import num2words
import plotly.graph_objects as go

In [2]:
# Function that takes a directory path and returns a list of the paths of all files of the given type in that directory.
# Takes two parameters: file_dir and filetype.
def file_to_path(file_dir, filetype):
    """Return the paths of all ``*.<filetype>`` files directly inside ``file_dir``.

    Parameters
    ----------
    file_dir : str
        Directory to search (not recursive).
    filetype : str
        File extension, with or without a leading dot ("txt" or ".txt").

    Returns
    -------
    list of str
        Matching paths in sorted order; empty if nothing matches.
    """
    # Accept ".txt" as well as "txt" so the pattern never becomes "*..txt".
    pattern = os.path.join(file_dir, '*.' + filetype.lstrip('.'))

    # glob gives no ordering guarantee; sorting keeps the document order (and
    # therefore the row order of anything built from it) stable across runs.
    return sorted(glob.glob(pattern))

In [3]:
# Function which takes a list of concordance file paths and extracts the pronoun that follows
# the underscore in each file's name (with '.txt' removed).
# NOTE: the extracted pronoun is not yet passed on to a remove_concordance step that would pull the raw line data from the concordance file.
def extract_processed_line_from_concordance(path):
    """Extract the pronoun encoded in each concordance file name.

    The pronoun is the part of the file name after the first underscore, with
    ``.txt`` removed (e.g. ``dir/concordance_They.txt`` -> ``"They"``).

    Parameters
    ----------
    path : iterable of str
        Concordance file paths.

    Returns
    -------
    list of str
        One pronoun per path whose file name contains an underscore, in input
        order. Paths without an underscore in the file name are skipped.
    """
    pronouns = []
    for file_path in path:
        # Only look at the file name: an underscore in a parent directory must
        # not be mistaken for the pronoun separator.
        file_name = os.path.basename(file_path)
        separator = file_name.find('_')
        if separator == -1:
            continue
        pronouns.append(file_name[separator + 1:].replace('.txt', ''))
    return pronouns

# Gather the .txt files in the pronoun concordance directory, then run the
# pronoun extraction over their paths.
comparison_files = file_to_path('../textFiles/Pronoun/', 'txt')
extract_processed_line_from_concordance(comparison_files)

In [4]:
# Function to create corpus
def create_corpus(path):
    """Read each file in ``path`` into a ``{file path: text}`` dictionary.

    Files are opened as UTF-8 and every newline is replaced by a space, so each
    document becomes a single line of text.
    """
    documents = {}
    for file_name in path:
        with open(file_name, 'r', encoding="utf-8") as handle:
            documents[file_name] = handle.read().replace('\n', ' ')
    return documents

In [49]:
from nltk.corpus import words

# Spanish stop-word list from NLTK; create_comparison_str drops any token found in it.
# NOTE(review): the TfidfVectorizer in this notebook uses English stop words and the
# plot title says "English Tweets" - confirm that a Spanish list is intended here.
stopword_es = nltk.corpus.stopwords.words('spanish')
# Additional tokens that create_comparison_str removes on top of the stop words.
exclude = ["thousand", "hundred", "they", "theyre", "dont"]

In [50]:
# Function to create comparison file
def create_comparison_str(path):
    """Concatenate the filtered tokens of every file in ``path`` into one string.

    A token is kept when it is at least 3 characters long and appears in
    neither the module-level ``stopword_es`` list nor the ``exclude`` list.

    Parameters
    ----------
    path : iterable of str
        Paths of UTF-8 text files to read.

    Returns
    -------
    str
        The kept tokens in reading order, each followed by a single space
        (so a non-empty result ends with a space); ``""`` if nothing is kept.
    """
    # Build the lookup once: set membership avoids rescanning both lists for
    # every token.
    blocked = set(stopword_es) | set(exclude)

    kept = []
    for file_name in path:
        with open(file_name, 'r', encoding="utf-8") as handle:
            # split() with no argument already splits on newlines as well as
            # spaces, so no separate newline replacement is needed.
            for token in handle.read().split():
                if len(token) >= 3 and token not in blocked:
                    kept.append(token)

    # Joining once avoids repeated string concatenation inside the loop.
    return "".join(token + " " for token in kept)

In [None]:
# Load every .txt file in the "All" directory into a {path: text} corpus.
corpus_files = file_to_path('../March-No-Retweets/All/', 'txt')
token_corpus = create_corpus(corpus_files)
# Show a compact summary (file name -> character count) rather than the raw
# text: printing every document floods the output and bloats the saved notebook.
print(len(token_corpus), "documents loaded")
print({os.path.basename(name): len(text) for name, text in token_corpus.items()})

In [18]:
# Load the .txt files of the "I" pronoun directory as a {path: text} corpus.
comparisons=file_to_path('../March-No-Retweets/Pronoun/I', 'txt')
comparison_corpus = create_corpus(comparisons)

In [19]:
# Read the "I" pronoun directory into one filtered token string
# (short tokens, stop words and excluded words removed by create_comparison_str).
comparison_files = file_to_path('../March-No-Retweets/Pronoun/I', 'txt')
token_file = create_comparison_str(comparison_files)

In [51]:
# Transform the pronoun files
def transform_each_pronoun_doc():
    """Map each pronoun query to the token string built from its directory.

    Returns a dict keyed by "I", "They", "We", "My", "Us", "You" and "Me" (in
    that order) whose values come from ``save_file_string``.
    """
    pronoun_queries = ("I", "They", "We", "My", "Us", "You", "Me")
    return {pronoun: save_file_string(pronoun) for pronoun in pronoun_queries}

def save_file_string(query):
    """Return the filtered token string for one pronoun's directory.

    Collects the ``.txt`` files under ``../March-No-Retweets/Pronoun/<query>``
    and passes them to ``create_comparison_str``.
    """
    pronoun_dir = '../March-No-Retweets/Pronoun/' + query
    return create_comparison_str(file_to_path(pronoun_dir, 'txt'))

# Build the {pronoun: token string} mapping for all seven pronoun queries.
pronoun_dictionary = transform_each_pronoun_doc()

In [52]:
# Unpack the per-pronoun token strings into individual names
# (a pronoun missing from the dictionary yields None, as with .get()).
(token_I, token_They, token_We, token_My,
 token_Us, token_You, token_Me) = (
    pronoun_dictionary.get(pronoun)
    for pronoun in ("I", "They", "We", "My", "Us", "You", "Me")
)

In [53]:
# https://towardsdatascience.com/higher-accuracy-and-less-process-time-in-text-classification-with-lda-and-tf-idf-d2d949e344c3
# https://www.bogotobogo.com/python/NLTK/tf_idf_with_scikit-learn_NLTK.php
# https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

# The transform() function computes the TF-IDF weight of each word in the bag of words.
# Our aim is to compare document D2 with D1, i.e. to see how many words
# of D1 match up with D2. That is why we call fit_transform() on D1 and then only transform()
# on D2: this applies the bag of words of D1 and its inverse document frequencies to the tokens in D2.
# This gives the relative comparison of D1 against D2.

#tokenizer=tokenize,
# Fit the vocabulary and IDF weights on the main corpus only; the pronoun
# documents below are projected onto that same vocabulary with transform().
tfidf = TfidfVectorizer(min_df=1, stop_words='english')

tfidf_matrix = tfidf.fit_transform(token_corpus.values())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement when available and fall back on older installs;
# list() keeps feature_names a plain list either way.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

# One single-row TF-IDF matrix per pronoun token string.
response_I = tfidf.transform([token_I])
response_They = tfidf.transform([token_They])
response_We = tfidf.transform([token_We])
response_My = tfidf.transform([token_My])
response_Us = tfidf.transform([token_Us])
response_You = tfidf.transform([token_You])
response_Me = tfidf.transform([token_Me])

In [54]:
# Size of corpus
# (documents, vocabulary size) of the fitted corpus matrix.
print(tfidf_matrix.shape)
# Reuse the feature_names list computed when the vectoriser was fitted instead
# of calling tfidf.get_feature_names() again (removed in scikit-learn 1.2).
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

(8, 33394)


In [16]:
def get_ifidf_for_words(dct):
    """Rank the terms of the first document in ``dct`` by TF-IDF weight.

    The values of ``dct`` are transformed with the module-level ``tfidf``
    vectoriser; term names come from the module-level ``feature_names``.

    NOTE(review): only row 0 of the transformed matrix is scored, so any
    further documents in ``dct`` do not affect the result - confirm intended.

    Returns a list of ``(term, weight)`` tuples for the non-zero entries of that
    first row, sorted by weight in descending order.
    """
    dense_scores = tfidf.transform(dct.values()).todense()
    first_row_columns = dense_scores[0, :].nonzero()[1]
    scored_terms = {
        feature_names[column]: dense_scores[0, column]
        for column in first_row_columns
    }
    return sorted(scored_terms.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Ranked (term, weight) pairs for the first document of the main corpus.
get_ifidf_for_words(token_corpus)

In [None]:
# Ranked (term, weight) pairs for the first document of the "I" pronoun corpus.
get_ifidf_for_words(comparison_corpus)

# TF-IDF:

The data consists of 8 days of tweets collected from 24/03/2020 to 01/04/2020.

- The training corpus contains all tweets that DO NOT include the pronouns ["I", "They", "We", "My", "Us", "You", "Me"]
- The testing data contains all tweets that DO include the pronouns ["I", "They", "We", "My", "Us", "You", "Me"]

In [68]:
import numpy as np

# Transformed pronoun documents, in the order I, They, We, My, Us, You, Me.
list_of_transformations = [
    response_I,
    response_They,
    response_We,
    response_My,
    response_Us,
    response_You,
    response_Me,
]

def tfidf_results(item, pronoun, top_n=11):
    """Tabulate the highest-weighted terms of one transformed document.

    Parameters
    ----------
    item : sparse matrix
        TF-IDF matrix of the document; weights are read from row 0, so it is
        expected to hold a single row (e.g. ``tfidf.transform([text])``).
    pronoun : str
        Label appended to the column names
        (``"Words-<pronoun>"`` and ``"TF-IDF-<pronoun>"``).
    top_n : int, optional
        Maximum number of rows to return. Defaults to 11, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Terms with a non-zero weight and their weights, sorted by weight in
        descending order with a fresh 0..n-1 index. Term names are looked up in
        the module-level ``feature_names``. If ``item`` has no non-zero entries
        the frame is empty but still carries both columns.
    """
    word_column = "Words-" + pronoun
    score_column = "TF-IDF-" + pronoun

    nonzero_columns = item.nonzero()[1]
    words = [feature_names[col] for col in nonzero_columns]
    scores = [item[0, col] for col in nonzero_columns]

    # Create both columns unconditionally: the sort below needs the score
    # column to exist even when there are no non-zero terms.
    df = pd.DataFrame(data={word_column: words, score_column: scores})

    df_sorted = df.sort_values(by=[score_column], ascending=False).reset_index(drop=True)

    return df_sorted.head(top_n)

# Top terms for each pronoun document.
i_pro = tfidf_results(response_I, "i")
they_pro = tfidf_results(response_They, "they")
we_pro = tfidf_results(response_We, "we")
my_pro = tfidf_results(response_My, "my")
us_pro = tfidf_results(response_Us, "us")
you_pro = tfidf_results(response_You, "you")
me_pro = tfidf_results(response_Me, "me")

# Side-by-side table: one Words/TF-IDF column pair per pronoun.
result = pd.concat(
    [i_pro, they_pro, we_pro, my_pro, us_pro, you_pro, me_pro],
    axis=1,
)

# The word columns only, written once and reused for both summaries below.
word_columns = [
    "Words-i", "Words-they", "Words-we", "Words-my",
    "Words-us", "Words-you", "Words-me",
]
word_table = result[word_columns]

# Distinct words appearing anywhere in the word columns.
column_values = word_table.values
unique_values = np.unique(column_values)

# Counts of each distinct row of words, and the distinct count values.
column_value_count = word_table.value_counts()
unique_values_count = np.unique(column_value_count)

display(result)

Unnamed: 0,Words-i,TF-IDF-i,Words-they,TF-IDF-they,Words-we,TF-IDF-we,Words-my,TF-IDF-my,Words-us,TF-IDF-us,Words-you,TF-IDF-you,Words-me,TF-IDF-me
0,people,0.32468,people,0.40021,people,0.337646,theory,0.316204,trump,0.314726,people,0.352042,people,0.317643
1,like,0.269032,trump,0.280419,know,0.278171,people,0.289509,china,0.308907,like,0.259204,like,0.312806
2,think,0.245757,like,0.235196,need,0.272974,like,0.253027,people,0.290366,trump,0.259016,theory,0.251605
3,know,0.239528,medium,0.202516,trump,0.268179,trump,0.184447,medium,0.205079,know,0.237304,tell,0.2012
4,trump,0.233464,lie,0.193435,like,0.204034,time,0.176184,like,0.189887,lie,0.187248,trump,0.197905
5,theory,0.214969,know,0.183484,medium,0.170894,know,0.171036,virus,0.16635,think,0.153412,know,0.189212
6,say,0.151317,china,0.159957,lie,0.16866,think,0.134935,lie,0.164342,stop,0.15215,make,0.18788
7,time,0.150661,think,0.143082,time,0.158643,friend,0.130422,know,0.162024,spreading,0.14833,think,0.146448
8,believe,0.145785,say,0.142919,china,0.144759,say,0.125909,time,0.139878,medium,0.147743,time,0.137475
9,medium,0.136387,want,0.139657,stop,0.135947,virus,0.119426,theory,0.131896,time,0.140823,say,0.13432


In [70]:
# Total TF-IDF weight of every term, summed over all corpus documents.
sums = tfidf_matrix.sum(axis=0)

# Pair each term longer than 3 characters with its summed weight.
data = [
    (str(term), sums[0, col])
    for col, term in enumerate(feature_names)
    if len(term) > 3
]

# Terms ranked from highest to lowest summed weight.
df = pd.DataFrame(data, columns=['term', 'rank'])
ordering = df.sort_values('rank', ascending=False)

In [71]:
# Show the 20 terms with the highest summed TF-IDF weight.
print(ordering.head(20))

            term      rank
28423   thousand  2.479840
20104     people  2.297166
29243      trump  2.145664
15889       like  1.514964
8376        dont  1.497655
16920     medium  1.325934
28330     theory  1.265477
4637       china  1.196312
28555       time  1.039813
15281       know  1.015574
26484  spreading  1.006703
31191      virus  0.994339
26907       stop  0.975631
26481     spread  0.843925
17238    million  0.834539
9998        fact  0.792617
18172       need  0.778493
28379      think  0.763301
21417  president  0.687314
23716      right  0.665874


In [107]:
# Horizontal bar chart of the 10 highest-ranked terms.
top_terms = ordering.head(10)

bar_marker = dict(
    color='rgba(50, 171, 96, 0.6)',
    line=dict(color='rgba(50, 171, 96, 1.0)', width=1),
)

plt_data = [
    go.Bar(
        x=list(top_terms['rank']),
        y=list(top_terms['term']),
        orientation='h',
        marker=bar_marker,
    )
]

plt_layout = {
    "title": "TFIDF - English Tweets 03/2020",
    "yaxis": {"title": "Terms"},
    "xaxis": {"title": "TFIDF"},
    "showlegend": False,
}

fig = go.Figure(data=plt_data, layout=plt_layout)
fig.show()

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer with default settings; in the cells shown here it is only
# referenced from commented-out code.
vectorizer = CountVectorizer()

In [29]:
# Raw text of every .txt file directly under ../March-No-Retweets, keyed by
# file path. Newlines are left in place at this point.
corpus_files = file_to_path('../March-No-Retweets', 'txt')

corpus = {}
for f in corpus_files:
    with open(f, 'r', encoding="utf-8") as fs:
        text = fs.read()
    corpus[f] = text


In [30]:
# analyze = vectorizer.build_analyzer()

# # for v in corpus.values():
# #     print(analyze(v))

# xy = vectorizer.transform([token_file]).toarray()
# print(xy)

In [31]:
# Counts unigrams and bigrams; the token pattern also keeps single-character words.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
)

# Callable that applies this vectoriser's preprocessing, tokenisation and n-gram generation to a string.
analyze = bigram_vectorizer.build_analyzer()

# for v in corpus.values():
#     print(analyze(v))

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default TF-IDF model on the second corpus.
tf = TfidfVectorizer()

tf_matrix = tf.fit_transform(corpus.values())

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when available and keep feature_names_2 a plain list either way.
if hasattr(tf, "get_feature_names_out"):
    feature_names_2 = list(tf.get_feature_names_out())
else:
    feature_names_2 = tf.get_feature_names()

strong = ["The Quick Brown Fox Jumped Over the Lazy Dog"]

# Project the "I" pronoun token string onto the fitted vocabulary.
resp = tf.transform([token_file])

# Flatten newlines in the stored documents. This runs after the fit above, so
# it changes the corpus dict but not tf_matrix.
for doc_path in corpus:
    corpus[doc_path] = corpus[doc_path].replace('\n', ' ')

data = {}
words = []
count = []

# Terms with a non-zero weight in the projected document, and their weights.
for col in resp.nonzero()[1]:
    words.append(feature_names_2[col])
    count.append(resp[0, col])

test = {}

# Preview only the first 500 characters: the full token string is far too long
# to be useful as cell output.
print(token_file[:500])

# for x in range(len(words)):
#         data["Word"] = words
#         data["IDF"] = count

# idf = data["IDF"]
# df = pd.DataFrame(data=idf)
# #df.sort_values(by=['IDF'], ascending=True)
# print(df)

