In [1]:
%%file TFIDF.py

from mrjob.job import MRJob
import re
import numpy as np
import pandas as pd

WORD_RE = re.compile(r"[\w']+")

#!/usr/bin/env python3
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.compat import jobconf_from_env
from mrjob.protocol import RawValueProtocol

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from math import log

import sys
import os

import re

# Tokeniser used by the first mapper: runs of word characters and apostrophes.
WORD_RE = re.compile(r"[\w']+")

# Read only the second CSV column (labelled 'tokens'); the file is read
# without a header row because explicit column names are supplied.
data = pd.read_csv('data/tokens.csv', usecols=[1], names=['i', 'tokens'])

# Whitespace-split every row, then collect the sorted set of distinct tokens.
tokens_per_row = [str(row).split() for row in data['tokens']]
unique_tokens = np.unique(np.concatenate(tokens_per_row))

# Word 2 index dictionary: token -> column position in the TFIDF matrix.
word2idxDict = {
    token: column
    for token, column in zip(unique_tokens, np.arange(len(unique_tokens)))
}

# Matrix dimensions; the document count is a fixed constant of this script.
NUMBER_OF_DOCUMENTS = 5572
NUMBER_OF_UNIQUE_TOKENS = len(unique_tokens)

# Dense (documents x vocabulary) buffer that the final mapper fills in place.
TFIDF = np.zeros((NUMBER_OF_DOCUMENTS, NUMBER_OF_UNIQUE_TOKENS))

class MRTFIDF(MRJob):
    """Four-step MapReduce job computing a TF-IDF score per (term, document).

    Steps 1-3 each pair a mapper with a reducer; step 4 is a mapper only.
    The last mapper also stores every score in the module-level ``TFIDF``
    array, using ``word2idxDict`` to find the column for each term.
    """

    # Mapper1:
    # Emits a count of 1 for every word occurrence in a document.
    def get_words_from_line(self, _, line):
        """
        Maps to:

        Key: tuple: (word, document)
        Value: 1

        The text before the first comma of the line is used as the document
        name; the remaining comma-separated pieces are re-joined with spaces
        and tokenised with WORD_RE.
        """
        fields = line.split(',')
        docname = fields[0]
        text = ' '.join(fields[1:])

        # One key-value pair per matched word
        for term in WORD_RE.findall(text):
            yield (term, docname), 1

    # Reducer1:
    # Sums the counts of identical (word, document) keys.
    def term_frequency_per_doc(self, term_doc, occurences):
        """
        Key: tuple: (term, doc)
        Value: number of occurrences of the term in the document
        """
        term, docname = term_doc
        yield (term, docname), sum(occurences)

    # Mapper2:
    # Re-keys by document so all terms of one document reach one reducer.
    def get_docs_tf(self, term_doc, freq):
        """
        Maps to:

        Key: document
        Value: tuple: (term, frequency)
        """
        term, doc = term_doc
        yield doc, (term, freq)

    # Reducer2:
    # Attaches the total word count of the document to each of its terms.
    def number_of_terms_per_each_doc(self, doc, term_freqs):
        """
        Key: tuple: (term, doc)
        Value: tuple: (frequency of term, total document wordcount)
        """
        # The values must be buffered: the total is only known after
        # the whole iterator has been consumed.
        pairs = [(entry[0], entry[1]) for entry in term_freqs]
        terms_in_doc = sum(freq for _term, freq in pairs)

        for term, freq in pairs:
            yield (term, doc), (freq, terms_in_doc)

    # Mapper3:
    # Re-keys by term so all documents containing a term reach one reducer.
    def get_terms_per_corpus(self, term_doc, freq_docWords):
        """
        Maps to:

        Key: term
        Value: tuple: (doc, frequency, total number of words in doc)
        """
        term, doc = term_doc
        freq, terms_in_doc = freq_docWords
        yield term, (doc, freq, terms_in_doc)

    # Reducer3:
    # Attaches the number of documents containing the term to each record.
    def term_appearence_in_corpus(self, term, doc_freq_nwords):
        """
        Key: tuple: (term, doc)
        Value: tuple: (frequency of term, total number of words in doc,
                       total number of docs containing term)
        """
        # Buffer every (doc, freq, words-in-doc) record; the number of
        # records received for this term is the document frequency.
        records = [(rec[0], rec[1], rec[2]) for rec in doc_freq_nwords]
        docs_containing_term = len(records)

        for doc, freq, terms_in_doc in records:
            yield (term, doc), (freq, terms_in_doc, docs_containing_term)

    # Mapper4
    # Computes the tfidf score from the term frequency, the document length
    # and the number of documents containing the term.
    def calculate_tf_idf(self, term_doc, tf_n_df):
        """
        Key: tuple: (term, doc)
        Value: TFIDF-value

        Side effect: the score is also written to TFIDF[int(doc), column],
        where the column comes from word2idxDict. That write targets the
        module-level array of whichever Python process runs this mapper.
        """
        term, doc = term_doc
        freqs, terms_in_doc, docs_containing_term = tf_n_df

        # Term frequency relative to document length, and inverse
        # document frequency over the fixed corpus size.
        term_frequency = freqs / terms_in_doc
        inverse_doc_frequency = log(NUMBER_OF_DOCUMENTS / docs_containing_term)
        tfidf = term_frequency * inverse_doc_frequency

        # Column of the term, then row of the document, in the dense matrix
        column = word2idxDict[term]
        row = int(doc)
        TFIDF[row, column] = tfidf

        yield (term, doc), tfidf

    def steps(self):
        """Return the four MRStep stages in execution order."""
        count_terms = MRStep(
            mapper=self.get_words_from_line,
            reducer=self.term_frequency_per_doc,
        )
        count_doc_words = MRStep(
            mapper=self.get_docs_tf,
            reducer=self.number_of_terms_per_each_doc,
        )
        count_docs_with_term = MRStep(
            mapper=self.get_terms_per_corpus,
            reducer=self.term_appearence_in_corpus,
        )
        score = MRStep(
            mapper=self.calculate_tf_idf,
        )
        return [count_terms, count_doc_words, count_docs_with_term, score]

if __name__ == '__main__':
    # Run the job; the last mapper fills the module-level TFIDF array.
    MRTFIDF.run()

    # Dump the filled matrix as raw binary (no shape/dtype header is stored).
    output_path = 'data/TFIDF.dat'
    TFIDF.tofile(output_path)

Writing TFIDF.py


In [2]:
# Run the MapReduce script written by the previous cell on the tokens file.
!python3 TFIDF.py data/tokens.csv

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /var/folders/mt/m4qdkk9s0ql0tt_xd7yqltwc0000gn/T/TFIDF.philliphoejbjerg.20221108.190200.985873
Running step 1 of 4...
Running step 2 of 4...
Running step 3 of 4...
Running step 4 of 4...
job output is in /var/folders/mt/m4qdkk9s0ql0tt_xd7yqltwc0000gn/T/TFIDF.philliphoejbjerg.20221108.190200.985873/output
Streaming final output from /var/folders/mt/m4qdkk9s0ql0tt_xd7yqltwc0000gn/T/TFIDF.philliphoejbjerg.20221108.190200.985873/output...
["court", "4938"]	0.8625509334899697
["courtroom", "2793"]	0.45397417552103664
["cousin", "2200"]	2.6441207181132502
["cousin", "4008"]	0.6610301795283126
["cover", "4711"]	0.7903655302382063
["cover", "5221"]	0.5269103534921376
["cover", "3338"]	1.2645848483811302
["cover", "3425"]	0.7025471379895167
["cover", "3496"]	0.5269103534921376
["cover", "2162"]	0.3719367201120971
["cover", "2786"]	0.31614621209528254
["cover", "1372"]	0.3719367201

["msg", "955"]	0.25447702770619723
["msg", "993"]	0.22620180240550863
["msg", "42"]	0.25447702770619723
["msg", "4234"]	0.2714421628866104
["msg", "3531"]	0.5089540554123945
["msg", "3545"]	0.25447702770619723
["msg", "3585"]	0.22620180240550863
["msg", "3616"]	0.5089540554123945
["msg", "372"]	0.581661777614165
["msg", "2672"]	0.8143264886598311
["msg", "2704"]	0.2142964443841661
["msg", "2737"]	0.23950779078230328
["msg", "2766"]	0.23950779078230328
["msg", "1270"]	0.339302703608263
["msg", "1372"]	0.23950779078230328
["msg", "138"]	0.2142964443841661
["msg", "1384"]	0.13134298204190825
["msg", "1401"]	0.23950779078230328
["msg", "2232"]	0.581661777614165
["msg", "2326"]	0.22620180240550863
["msg", "2343"]	0.23950779078230328
["msg", "2366"]	0.22620180240550863
["msg", "2369"]	0.07020055936722683
["msg", "2582"]	0.339302703608263
["msg", "1081"]	0.8143264886598311
["msg", "1169"]	0.23950779078230328
["msg", "2957"]	0.22620180240550863
["msg", "3000"]	0.3132

["wait", "5158"]	0.43021323819925905
["wait", "5474"]	0.3871919143793332
["wait", "548"]	0.9679797859483329
["wait", "5504"]	0.7743838287586664
["wait", "5564"]	0.7743838287586664
["wait", "570"]	0.21510661909962953
["wait", "573"]	1.9359595718966658
["wait", "574"]	0.9679797859483329
["wait", "577"]	0.9679797859483329
["wait", "5297"]	0.3871919143793332
["wait", "5312"]	0.21510661909962953
["wait", "5329"]	1.290639714597777
["wait", "5346"]	0.3871919143793332
["wait", "5354"]	0.2765656531280951
["wait", "5359"]	0.2765656531280951
["wait", "5366"]	0.32265992864944426
["wait", "5368"]	0.2978399341379486
["wait", "4775"]	0.5531313062561902
["wait", "483"]	0.5531313062561902
["wait", "4835"]	0.7743838287586664
["wait", "4917"]	0.48398989297416645
["wait", "4929"]	0.2978399341379486
["wait", "3298"]	0.1935959571896666
["wait", "3365"]	1.290639714597777
["wait", "3389"]	0.6453198572988885
["wait", "3454"]	0.16132996432472213
["wait", "3461"]	0.7743838287586664
["w

["eye", "1792"]	0.5117168904259746
["eye", "1967"]	0.8772289550159564
["eye", "744"]	1.0234337808519491
["eye", "4185"]	0.6822891872346328
["eye", "3553"]	3.070301342555848
["eye", "3584"]	0.24562410740446783
["brisk", "4530"]	0.4107385397571284
["brison", "4287"]	0.8625509334899697
["bristol", "323"]	0.49577263464623444
["bristol", "1054"]	0.37773153115903574
["british", "1627"]	0.6610301795283126
["british", "4018"]	0.5665972967385536
["britney", "899"]	0.6610301795283126
["britney", "95"]	0.6610301795283126
["bro", "3414"]	3.619607486889903
["bro", "3802"]	0.6581104521618005
["bro", "4078"]	1.2065358289633008
["bro", "2664"]	3.619607486889903
["broad", "3265"]	0.7932362154339752
["broad", "4823"]	0.6101817041799809
["broadband", "3537"]	0.7841372122636088
["broke", "5096"]	0.7795634913850662
["broke", "4156"]	0.7795634913850662
["broke", "4263"]	0.7795634913850662
["broke", "2722"]	0.7795634913850662
["broke", "1302"]	0.36926691697187347
["broken", "474"]	0

["right", "1303"]	0.5925960743459271
["right", "1304"]	2.074086260210745
["right", "1351"]	0.829634504084298
["right", "2234"]	0.4609080578246099
["right", "2265"]	0.5925960743459271
["right", "230"]	0.553089669389532
["right", "2399"]	1.0370431301053724
["right", "2411"]	1.3827241734738298
["right", "2413"]	0.29629803717296355
["right", "2433"]	0.05682428110166424
["right", "2617"]	0.5925960743459271
["right", "103"]	0.21832486949586785
["right", "1169"]	0.24401014826008763
["right", "1187"]	0.5925960743459271
["right", "2932"]	0.24401014826008763
["right", "2944"]	0.12963039126317155
["right", "298"]	0.5925960743459271
["right", "2983"]	0.34568104336845745
["right", "3048"]	0.4609080578246099
["rightio", "2439"]	1.4375848891499494
["rightli", "5048"]	0.6101817041799809
["rightli", "1038"]	0.6101817041799809
["riley", "3001"]	2.156377333724924
["rimac", "2602"]	1.7251018669799394
["ring", "4549"]	0.32555687538478706
["ring", "3090"]	0.30747038230785445
["ring

["shore", "3261"]	2.6441207181132502
["shore", "3039"]	0.22034339317610419
["short", "3110"]	1.5807310604764127
["short", "3267"]	1.2645848483811302
["short", "4462"]	0.5269103534921376
["short", "3793"]	1.5807310604764127
["short", "3881"]	1.5807310604764127
["short", "3910"]	0.7903655302382063
["short", "4079"]	0.20396529812598874
["short", "627"]	0.4516374458504036
["short", "64"]	0.35127356899475837
["short", "887"]	0.5748112947186955
["shortag", "5105"]	0.3961524761174519
["shortag", "1983"]	0.3961524761174519
["shortag", "429"]	0.3961524761174519
["shortcod", "3383"]	0.6635007180692074
["shorter", "2220"]	0.6272414205192989
["shorter", "1985"]	0.44275864977832863
["shorter", "744"]	1.2544828410385978
["shortli", "1517"]	0.42583617492822384
["shortli", "3983"]	0.6581104521618005
["shortli", "4014"]	1.0341735676828294
["shortli", "2861"]	1.8098037434449514
["shot", "4912"]	0.44275864977832863
["shot", "2826"]	0.44275864977832863
["shot", "1382"]	0.752689704

In [3]:
import os
import pandas as pd
import numpy as np

# Rebuild the vocabulary the same way TFIDF.py does, so that the number of
# columns used for reshaping matches the matrix the job wrote.
tokens = pd.read_csv('data/tokens.csv', usecols=[1], names=['i', 'tokens'])
unique_tokens = np.unique(np.concatenate([str(line).split() for line in tokens['tokens']]))
# saving number of docs and tokens
NUMBER_OF_DOCUMENTS, NUMBER_OF_UNIQUE_TOKENS = 5572, len(unique_tokens)

# loading TFIDF file: a flat binary dump with no shape information
TFIDF = np.fromfile('data/TFIDF.dat', dtype=float)

# reshaping back to (documents x vocabulary); raises ValueError if the
# number of values in the dump does not match the expected dimensions
TFIDF = np.reshape(TFIDF, (NUMBER_OF_DOCUMENTS, NUMBER_OF_UNIQUE_TOKENS))

# Delete the temporary dump only after the reshape has succeeded, so a
# dimension mismatch does not destroy the job output before it can be
# inspected or reloaded.
os.remove('data/TFIDF.dat')

In [4]:
# Wrap the matrix in a DataFrame whose column labels are the vocabulary
# tokens (one row per document), then persist it as CSV.
TFIDF = pd.DataFrame(TFIDF, columns=unique_tokens)

TFIDF.to_csv('data/TFIDF.csv')