In [164]:
import nltk
import math
from os import listdir
from os.path import isfile, join

# Return each term's frequency normalized by the highest term frequency in the document, together with that maximum frequency
def calculateTermFrequencies(document, prefix = "./"):
    """Compute max-normalized term frequencies for one document file.

    The file at ``prefix + document`` is read line by line, every line is
    counted with ``calculateLineTermFrequencies``, and each raw count is then
    divided by the largest count found in the document.

    Args:
        document: File name; it is appended to ``prefix`` by plain string
            concatenation.
        prefix: Path prefix for ``document``. It must already end with a
            path separator because no separator is inserted.

    Returns:
        A tuple ``(words, mf)``. ``words`` maps each token to
        ``count / mf`` and ``mf`` is the highest raw count in the document.
        A document without any token yields ``({}, 0)``.
    """
    words = {}
    # The context manager closes the file even when tokenization raises.
    with open(prefix + document) as f:
        for l in f:
            words = calculateLineTermFrequencies(l, words)
    # Nothing to normalize (empty file): avoid indexing into an empty ranking.
    if not words:
        return words, 0
    # Only the maximum is needed, so a full sort is unnecessary.
    mf = max(words.values())
    for w in words:
        words[w] /= mf
    return words, mf

# Return the frequency of the words in a string, adding the counts to the words dictionary
def calculateLineTermFrequencies(line, words):
    """Accumulate the token counts of ``line`` into ``words``.

    Args:
        line: Text to split into tokens with ``nltk.word_tokenize``.
        words: Dictionary mapping token -> count. It is updated in place,
            so counts already present are kept and incremented.

    Returns:
        The same ``words`` dictionary that was passed in.
    """
    tokens = nltk.word_tokenize(line)
    for tok in tokens:
        if tok in words:
            words[tok] = words[tok] + 1
        else:
            # First occurrence of this token.
            words[tok] = 1
    return words

# Calculates the inverse document frequency of the terms in a list of documents
def calculateIDFCorpus(corpora, prefix = "./"):
    """Compute IDF and aggregate term statistics over a list of documents.

    Args:
        corpora: Sequence of document file names; each one is processed with
            ``calculateTermFrequencies(name, prefix)``.
        prefix: Path prefix forwarded unchanged to
            ``calculateTermFrequencies``.

    Returns:
        A tuple ``(IDF, IDF_w, total_term_frequencies, total_words)``:

        * ``IDF`` maps term -> ``log(N / (1 + df))`` where ``N`` is
          ``len(corpora)`` and ``df`` is the number of documents containing
          the term. The value is 0 when ``df == N - 1`` and negative when
          the term occurs in every document.
        * ``IDF_w`` maps term -> ``df``.
        * ``total_term_frequencies`` maps term -> raw occurrences summed
          over all documents (whole-number floats).
        * ``total_words`` is the number of tokens in the whole corpus
          (a whole-number float, or the int ``0`` when nothing was counted).
    """
    documents = len(corpora)
    IDF_w = {}
    total_term_frequencies = {}
    total_words = 0
    for d in corpora:
        words, mf = calculateTermFrequencies(d, prefix)
        for w, normalized in words.items():
            # ``normalized`` is count / mf. Multiplying back does not always
            # reproduce the integer count (1 / 49 * 49 gives
            # 0.9999999999999999), so round to the nearest whole count.
            # The value stays a float to keep the previous return types.
            count = float(round(normalized * mf))
            IDF_w[w] = IDF_w.get(w, 0) + 1
            total_term_frequencies[w] = total_term_frequencies.get(w, 0) + count
            total_words += count
    IDF = {w: math.log(documents / (1 + df)) for w, df in IDF_w.items()}
    return IDF, IDF_w, total_term_frequencies, total_words
        

# Calculates the inverse document frequency of a term in the documents found in a system folder
def calculateIDFDirectory(directory, prefix = "./"):
    """Run ``calculateIDFCorpus`` over every regular file of a directory.

    Args:
        directory: Directory name. It is appended to ``prefix`` by plain
            string concatenation when ``prefix`` is truthy; otherwise it is
            used on its own.
        prefix: Path prefix placed in front of ``directory``.

    Returns:
        Whatever ``calculateIDFCorpus`` returns for the file names found in
        the directory, using the directory path plus ``"/"`` as prefix.
    """
    if prefix:
        folder = prefix + directory
    else:
        folder = directory
    # Keep regular files only; sub-directories are skipped.
    file_names = []
    for entry in listdir(folder):
        if isfile(join(folder, entry)):
            file_names.append(entry)
    return calculateIDFCorpus(file_names, folder + "/")

# Calculates the TF-IDF of a document using a previously generated IDF or the documents of a specific directory
def TFIDF(document, directory="./", IDF = None, prefix="./"):
    """Compute the TF-IDF weight of every token of one document.

    Args:
        document: File name of the document to score.
        directory: Directory holding the document. When truthy the document
            is read from ``prefix + directory + "/"``; otherwise from
            ``prefix``.
        IDF: Optional mapping term -> inverse document frequency. When it is
            falsy (``None`` or an empty mapping) the IDF is computed with
            ``calculateIDFDirectory(directory, prefix)``.
        prefix: Path prefix placed in front of ``directory``.

    Returns:
        Dictionary mapping each token of the document to its normalized term
        frequency multiplied by its IDF.

    Raises:
        KeyError: If a token of the document has no entry in ``IDF``.
    """
    if directory:
        document_folder = prefix + directory + "/"
    else:
        document_folder = prefix
    term_frequencies, _ = calculateTermFrequencies(document, prefix = document_folder)
    if not IDF:
        # Only the IDF table is needed; the other corpus statistics are dropped.
        IDF, _, _, _ = calculateIDFDirectory(directory, prefix)
    return {term: weight * IDF[term] for term, weight in term_frequencies.items()}

In [165]:
# Sample Spanish sentence used below to exercise the token counter.
line = "Los caminos de la vida no son como yo pensaba, no son como imaginaba, no son como yo creía"
# NOTE(review): machine-specific absolute path. The trailing slash is required
# because calculateTermFrequencies appends the file name by plain concatenation.
prefix = "/home/mcampos/SoldAI/Repos/sar-machine-learning-toolkit/soldaimltk/classification/data/"

In [166]:
# Count the tokens of the sample sentence, starting from an empty dictionary.
freqs = calculateLineTermFrequencies(line, {})
print(freqs)

{'Los': 1, 'caminos': 1, 'de': 1, 'la': 1, 'vida': 1, 'no': 3, 'son': 3, 'como': 3, 'yo': 2, 'pensaba': 1, ',': 2, 'imaginaba': 1, 'creía': 1}


In [167]:
# Show the max-normalized term frequencies of test.txt and its highest raw count.
print(calculateTermFrequencies('test.txt', prefix = prefix))

({'Los': 0.3333333333333333, 'caminos': 0.3333333333333333, 'de': 0.3333333333333333, 'la': 0.3333333333333333, 'vida': 0.3333333333333333, 'no': 1.0, 'son': 1.0, 'como': 1.0, 'yo': 0.6666666666666666, 'pensaba': 0.3333333333333333, ',': 0.6666666666666666, 'imaginaba': 0.3333333333333333, 'creía': 0.3333333333333333}, 3)


In [168]:
# Build the corpus statistics from every file in the "data" directory.
# NOTE(review): the absolute path is machine-specific.
IDF, IDF_w, total_term_frequencies, total_words = calculateIDFDirectory("data", "/home/mcampos/SoldAI/Repos/sar-machine-learning-toolkit/soldaimltk/classification/")
# Print each returned value under a starred banner with its name.
print("*" *10 + "IDF" + "*" * 10)
print(IDF)
print("*" *10 + "IDF_w" + "*" * 10)
print(IDF_w)
print("*" *10 + "total_term_frequencies" + "*"*10)
print(total_term_frequencies)
print("*" *10 + "total_words" + "*"*10)
print(total_words)

**********IDF**********
{'Eres': 0.6931471805599453, 'el': 0.6931471805599453, 'caminos': -0.2231435513142097, 'largo': 0.6931471805599453, 'que': 0.6931471805599453, 'un': 0.6931471805599453, 'día': 0.6931471805599453, 'debo': 0.6931471805599453, 'caminar': 0.6931471805599453, ',': -0.2231435513142097, 'como': -0.2231435513142097, 'una': 0.6931471805599453, 'biografía': 0.6931471805599453, 'y': 0.6931471805599453, 'con': 0.6931471805599453, 'cenizas': 0.6931471805599453, 'del': 0.6931471805599453, 'ayer': 0.6931471805599453, 'Los': 0.0, 'de': 0.0, 'la': 0.0, 'vida': 0.0, 'no': 0.0, 'son': 0.0, 'yo': 0.0, 'pensaba': 0.0, 'imaginaba': 0.0, 'creía': 0.0}
**********IDF_w**********
{'Eres': 1, 'el': 1, 'caminos': 4, 'largo': 1, 'que': 1, 'un': 1, 'día': 1, 'debo': 1, 'caminar': 1, ',': 4, 'como': 4, 'una': 1, 'biografía': 1, 'y': 1, 'con': 1, 'cenizas': 1, 'del': 1, 'ayer': 1, 'Los': 3, 'de': 3, 'la': 3, 'vida': 3, 'no': 3, 'son': 3, 'yo': 3, 'pensaba': 3, 'imaginaba': 3, 'creía': 3}
*****

In [169]:
# TF-IDF of test3.txt. No IDF is passed, so TFIDF recomputes it from the files
# in the "data" directory. NOTE(review): the absolute path is machine-specific.
tfidf = TFIDF("test3.txt", directory="data", prefix = "/home/mcampos/SoldAI/Repos/sar-machine-learning-toolkit/soldaimltk/classification/")
print(tfidf)

{'Eres': 0.34657359027997264, 'el': 0.34657359027997264, 'caminos': -0.11157177565710485, 'largo': 0.6931471805599453, 'que': 0.34657359027997264, 'un': 0.34657359027997264, 'día': 0.34657359027997264, 'debo': 0.34657359027997264, 'caminar': 0.34657359027997264, ',': -0.2231435513142097, 'como': -0.11157177565710485, 'una': 0.34657359027997264, 'biografía': 0.34657359027997264, 'y': 0.34657359027997264, 'con': 0.34657359027997264, 'cenizas': 0.34657359027997264, 'del': 0.34657359027997264, 'ayer': 0.34657359027997264}
