In [1]:
import math
import os

import numpy as np
import pandas as pd

from task3 import create_index

In [2]:
def tf(path_to_docs, vocab, n_docs=100):
    """Compute a term-frequency matrix for every file in a directory.

    Each file in ``path_to_docs`` is read in full, split on whitespace, and
    the relative frequency of every vocabulary word is computed as
    ``occurrences / total_words_in_document``.

    Args:
        path_to_docs: Directory whose entries are all opened as text files
            (with the platform default encoding).
        vocab: Sized iterable of words; column ``j`` of the result
            corresponds to the ``j``-th word.
        n_docs: Unused; kept only so existing callers keep working.

    Returns:
        ``numpy.ndarray`` of shape ``(number_of_files, len(vocab))``. Rows
        follow the order returned by ``os.listdir``, which is arbitrary.
        A document without any words yields an all-zero row.
    """
    from collections import Counter

    all_docs = os.listdir(path_to_docs)
    total_docs = len(all_docs)
    vocab = list(vocab)
    tf_rows = []

    for document_number, doc in enumerate(all_docs):
        if document_number % 10 == 0:
            # Real percentage of the corpus handled so far; for exactly 100
            # documents this prints the same values as the raw index would.
            percent = document_number * 100 // total_docs
            print(f"document {percent}% process")

        # Read the whole file: readline() would silently drop every line
        # after the first one in a multi-line document.
        with open(os.path.join(path_to_docs, doc)) as f:
            words_in_document = f.read().split()

        # One pass over the document instead of one list.count() per word.
        counts = Counter(words_in_document)
        row = np.array([counts[word] for word in vocab], dtype=float)

        # Guard against empty documents, which would otherwise produce NaN.
        if words_in_document:
            row /= len(words_in_document)
        tf_rows.append(row)

    if not tf_rows:
        # Keep the result two-dimensional even for an empty directory.
        return np.zeros((0, len(vocab)))
    return np.array(tf_rows)

In [3]:
def idf(inverse_index, n_docs=100):
    """Return the inverse document frequency of every key in the index.

    Args:
        inverse_index: Mapping from word to a sized collection (presumably
            the documents containing that word; confirm against
            ``task3.create_index``).
        n_docs: Total number of documents used as the numerator.

    Returns:
        1-D float ``numpy.ndarray`` holding
        ``log(n_docs / len(inverse_index[word]))`` for each word, in the
        iteration order of ``inverse_index.keys()``.
    """
    scores = [
        math.log(n_docs / len(inverse_index[term]))
        for term in inverse_index.keys()
    ]
    return np.array(scores, dtype=float)

In [4]:
def tf_idf(tf, idf):
    """Combine term frequencies and inverse document frequencies.

    Args:
        tf: 2-D array-like of shape ``(n_docs, n_words)``.
        idf: 1-D array-like of length ``n_words``.

    Returns:
        ``numpy.ndarray`` of shape ``(n_words, n_docs)`` where entry
        ``[w, d]`` equals ``tf[d, w] * idf[w]``.

    Raises:
        ValueError: If ``tf`` is not two-dimensional or ``idf`` does not
            have exactly one value per vocabulary word.
    """
    tf_matrix = np.asarray(tf, dtype=float)
    idf_vector = np.asarray(idf, dtype=float)

    if tf_matrix.ndim != 2:
        raise ValueError(
            f"tf must be 2-D (documents x words), got {tf_matrix.ndim} dimension(s)"
        )
    if idf_vector.shape != (tf_matrix.shape[1],):
        raise ValueError(
            f"idf must have shape ({tf_matrix.shape[1]},), got {idf_vector.shape}"
        )

    # Broadcast the per-word IDF down each row of the transposed TF matrix
    # instead of multiplying element by element in Python.
    weighted = tf_matrix.T * idf_vector[:, np.newaxis]
    return np.ascontiguousarray(weighted)

In [5]:
def to_df(arr, set_of_words):
    """Wrap ``arr`` in a DataFrame whose rows are labelled with words.

    Row ``i`` is renamed to the ``i``-th item of ``set_of_words``; rows
    without a matching word keep their integer label.

    Args:
        arr: Data accepted by ``pandas.DataFrame``.
        set_of_words: Iterable of row labels, in row order.

    Returns:
        A new ``pandas.DataFrame`` with the renamed index.
    """
    position_to_word = dict(enumerate(set_of_words))
    frame = pd.DataFrame(arr)
    return frame.rename(index=position_to_word)


In [6]:
# Build the inverted index; its keys serve as the vocabulary.
inverse_index = create_index("normal_form_docs")
set_of_words = [*inverse_index.keys()]

# Term frequencies: one row per document, one column per vocabulary word.
tf_result = tf("normal_form_docs", set_of_words)

document 0% process
document 10% process
document 20% process
document 30% process
document 40% process
document 50% process
document 60% process
document 70% process
document 80% process
document 90% process


In [7]:
# to_df labels rows, so transpose first: words become rows, documents columns.
tf_df = to_df(np.transpose(tf_result), set_of_words)
tf_df.to_csv("tf.csv")
tf_df.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
научный,0.012976,0.00519,0.007605,0.013043,0.004992,0.00805,0.004975,0.005231,0.004332,0.007684,...,0.00499,0.009471,0.015241,0.004604,0.004965,0.007595,0.003965,0.000387,0.004425,0.004644
библиотека,0.022491,0.00173,0.001901,0.001739,0.001664,0.001789,0.002985,0.001744,0.001444,0.001025,...,0.001996,0.015785,0.018628,0.001842,0.001986,0.001688,0.001586,0.0,0.00177,0.001548
они,0.009516,0.00346,0.003802,0.003478,0.00416,0.004472,0.00398,0.004359,0.004332,0.002561,...,0.003992,0.006314,0.011008,0.004604,0.003972,0.004219,0.005551,0.0,0.00354,0.006192
н,0.009516,0.002595,0.002852,0.004348,0.002496,0.002683,0.002985,0.003487,0.002166,0.002049,...,0.002994,0.007893,0.011008,0.003683,0.002979,0.005907,0.003965,0.00232,0.002655,0.002322
и,0.042388,0.038062,0.036122,0.04087,0.031614,0.030411,0.034826,0.043592,0.042599,0.03125,...,0.03493,0.013418,0.04403,0.043278,0.039722,0.035443,0.036479,0.025909,0.038053,0.035604
лобачевский,0.009516,0.002595,0.002852,0.002609,0.002496,0.002683,0.002985,0.002616,0.002166,0.001537,...,0.002994,0.009471,0.011008,0.002762,0.002979,0.005907,0.002379,0.0,0.002655,0.002322
вспомогательный,0.000865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000847,0.0,0.0,0.0,0.0,0.0,0.0,0.0
подразделение,0.00173,0.000865,0.000951,0.00087,0.000832,0.001789,0.00199,0.000872,0.000722,0.000512,...,0.001996,0.0,0.001693,0.001842,0.000993,0.000844,0.000793,0.0,0.000885,0.000774
казанский,0.00519,0.006055,0.003802,0.015652,0.005824,0.012522,0.004975,0.004359,0.005054,0.002561,...,0.00499,0.003157,0.004234,0.009208,0.004965,0.005063,0.011895,0.003094,0.009735,0.006966
приволжский,0.00173,0.002595,0.000951,0.002609,0.002496,0.008945,0.00199,0.001744,0.002166,0.001025,...,0.001996,0.002368,0.000847,0.001842,0.001986,0.001688,0.005551,0.000773,0.00177,0.004644


In [8]:
# Use the number of documents actually processed rather than relying on the
# hidden default of 100 inside idf().
idf_out = idf(inverse_index, n_docs=len(tf_result))
idf_df = pd.DataFrame.from_dict(
    dict(zip(inverse_index.keys(), idf_out)), orient="index"
)
idf_df.to_csv("idf.csv")
idf_df.head(10)

Unnamed: 0,0
научный,0.051293
библиотека,0.094311
они,0.061875
н,0.020203
и,0.040822
лобачевский,0.083382
вспомогательный,1.89712
подразделение,0.105361
казанский,0.040822
приволжский,0.051293


In [9]:
tf_idf_out = tf_idf(tf_result, idf_out)
tf_idf_df = to_df(tf_idf_out, set_of_words)
# Export the TF-IDF table itself; writing idf_df here would overwrite
# tf_idf.csv with a copy of the IDF values.
tf_idf_df.to_csv("tf_idf.csv")
tf_idf_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
научный,0.000666,0.000266,0.00039,0.000669,0.000256,0.000413,0.000255,0.000268,0.000222,0.000394,...,0.000256,0.000486,0.000782,0.000236,0.000255,0.00039,0.000203,2e-05,0.000227,0.000238
библиотека,0.002121,0.000163,0.000179,0.000164,0.000157,0.000169,0.000282,0.000164,0.000136,9.7e-05,...,0.000188,0.001489,0.001757,0.000174,0.000187,0.000159,0.00015,0.0,0.000167,0.000146
они,0.000589,0.000214,0.000235,0.000215,0.000257,0.000277,0.000246,0.00027,0.000268,0.000158,...,0.000247,0.000391,0.000681,0.000285,0.000246,0.000261,0.000343,0.0,0.000219,0.000383
н,0.000192,5.2e-05,5.8e-05,8.8e-05,5e-05,5.4e-05,6e-05,7e-05,4.4e-05,4.1e-05,...,6e-05,0.000159,0.000222,7.4e-05,6e-05,0.000119,8e-05,4.7e-05,5.4e-05,4.7e-05
и,0.00173,0.001554,0.001475,0.001668,0.001291,0.001241,0.001422,0.00178,0.001739,0.001276,...,0.001426,0.000548,0.001797,0.001767,0.001622,0.001447,0.001489,0.001058,0.001553,0.001453
лобачевский,0.000793,0.000216,0.000238,0.000218,0.000208,0.000224,0.000249,0.000218,0.000181,0.000128,...,0.00025,0.00079,0.000918,0.00023,0.000248,0.000493,0.000198,0.0,0.000221,0.000194
вспомогательный,0.001641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.001606,0.0,0.0,0.0,0.0,0.0,0.0,0.0
подразделение,0.000182,9.1e-05,0.0001,9.2e-05,8.8e-05,0.000188,0.00021,9.2e-05,7.6e-05,5.4e-05,...,0.00021,0.0,0.000178,0.000194,0.000105,8.9e-05,8.4e-05,0.0,9.3e-05,8.2e-05
казанский,0.000212,0.000247,0.000155,0.000639,0.000238,0.000511,0.000203,0.000178,0.000206,0.000105,...,0.000204,0.000129,0.000173,0.000376,0.000203,0.000207,0.000486,0.000126,0.000397,0.000284
приволжский,8.9e-05,0.000133,4.9e-05,0.000134,0.000128,0.000459,0.000102,8.9e-05,0.000111,5.3e-05,...,0.000102,0.000121,4.3e-05,9.4e-05,0.000102,8.7e-05,0.000285,4e-05,9.1e-05,0.000238
