In [1]:
import math
import os
from collections import Counter

import numpy as np
import pandas as pd

from task3 import create_index

In [2]:
def tf(path_to_docs, vocab, n_docs=100):
    """Build a term-frequency matrix for the numbered documents in a directory.

    Documents are read from ``<path_to_docs>/0.txt`` ... ``<n_docs - 1>.txt``.
    Only the first line of each file is read and split on whitespace.
    NOTE(review): this presumably relies on every document being stored on a
    single line -- confirm against whatever writes these files.

    Args:
        path_to_docs: Directory that contains the ``<i>.txt`` files.
        vocab: Sized iterable of words; its iteration order defines the
            column order of the result.
        n_docs: Number of documents to process, starting from ``0.txt``.

    Returns:
        ``np.ndarray`` of shape ``(n_docs, len(vocab))``. Entry ``[d, w]`` is
        the number of occurrences of ``vocab[w]`` in document ``d`` divided by
        the number of tokens in that document. A document with no tokens
        produces a row of zeros (instead of NaN from a division by zero).

    Raises:
        OSError: If one of the expected document files cannot be opened.
    """
    vocab = list(vocab)
    tf_matrix = np.zeros((max(n_docs, 0), len(vocab)))

    for document_number in range(n_docs):
        if document_number % 10 == 0:
            # The printed figure is a true percentage only when n_docs == 100.
            print(f"document {(document_number // 10) * 10}% process")

        with open(os.path.join(path_to_docs, f"{document_number}.txt")) as f:
            words_in_document = f.readline().split()

        if not words_in_document:
            # Nothing to count; keep the zero row and avoid dividing by zero.
            continue

        # One pass over the document instead of one list.count() per vocab word.
        counts = Counter(words_in_document)
        row = np.array([counts[word] for word in vocab], dtype=float)
        tf_matrix[document_number] = row / len(words_in_document)

    return tf_matrix

In [3]:
def idf(inverse_index, n_docs=100):
    """Compute ``ln(n_docs / len(entry))`` for every term of an inverted index.

    Args:
        inverse_index: Mapping from a term to a sized collection. The length
            of that collection is used as the term's document frequency
            (presumably the documents containing the term -- verify against
            ``create_index``).
        n_docs: Total number of documents in the corpus.

    Returns:
        1-D float ``np.ndarray`` with one value per term, in the mapping's
        iteration order.

    Raises:
        ZeroDivisionError: If any term maps to an empty collection.
    """
    return np.array(
        [math.log(n_docs / len(postings)) for postings in inverse_index.values()],
        dtype=float,
    )

In [4]:
def tf_idf(tf, idf):
    """Combine a term-frequency matrix with per-word IDF weights.

    Args:
        tf: 2-D array-like of shape ``(n_docs, n_words)``.
        idf: 1-D array-like with at least ``n_words`` entries; entry ``w`` is
            the weight applied to word ``w``. Extra trailing entries are
            ignored.

    Returns:
        ``np.ndarray`` of shape ``(n_words, n_docs)`` (transposed relative to
        ``tf``) where entry ``[w, d]`` equals ``tf[d, w] * idf[w]``.

    Raises:
        IndexError: If ``idf`` has fewer entries than ``tf`` has columns.
    """
    tf = np.asarray(tf, dtype=float)
    idf = np.asarray(idf, dtype=float)

    n_words = tf.shape[1]
    if idf.shape[0] < n_words:
        raise IndexError(
            f"idf has {idf.shape[0]} entries but tf has {n_words} word columns"
        )

    # Broadcast one weight per row (word) across all document columns; this
    # replaces a Python-level loop over every (word, document) cell.
    return tf.T * idf[:n_words, np.newaxis]

In [5]:
def to_df(arr, set_of_words):
    """Wrap ``arr`` in a DataFrame whose row labels are the given words.

    Row ``i`` is relabelled to the ``i``-th item yielded by ``set_of_words``;
    rows without a corresponding word keep their integer label. Column labels
    are left as the default integer range.

    Args:
        arr: 2-D array-like accepted by ``pd.DataFrame``.
        set_of_words: Iterable of row labels, in row order.

    Returns:
        A new ``pd.DataFrame`` with the relabelled index.
    """
    position_to_word = dict(enumerate(set_of_words))
    return pd.DataFrame(arr).rename(index=position_to_word)

In [6]:
# Single source of truth for the input directory: the inverted index and the
# TF matrix must be built from the same set of documents.
DOCS_DIR = "normal_form_docs"

inverse_index = create_index(DOCS_DIR)

# Freeze the vocabulary order once; it defines the word order of every table
# produced further down in the notebook.
set_of_words = list(inverse_index)

tf_result = tf(DOCS_DIR, set_of_words)

document 0% process
document 10% process
document 20% process
document 30% process
document 40% process
document 50% process
document 60% process
document 70% process
document 80% process
document 90% process


In [7]:
# tf_result has one row per document; transpose it so that rows are words
# and columns are documents before labelling the rows.
tf_df = to_df(tf_result.T, set_of_words)
# The full table goes to disk; only the first 10 rows are displayed.
tf_df.to_csv("tf.csv")
tf_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
казанский,0.002743,0.00519,0.007666,0.017625,0.012522,0.004197,0.003186,0.004112,0.008921,0.006966,...,0.004156,0.004288,0.004359,0.003408,0.004587,0.004284,0.004333,0.00446,0.004248,0.004322
приволжский,0.001372,0.00173,0.002555,0.003065,0.008945,0.002398,0.001593,0.001371,0.001784,0.004644,...,0.001663,0.001715,0.001744,0.001363,0.001835,0.001714,0.001733,0.001784,0.001699,0.001729
федеральный,0.002058,0.004325,0.005963,0.007663,0.018784,0.003597,0.003717,0.003427,0.007136,0.008514,...,0.003325,0.004288,0.004359,0.003408,0.004587,0.003428,0.003466,0.00446,0.004248,0.004322
университет,0.009602,0.012111,0.016184,0.034483,0.022361,0.009592,0.006904,0.010966,0.020517,0.012384,...,0.009975,0.010292,0.010462,0.00818,0.011009,0.010283,0.010399,0.010705,0.010195,0.010372
официальный,0.000686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001548,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
сайт,0.000686,0.00346,0.001704,0.001533,0.001789,0.001199,0.001062,0.001371,0.001784,0.003096,...,0.002494,0.002573,0.002616,0.002045,0.002752,0.002571,0.0026,0.002676,0.002549,0.002593
2022,0.004801,0.011246,0.000852,0.000766,0.000894,0.0006,0.000531,0.002742,0.007136,0.000774,...,0.000831,0.000858,0.000872,0.002727,0.001835,0.000857,0.003466,0.000892,0.00085,0.000864
год,0.006859,0.007785,0.003407,0.017625,0.016995,0.002398,0.002124,0.006854,0.008029,0.003096,...,0.009975,0.013722,0.011334,0.008862,0.010092,0.01114,0.013865,0.008921,0.010195,0.011236
культурный,0.003429,0.00173,0.000852,0.001533,0.000894,0.0006,0.000531,0.000685,0.000892,0.000774,...,0.000831,0.000858,0.000872,0.000682,0.000917,0.000857,0.000867,0.000892,0.00085,0.000864
наследие,0.002743,0.00173,0.000852,0.001533,0.000894,0.0006,0.000531,0.000685,0.000892,0.000774,...,0.000831,0.000858,0.000872,0.000682,0.000917,0.000857,0.000867,0.000892,0.00085,0.000864


In [8]:
idf_out = idf(inverse_index)
# Pair every term with its IDF value; orient='index' puts the terms on the rows.
idf_df = pd.DataFrame.from_dict(dict(zip(inverse_index, idf_out)), orient='index')
idf_df.to_csv("idf.csv")
idf_df.head(10)

Unnamed: 0,0
казанский,0.040822
приволжский,0.051293
федеральный,0.040822
университет,0.040822
официальный,2.207275
сайт,0.083382
2022,0.072571
год,0.051293
культурный,0.083382
наследие,0.072571


In [9]:
# tf_idf() returns a words x documents matrix, so no transpose is needed
# before labelling the rows with the vocabulary.
tf_idf_out = tf_idf(tf_result, idf_out)
tf_idf_df = to_df(tf_idf_out, set_of_words)
# The full table goes to disk; only the first 10 rows are displayed.
tf_idf_df.to_csv("tf_idf.csv")
tf_idf_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
казанский,0.000112,0.000212,0.000313,0.000719,0.000511,0.000171,0.00013,0.000168,0.000364,0.000284,...,0.00017,0.000175,0.000178,0.000139,0.000187,0.000175,0.000177,0.000182,0.000173,0.000176
приволжский,7e-05,8.9e-05,0.000131,0.000157,0.000459,0.000123,8.2e-05,7e-05,9.2e-05,0.000238,...,8.5e-05,8.8e-05,8.9e-05,7e-05,9.4e-05,8.8e-05,8.9e-05,9.2e-05,8.7e-05,8.9e-05
федеральный,8.4e-05,0.000177,0.000243,0.000313,0.000767,0.000147,0.000152,0.00014,0.000291,0.000348,...,0.000136,0.000175,0.000178,0.000139,0.000187,0.00014,0.000141,0.000182,0.000173,0.000176
университет,0.000392,0.000494,0.000661,0.001408,0.000913,0.000392,0.000282,0.000448,0.000838,0.000506,...,0.000407,0.00042,0.000427,0.000334,0.000449,0.00042,0.000424,0.000437,0.000416,0.000423
официальный,0.001514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003417,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
сайт,5.7e-05,0.000289,0.000142,0.000128,0.000149,0.0001,8.9e-05,0.000114,0.000149,0.000258,...,0.000208,0.000215,0.000218,0.000171,0.000229,0.000214,0.000217,0.000223,0.000213,0.000216
2022,0.000348,0.000816,6.2e-05,5.6e-05,6.5e-05,4.4e-05,3.9e-05,0.000199,0.000518,5.6e-05,...,6e-05,6.2e-05,6.3e-05,0.000198,0.000133,6.2e-05,0.000252,6.5e-05,6.2e-05,6.3e-05
год,0.000352,0.000399,0.000175,0.000904,0.000872,0.000123,0.000109,0.000352,0.000412,0.000159,...,0.000512,0.000704,0.000581,0.000455,0.000518,0.000571,0.000711,0.000458,0.000523,0.000576
культурный,0.000286,0.000144,7.1e-05,0.000128,7.5e-05,5e-05,4.4e-05,5.7e-05,7.4e-05,6.5e-05,...,6.9e-05,7.2e-05,7.3e-05,5.7e-05,7.6e-05,7.1e-05,7.2e-05,7.4e-05,7.1e-05,7.2e-05
наследие,0.000199,0.000126,6.2e-05,0.000111,6.5e-05,4.4e-05,3.9e-05,5e-05,6.5e-05,5.6e-05,...,6e-05,6.2e-05,6.3e-05,4.9e-05,6.7e-05,6.2e-05,6.3e-05,6.5e-05,6.2e-05,6.3e-05
