In [29]:
from google.colab import drive
drive.mount('/content/drive')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import json
import glob

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1. Загрузка данных из текстовых файлов, находящихся в каталоге docs

In [30]:
file_list = glob.glob('/'.join(['drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs', '*']))
docs = []
for file_name in file_list:
    print(file_name)

    f = open(file_name)
    result = f.read()
    docs.append(result)

drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Douglas Adams - Hitchhikers Trilogy - Hitchhikers Guide to the Galaxy.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Douglas Adams - Hitchhikers Trilogy - So Long, and Thanks for All the Fish.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Douglas Adams - Hitchhikers Trilogy - Restaurant End of the Universe.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Jane Austen - Northanger Abbey.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Douglas Adams - The Long Dark Tea Time of the Soul.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Douglas Adams - Hitchhikers Trilogy - Mostly Harmless.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Jane Austen - Pride and Prejudice.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Jane Austen - Persuasion.txt
drive/MyDrive/Colab Notebooks/Internship/lesson_13/docs/Edwin Arnold - Guliver of Mars.txt
drive/MyDrive/Colab 

### 2. Вычисление статистической меры словарной частоты (Tf-idf) для всей выборки из текстовых документов

In [31]:
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
count = vectorizer.fit_transform(docs)
tfidf_matrix = transformer.fit_transform(count)
print(tfidf_matrix.toarray())

[[0.         0.00038137 0.         ... 0.         0.0019267  0.00054596]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.01541767 0.         ... 0.         0.         0.        ]]


### 3. Возвращение словаря признаков (слов) с их индексами, упорядочивание его по возрастанию индекса и сохранение в JSON-файл

In [32]:
bag = vectorizer.fit_transform(docs)
print(vectorizer.vocabulary_)
print(bag.toarray())

[[0 1 0 ... 0 3 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 4 0 ... 0 0 0]]


In [33]:
# упорядочивание словаря признаков по возрастанию индекса
sorted_tuple = sorted(vectorizer.vocabulary_.items(), key=lambda i: i[1])
sorted_vocabulary = dict(sorted_tuple)
print(sorted_vocabulary)



In [34]:
# сохранение словаря в JSON-файл
with open('drive/MyDrive/Colab Notebooks/Internship/lesson_13/sorted_vocabulary.json', 'w') as my_file:
    json.dump(sorted_vocabulary, my_file, indent=4)

### 4. Создание Pandas датафрейма, индексы в котором – слова из сформированного словаря, а значения – соответствующие элементы Tf-idf-матрицы


In [35]:
# транспонирование и уплотнение матрицы
new_matrix = tfidf_matrix.T.todense()
print(new_matrix)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.00038137 0.         0.         ... 0.         0.         0.01541767]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.0019267  0.         0.         ... 0.         0.         0.        ]
 [0.00054596 0.         0.         ... 0.         0.         0.        ]]


In [36]:
# создание векторизатора словаря
dictvectorizer = DictVectorizer(sparse=False)

# конвертирование словаря в матрицу признаков
features = dictvectorizer.fit_transform(sorted_vocabulary)
feature_names = dictvectorizer.get_feature_names_out()
name_columns = ['doc_{}'.format(i) for i in range(1, 11)]

# итоговый Pandas датафрейм
pd.DataFrame(new_matrix, index = feature_names, columns = name_columns)

Unnamed: 0,doc_1,doc_2,doc_3,doc_4,doc_5,doc_6,doc_7,doc_8,doc_9,doc_10
00,0.000000,0.0,0.000000,0.0,0.000380,0.000000,0.000000,0.0,0.0,0.000000
000,0.000381,0.0,0.000000,0.0,0.000226,0.000536,0.001311,0.0,0.0,0.015418
004,0.000000,0.0,0.000000,0.0,0.000000,0.000451,0.000000,0.0,0.0,0.000000
03758,0.000642,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
040700,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.006491
...,...,...,...,...,...,...,...,...,...,...
zoom,0.000546,0.0,0.000000,0.0,0.000000,0.000384,0.000000,0.0,0.0,0.000000
zowee,0.000000,0.0,0.000535,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
zwingler,0.000000,0.0,0.000000,0.0,0.000000,0.000902,0.000000,0.0,0.0,0.000000
zz,0.001927,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000
