In [1]:
import time
from measure_time import measure_time

import pandas as pd
from ast import literal_eval #used for extracting comments (lists stored as strings)

import re
import pymorphy2
# Shared morphological analyzer; preprocess_text() uses it to get each word's
# normal form and part-of-speech tag.
morph = pymorphy2.MorphAnalyzer()
import nltk
# Make sure the NLTK stop-word corpus is available locally before it is read below.
nltk.download("stopwords")
#--------#
from string import punctuation

from nltk.corpus import stopwords
# Russian stop-word list that preprocess_text() filters tokens against.
russian_stopwords = stopwords.words("russian")

# Part-of-speech tags (as reported by the analyzer's ``tag.POS``) whose words are dropped.
SKIPPED_POS = frozenset({'NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'NUMR'})

# A run of characters that are not Russian letters. 'ё' and 'Ё' are listed
# explicitly: they lie outside the contiguous 'а-я' / 'А-Я' code-point ranges,
# so a class without them would treat 'ё' as a separator and cut words apart.
NON_RUSSIAN_LETTERS_RE = re.compile(r'[^а-яА-ЯёЁ]+')


def preprocess_text(text):
    """Lemmatise the Russian words of ``text`` and drop function words.

    The text is split on every run of non-Russian-letter characters, each
    remaining word is reduced to its normal form using the first parse
    returned by ``morph``, and words are discarded when their part of speech
    is in ``SKIPPED_POS`` or their normal form is in ``russian_stopwords``.

    Args:
        text (str): raw text to process.

    Returns:
        tuple[str, int]: the kept normal forms joined by single spaces, and
        the number of kept tokens.
    """
    tokens = []
    for word in NON_RUSSIAN_LETTERS_RE.split(text):
        # Leading/trailing separators still produce empty fragments; skip them
        # instead of sending them through the morphological analyzer.
        if not word:
            continue
        p = morph.parse(word)[0]
        if p.tag.POS in SKIPPED_POS:
            continue
        lemma = p.normal_form
        if not lemma:
            continue
        # The stop-word list appears to use 'е' spellings (e.g. 'еще') --
        # TODO confirm; checking the folded form as well is harmless otherwise.
        if lemma in russian_stopwords or lemma.replace('ё', 'е') in russian_stopwords:
            continue
        tokens.append(lemma)

    return " ".join(tokens), len(tokens)

def preprocess_text_in_list(list_):
    """Run ``preprocess_text`` over every comment in ``list_``.

    Returns a new list holding, in input order, the value that
    ``preprocess_text`` returned for each comment.
    """
    return [preprocess_text(comment) for comment in list_]

def unpack_comments(comments_list_str):
    """Make an unterminated list-of-strings literal parseable again.

    A string that is ``"[]"`` or ends with a quote followed by ``]`` is
    returned unchanged. Anything else is treated as cut off inside its last
    element (presumably by a spreadsheet cell-size limit -- TODO confirm):
    the unfinished element is dropped and the list is closed with ``]``.

    The heuristic assumes the last apostrophe opens the unfinished element
    and is preceded by ``", "``, so the cut is placed two characters before it.

    Args:
        comments_list_str (str): textual form of a Python list of strings.

    Returns:
        str: a list literal; ``"[]"`` when no complete element can be kept.
    """
    tail = comments_list_str[-2:]
    if tail in ("']", '"]') or comments_list_str == "[]":
        return comments_list_str

    last_ap_ind = comments_list_str.rfind("'")
    cut = last_ap_ind - 2
    # With no apostrophe (rfind == -1) or one at the very start, the cut point
    # is zero or negative; slicing there would wrap around from the end of the
    # string or drop the opening bracket. Nothing complete is left to keep.
    if cut < 1:
        return "[]"
    return comments_list_str[:cut] + "]"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baltt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import os

# Keep only real '.xlsx' workbooks: every entry is later read with
# pd.read_excel and has a 5-character extension stripped from its name.
# Names starting with '~$' are the lock files Excel leaves next to an open
# workbook. os.listdir gives no ordering guarantee, so the result is sorted
# to make the processing order reproducible.
groups_files = sorted(
    file_name
    for file_name in os.listdir('Collected')
    if file_name.lower().endswith('.xlsx') and not file_name.startswith('~$')
)

groups_files

['Бонч_мемы_all_posts_31_07_2023.xlsx',
 'Бонч_основная_all_posts_31_07_2023.xlsx',
 'Бонч_подслушано_all_posts_31_07_2023.xlsx',
 'Бонч_поступление_all_posts_31_07_2023.xlsx',
 'ВШЭ_мемы_all_posts_31_07_2023.xlsx',
 'ВШЭ_основная_all_posts_31_07_2023.xlsx',
 'ВШЭ_поступление_all_posts_31_07_2023.xlsx',
 'ИТМО_мемы_all_posts_31_07_2023.xlsx',
 'ИТМО_основная_all_posts_31_07_2023.xlsx',
 'ИТМО_подслушано_all_posts_31_07_2023.xlsx',
 'ИТМО_поступление_all_posts_31_07_2023 — копия.xlsx',
 'Мечникова_мемы_all_posts_31_07_2023.xlsx',
 'Мечникова_основная_all_posts_31_07_2023.xlsx',
 'Мечникова_подслушано_all_posts_31_07_2023.xlsx',
 'Мечникова_профком_all_posts_31_07_2023.xlsx',
 'Политех_мемы_all_posts_31_07_2023.xlsx',
 'Политех_основная_all_posts_31_07_2023.xlsx',
 'Политех_подслушано_all_posts_31_07_2023.xlsx',
 'Политех_поступление_all_posts_31_07_2023.xlsx']

In [4]:
import pandas as pd
from ast import literal_eval #used for extracting comments (lists stored as strings)

# Preprocess every collected workbook and store the result as a CSV with the
# same base name inside 'Preprocessed/'.
os.makedirs('Preprocessed', exist_ok=True)

for group in groups_files:
    st = time.time()  # start stamp handed to measure_time() once the file is done
    print(f'Working on {group}')

    group_df = pd.read_excel(f'Collected/{group}')
    text_group_df = group_df[['Текст_поста', "Текст_комментариев"]]

    # Posts: preprocess_text returns a (joined tokens, token count) pair per
    # row; the pairs are spread over two new columns appended to the sheet.
    post_pairs = text_group_df['Текст_поста'].astype(str).apply(preprocess_text)
    text_group_df_preprocessed = pd.DataFrame(
        list(post_pairs),
        columns=['text_tokens', 'text_length'],
    )
    prepr_df = pd.concat(
        [group_df, text_group_df_preprocessed],
        axis=1,
        join='inner',
    )

    # Comments: each cell holds a list stored as a string, so the string is
    # passed through unpack_comments, parsed, and then preprocessed per comment.
    comment_strings = text_group_df["Текст_комментариев"].apply(unpack_comments)
    comment_lists = comment_strings.apply(literal_eval)
    prepr_df['preprocessed_comments'] = comment_lists.apply(preprocess_text_in_list)

    # group[:-5] drops the last five characters of the file name ('.xlsx').
    prepr_df.to_csv(f'Preprocessed/{group[:-5]}.csv')
    measure_time(st)

Working on Бонч_мемы_all_posts_31_07_2023.xlsx
Execution time: 0 hours 0 minutes 2 seconds
Working on Бонч_основная_all_posts_31_07_2023.xlsx
Execution time: 0 hours 1 minutes 56 seconds
Working on Бонч_подслушано_all_posts_31_07_2023.xlsx
Execution time: 0 hours 7 minutes 27 seconds
Working on Бонч_поступление_all_posts_31_07_2023.xlsx
Execution time: 0 hours 1 minutes 7 seconds
Working on ВШЭ_мемы_all_posts_31_07_2023.xlsx
Execution time: 0 hours 1 minutes 15 seconds
Working on ВШЭ_основная_all_posts_31_07_2023.xlsx
Execution time: 0 hours 4 minutes 35 seconds
Working on ВШЭ_поступление_all_posts_31_07_2023.xlsx
Execution time: 0 hours 3 minutes 42 seconds
Working on ИТМО_мемы_all_posts_31_07_2023.xlsx
Execution time: 0 hours 0 minutes 11 seconds
Working on ИТМО_основная_all_posts_31_07_2023.xlsx
Execution time: 0 hours 3 minutes 40 seconds
Working on ИТМО_подслушано_all_posts_31_07_2023.xlsx
Execution time: 0 hours 9 minutes 44 seconds
Working on ИТМО_поступление_all_posts_31_07_202