In [1]:
import time
from measure_time import measure_time

import pandas as pd
from ast import literal_eval #used for extracting comments (lists stored as strings)

import re
import pymorphy2
# Shared morphological analyzer instance; preprocess_text uses it to parse every word.
morph = pymorphy2.MorphAnalyzer()
import nltk
# Fetch the NLTK "stopwords" package (a no-op message is printed when it is already up to date).
nltk.download("stopwords")
#--------#
from string import punctuation

from nltk.corpus import stopwords
# Russian stopword list that preprocess_text filters lemmas against.
russian_stopwords = stopwords.words("russian")

def preprocess_text(text):
    """Lemmatize the Cyrillic words of ``text`` and drop function words and stopwords.

    Every maximal run of Cyrillic letters is parsed with the module-level
    ``morph`` analyzer and the first parse is used. Words tagged as pronoun,
    preposition, conjunction, particle, interjection or numeral are skipped,
    as are lemmas found in ``russian_stopwords``.

    Args:
        text (str): Raw text to preprocess.

    Returns:
        tuple[str, int]: The kept lemmas joined by single spaces, and the
        number of kept lemmas.
    """
    # "ё"/"Ё" lie outside the а-я / А-Я ranges, so they are listed explicitly;
    # otherwise a word such as "ещё" is cut into fragments at the "ё".
    # findall (instead of split) also avoids feeding empty strings to the analyzer.
    words = re.findall(r'[а-яА-ЯёЁ]+', text)

    excluded_pos = {'NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'NUMR'}
    stopword_set = set(russian_stopwords)  # O(1) membership instead of a list scan per token

    tokens = []
    for word in words:
        p = morph.parse(word)[0]
        if p.tag.POS in excluded_pos:
            continue
        lemma = p.normal_form
        if not lemma:
            continue
        # Also compare the ё-folded spelling, so a lemma written with "ё" is
        # treated like its "е" spelling in the stopword list.
        # NOTE(review): assumes the list may spell such words with "е" - confirm against NLTK.
        if lemma in stopword_set or lemma.replace('ё', 'е') in stopword_set:
            continue
        tokens.append(lemma)

    company_tokens_text_len = len(tokens)

    return " ".join(tokens), company_tokens_text_len

def preprocess_text_in_list(list_):
    """Run ``preprocess_text`` on each element of ``list_``.

    Returns a new list holding the result for every element, in input order.
    """
    return [preprocess_text(comment) for comment in list_]

def unpack_comments(comments_list_str):
    """Close a stringified list that does not end with a quoted element.

    A string equal to "[]" or ending in ``']`` or ``"]`` is returned unchanged.
    Any other string is cut two characters before its last apostrophe and a
    closing "]" is appended.
    """
    tail = comments_list_str[-2:]
    if tail in ("']", "\"]") or comments_list_str == "[]":
        return comments_list_str

    # Cut point: two characters before the last apostrophe (rfind gives -1 when there is none).
    cut_at = comments_list_str.rfind("'") - 2
    return comments_list_str[:cut_at] + "]"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baltt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os

# os.listdir returns every directory entry in arbitrary order. Keep only the
# Excel workbooks (skipping the "~$..." lock files Excel creates while a workbook
# is open) and sort them, so groups_files[0] and the processing order are stable.
groups_files = sorted(
    name for name in os.listdir('Collected')
    if name.endswith('.xlsx') and not name.startswith('~$')
)

groups_files

['Бонч_основная_all_posts_07_02_24.xlsx',
 'Бонч_подслушано_all_posts_07_02_24.xlsx',
 'Бонч_поступление_all_posts_07_02_24.xlsx',
 'ВШЭ_основная_all_posts_07_02_24.xlsx',
 'ВШЭ_подслушано_all_posts_07_02_24.xlsx',
 'ВШЭ_поступление_all_posts_07_02_24.xlsx',
 'ИТМО_основная_all_posts_07_02_24.xlsx',
 'ИТМО_подслушано_all_posts_07_02_24.xlsx',
 'ИТМО_поступление_all_posts_07_02_24.xlsx',
 'ЛЭТИ_основная_all_posts_07_02_24.xlsx',
 'ЛЭТИ_подслушано_all_posts_07_02_24.xlsx',
 'ЛЭТИ_постулпения_all_posts_07_02_24.xlsx',
 'Политех_основная_all_posts_07_02_24.xlsx',
 'Политех_подслушано_all_posts_07_02_24.xlsx',
 'Политех_поступление_all_posts_07_02_24.xlsx']

In [None]:
import pandas as pd

# Years to keep, as strings, because the date column is matched on its first four characters.
years_included = list(map(str, range(2019, 2024)))

group_df = pd.read_excel(f'Collected/{groups_files[0]}')
# Assign the filtered frame: the bare expression used before threw the result away.
group_df = group_df[group_df['Дата публикации и время поста'].str[:4].isin(years_included)]

# The post text column is named 'Текст поста' (with a space), as used by the
# processing loop; the previous underscore-separated names raised a KeyError.
text_group_df = group_df[['Текст поста']]

group_df

Unnamed: 0.1,Unnamed: 0,Текст поста,Дата публикации и время поста,Число лайков,Число комментариев,Число просмотров
99,99,"2024 год на пороге! С праздником, «Бонч»! 🧡\n\...",2023-12-31 09:16:00,119,3,3861
100,100,Как «Бонч» готовится к встрече Нового года? Уз...,2023-12-30 17:37:00,315,13,8165
101,101,"Чудеса случаются там, где их ждут ✨\n\n[club41...",2023-12-29 13:05:00,120,5,8750
102,102,К 100-летию телевидения: как все начиналось 📺\...,2023-12-29 11:21:00,19,0,2176
103,103,Режим работы в новогодние праздники ❄\n\nДорог...,2023-12-29 10:05:00,20,0,2870
...,...,...,...,...,...,...
4892,4892,БЛАГОДАРНОСТЬ В АДРЕС СПбГУТ \n\nВ адрес Санкт...,2019-01-10 11:58:30,4,0,541
4893,4893,КУБОК РОССИИ ПО РАДИОСПОРТУ\n\nКубок России по...,2019-01-10 10:47:43,14,0,748
4894,4894,О СТОИМОСТИ ПРОЖИВАНИЯ В ОБЩЕЖИТИЯХ СПбГУТ \n\...,2019-01-09 14:41:31,3,0,1896
4895,4895,"ДРУЗЬЯ, ПОЗДРАВЛЯЕМ ВАС С НАСТУПИВШИМИ ПРАЗДНИ...",2019-01-09 13:13:47,5,0,561


In [28]:
import pandas as pd
from ast import literal_eval #used for extracting comments (lists stored as strings)

os.makedirs('Preprocessed', exist_ok = True)

# Years to keep, as strings, because the date column is matched on its first four characters.
years_included = list(map(str, range(2019, 2024)))

for group in groups_files:
    st = time.time()
    print(f'Working on {group}')
    group_df = pd.read_excel(f'Collected/{group}')

    # Keep only the posts published in the selected years.
    group_df = group_df[group_df['Дата публикации и время поста'].str[:4].isin(years_included)]

    text_group_df = group_df[['Текст поста']]

    # preprocess_text returns a (tokens, length) tuple for each post.
    text_group_df_preprocessed = text_group_df['Текст поста'].astype(str).apply(preprocess_text)
    # Reuse the filtered frame's index. With a fresh 0..n-1 index the index-aligned
    # inner concat below paired tokens with the wrong posts and dropped every row
    # whose label was not present in both frames.
    text_group_df_preprocessed = pd.DataFrame(
        text_group_df_preprocessed.tolist(),
        columns = ['text_tokens', 'text_length'],
        index = group_df.index,
    )
    prepr_df = pd.concat([group_df, text_group_df_preprocessed], axis=1, join='inner')

    # splitext strips the extension whatever its length (group[:-5] assumed ".xlsx").
    prepr_df[['text_tokens', 'text_length']].to_csv(f'Preprocessed/{os.path.splitext(group)[0]}.csv')
    measure_time(st)

Working on Бонч_основная_all_posts_07_02_24.xlsx
Execution time: 0 hours 2 minutes 17 seconds
Working on Бонч_подслушано_all_posts_07_02_24.xlsx
Execution time: 0 hours 0 minutes 23 seconds
Working on Бонч_поступление_all_posts_07_02_24.xlsx
Execution time: 0 hours 0 minutes 34 seconds
Working on ВШЭ_основная_all_posts_07_02_24.xlsx
Execution time: 0 hours 1 minutes 25 seconds
Working on ВШЭ_подслушано_all_posts_07_02_24.xlsx
Execution time: 0 hours 2 minutes 48 seconds
Working on ВШЭ_поступление_all_posts_07_02_24.xlsx
Execution time: 0 hours 0 minutes 28 seconds
Working on ИТМО_основная_all_posts_07_02_24.xlsx
Execution time: 0 hours 0 minutes 54 seconds
Working on ИТМО_подслушано_all_posts_07_02_24.xlsx
Execution time: 0 hours 1 minutes 31 seconds
Working on ИТМО_поступление_all_posts_07_02_24.xlsx
Execution time: 0 hours 0 minutes 17 seconds
Working on ЛЭТИ_основная_all_posts_07_02_24.xlsx
Execution time: 0 hours 1 minutes 47 seconds
Working on ЛЭТИ_подслушано_all_posts_07_02_24.xl