In [1]:
import time
from measure_time import measure_time

import pandas as pd
from ast import literal_eval #used for extracting comments (lists stored as strings)

import re
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
import nltk
nltk.download("stopwords")
#--------#
from string import punctuation

from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

def preprocess_text(text):
    words = re.split(r'[^–∞-—è–ê-–Ø]', text) # —Ä–∞–∑–±–∏–≤–∞–µ–º —Ç–µ–∫—Å—Ç –Ω–∞ —Å–ª–æ–≤–∞
    tokens = list()
    for word in words:
        p = morph.parse(word)[0]
        if p.tag.POS not in ['NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ', 'NUMR']:
            tokens.append(p.normal_form)
    tokens = [token for token in tokens if token not in russian_stopwords\
                and token != " " \
                and token.strip() not in punctuation]
    company_tokens_text_len = len(tokens)

    return " ".join(tokens), company_tokens_text_len

def preprocess_text_in_list(list_):
    prepr_list = []
    for comment in list_:
        prepr_comment = preprocess_text(comment)
        prepr_list.append(prepr_comment)
    return prepr_list

def unpack_comments(comments_list_str):
    if comments_list_str[-2:] != "']" and comments_list_str[-2:] != "\"]" and comments_list_str != "[]":
        last_ap_ind = comments_list_str.rfind("'")
        comments_list_str = comments_list_str[:last_ap_ind-2] + "]"
    return comments_list_str

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\baltt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os

groups_files = os.listdir('Collected')

groups_files

['–ë–æ–Ω—á_–º–µ–º—ã_all_posts_31_07_2023.xlsx',
 '–ë–æ–Ω—á_–æ—Å–Ω–æ–≤–Ω–∞—è_all_posts_31_07_2023.xlsx',
 '–ë–æ–Ω—á_–ø–æ–¥—Å–ª—É—à–∞–Ω–æ_all_posts_31_07_2023.xlsx',
 '–ë–æ–Ω—á_–ø–æ—Å—Ç—É–ø–ª–µ–Ω–∏–µ_all_posts_31_07_2023.xlsx',
 '–í–®–≠_–º–µ–º—ã_all_posts_31_07_2023.xlsx',
 '–í–®–≠_–æ—Å–Ω–æ–≤–Ω–∞—è_all_posts_31_07_2023.xlsx',
 '–í–®–≠_–ø–æ–¥—Å–ª—É—à–∞–Ω–æ.xlsx',
 '–í–®–≠_–ø–æ—Å—Ç—É–ø–ª–µ–Ω–∏–µ_all_posts_31_07_2023.xlsx',
 '–ì–£–ê–ü_–º–µ–º—ã_all_posts_31_07_2023.xlsx',
 '–ì–£–ê–ü_–æ—Å–Ω_all_posts_31_07_2023.xlsx',
 '–ì–£–ê–ü_–ø–æ–¥—Å–ª—É—à–∞–Ω–æ_all_posts_31_07_2023.xlsx',
 '–ì–£–ê–ü_–ø–æ—Å—Ç—É–ø_all_posts_31_07_2023.xlsx',
 '–ò–¢–ú–û_–º–µ–º—ã_all_posts_31_07_2023.xlsx',
 '–ò–¢–ú–û_–æ—Å–Ω–æ–≤–Ω–∞—è_all_posts_31_07_2023.xlsx',
 '–ò–¢–ú–û_–ø–æ–¥—Å–ª—É—à–∞–Ω–æ_all_posts_31_07_2023.xlsx',
 '–ò–¢–ú–û_–ø–æ—Å—Ç—É–ø–ª–µ–Ω–∏–µ_all_posts_31_07_2023 ‚Äî –∫–æ–ø–∏—è.xlsx',
 '–õ–≠–¢–ò_–º–µ–º—ã_all_posts_31_07_2023.xlsx',
 '–õ–≠–¢–ò_–æ—Å–Ω–æ–≤–Ω–∞—è_all_posts_31_07_2023.xlsx',
 '–õ–≠–¢–ò_–ø–æ–¥—Å–ª—É—à–

In [3]:
import pandas as pd
from ast import literal_eval #used for extracting comments (lists stored as strings)

os.makedirs('Preprocessed', exist_ok = True)
for group in groups_files:
    st = time.time()
    print(f'Working on {group}')
    group_df = pd.read_excel(f'Collected/{group}')

    text_group_df = group_df[['–¢–µ–∫—Å—Ç_–ø–æ—Å—Ç–∞', "–¢–µ–∫—Å—Ç_–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤"]]

    text_group_df_preprocessed = text_group_df['–¢–µ–∫—Å—Ç_–ø–æ—Å—Ç–∞'].astype(str).apply(preprocess_text)
    text_group_df_preprocessed = pd.DataFrame(list(text_group_df_preprocessed), columns = ['text_tokens', 'text_length'])
    prepr_df = pd.concat([group_df, text_group_df_preprocessed], axis=1, join='inner')

    prepr_df['preprocessed_comments'] = text_group_df["–¢–µ–∫—Å—Ç_–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤"].apply(unpack_comments).apply(literal_eval).apply(preprocess_text_in_list)

    prepr_df.to_csv(f'Preprocessed/{group[:-5]}.csv')
    measure_time(st)

Working on –ë–æ–Ω—á_–º–µ–º—ã_all_posts_31_07_2023.xlsx
Execution time: 0 hours 0 minutes 7 seconds
Working on –ë–æ–Ω—á_–æ—Å–Ω–æ–≤–Ω–∞—è_all_posts_31_07_2023.xlsx
Execution time: 0 hours 5 minutes 8 seconds
Working on –ë–æ–Ω—á_–ø–æ–¥—Å–ª—É—à–∞–Ω–æ_all_posts_31_07_2023.xlsx
Execution time: 0 hours 14 minutes 33 seconds
Working on –ë–æ–Ω—á_–ø–æ—Å—Ç—É–ø–ª–µ–Ω–∏–µ_all_posts_31_07_2023.xlsx
Execution time: 0 hours 1 minutes 16 seconds
Working on –í–®–≠_–º–µ–º—ã_all_posts_31_07_2023.xlsx
Execution time: 0 hours 1 minutes 24 seconds
Working on –í–®–≠_–æ—Å–Ω–æ–≤–Ω–∞—è_all_posts_31_07_2023.xlsx
Execution time: 0 hours 4 minutes 57 seconds
Working on –í–®–≠_–ø–æ–¥—Å–ª—É—à–∞–Ω–æ.xlsx
Execution time: 0 hours 20 minutes 26 seconds
Working on –í–®–≠_–ø–æ—Å—Ç—É–ø–ª–µ–Ω–∏–µ_all_posts_31_07_2023.xlsx
Execution time: 0 hours 3 minutes 47 seconds
Working on –ì–£–ê–ü_–º–µ–º—ã_all_posts_31_07_2023.xlsx
Execution time: 0 hours 0 minutes 7 seconds
Working on –ì–£–ê–ü_–æ—Å–Ω_all_posts_31_07_2023.xlsx
Execution

In [12]:
prepr_groups_files = os.listdir('Preprocessed')

for group in prepr_groups_files:
    group_df = pd.read_csv(f'Preprocessed/{group}')
    group_df = group_df[['id_–ø–æ—Å—Ç–∞', 'text_tokens', 'text_length', 'preprocessed_comments']] #store only new comments to reduce memory consumption
    group_df.to_csv(f'Preprocessed/{group[:-4]}.csv')

In [14]:
group_df

Unnamed: 0,id_–ø–æ—Å—Ç–∞,–¢–µ–∫—Å—Ç_–ø–æ—Å—Ç–∞,–î–∞—Ç–∞_–ø—É–±–ª–∏–∫–∞—Ü–∏–∏_–ø–æ—Å—Ç–∞,–ß–∏—Å–ª–æ_–ª–∞–π–∫–æ–≤,–ß–∏—Å–ª–æ_–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤,–ß–∏—Å–ª–æ_–ø—Ä–æ—Å–º–æ—Ç—Ä–æ–≤,URL_photo,–¢–µ–∫—Å—Ç_–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–µ–≤,text_tokens,text_length,preprocessed_comments
0,52701,–°–æ—Ö—Ä–∞–Ω–∏ —Å–µ–±–µ –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ!\n\n–í —Å—Ç–∞—Ç—å–µ —Ç—ã –Ω–∞–π–¥–µ...,2023-07-05 13:30:00,37,14,16682,-----,['üß°–ö–æ–Ω—Ç–∞–∫—Ç—ã –ø—Ä–∏–µ–º–Ω–æ–π –∫–æ–º–∏—Å—Å–∏–∏ –ì–ò: \nüìû–¢–µ–ª–µ—Ñ–æ–Ω: ...,—Å–æ—Ö—Ä–∞–Ω–∏—Ç—å –æ–±—è–∑–∞—Ç–µ–ª—å–Ω–æ —Å—Ç–∞—Ç—å—è –Ω–∞–π—Ç–∏ —Å–≤—è–∑–∞—Ç—å—Å—è –ø...,20,[('–∫–æ–Ω—Ç–∞–∫—Ç –ø—Ä–∏—ë–º–Ω—ã–π –∫–æ–º–∏—Å—Å–∏—è –≥–∏ —Ç–µ–ª–µ—Ñ–æ–Ω –¥–æ–± –∫–æ...
1,53005,–ß—Ç–æ —Ç–∞–∫–æ–µ –∫–æ–Ω—Ç—Ä–∞–∫—Ç–Ω–æ–µ –æ–±—É—á–µ–Ω–∏–µ?üìö\n\n–≠—Ç–æ –æ–±—É—á–µ–Ω...,2023-08-03 10:00:01,5,0,776,https://sun9-69.userapi.com/impg/nrTBJc6NNh8QV...,[],–∫–æ–Ω—Ç—Ä–∞–∫—Ç–Ω—ã–π –æ–±—É—á–µ–Ω–∏–µ –æ–±—É—á–µ–Ω–∏–µ –∫–æ—Ç–æ—Ä—ã–π —Å—Ç—É–¥–µ–Ω—Ç ...,71,[]
2,52994,üöÄ 5 3 1 - —ç—Ç–æ –Ω–µ –æ—Ç—Å—á–µ—Ç –¥–æ –°–¢–ê–†–¢–∞\n\n–≠—Ç–æ –ø—è—Ç—å—Å...,2023-08-02 11:10:00,37,3,4531,https://sun9-16.userapi.com/impg/Ywej8r9IRnHX_...,"['–¢–∞–∫–æ–π –ø–æ–ø—É–ª—è—Ä–Ω—ã–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç, –∞ –º–µ—Å—Ç –Ω–∞ –±—é–¥–∂...",–æ—Ç—Å—á—ë—Ç —Å—Ç–∞—Ä—Ç —Å—á–∞—Å—Ç–ª–∏–≤—ã–π –ø–æ–ª–∏—Ç–µ—Ö–Ω–∏–∫ –≥–ª–∞–¥–∏—Ç—å –±–µ–ª...,98,[('–ø–æ–ø—É–ª—è—Ä–Ω—ã–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç –º–µ—Å—Ç–æ –±—é–¥–∂–µ—Ç –æ—á–µ–Ω—å –¥...
3,52992,"–ë—É–¥—É—â–∏–µ –º–∞–≥–∏—Å—Ç—Ä—ã, –Ω–∞–ø–æ–º–∏–Ω–∞–Ω–∏–µ –¥–ª—è –≤–∞—Åüéì\n \n‚ùó4 ...",2023-08-02 09:00:02,5,0,3166,https://sun9-4.userapi.com/impg/ybV72ZhMg5nY1t...,[],–±—É–¥—É—â–∏–π –º–∞–≥–∏—Å—Ç—Ä –Ω–∞–ø–æ–º–∏–Ω–∞–Ω–∏–µ –∞–≤–≥—É—Å—Ç –∑–∞–∫–∞–Ω—á–∏–≤–∞—Ç—å...,30,[]
4,52982,‚òò –ü–æ–ª–∏—Ç–µ—Ö –≤–æ—à—ë–ª –≤ —Ç–æ–ø-10 ¬´–∑–µ–ª—ë–Ω—ã—Ö¬ª –≤—É–∑–æ–≤ –†–æ—Å—Å–∏...,2023-08-01 14:01:20,129,0,7781,https://sun9-51.userapi.com/impg/ETubHW499FZwJ...,[],–ø–æ–ª–∏—Ç–µ—Ö –≤–æ—à –ª —Ç–æ–ø –∑–µ–ª –Ω—ã—Ö –≤—É–∑ —Ä–æ—Å—Å–∏—è –ø–æ–ª–∏—Ç–µ—Ö –ø...,54,[]
...,...,...,...,...,...,...,...,...,...,...,...
3210,10,"–ß—Ç–æ, –≥–¥–µ, –∫–æ–≥–¥–∞: —ç–∫—Å–ø–µ—Ä—Ç –°–ü–±–ü–£ ‚Äì –æ –ø—Ä–∏–µ–º–Ω–æ–π –∫–∞...",2016-06-23 11:35:34,8,0,-----,-----,[],—ç–∫—Å–ø–µ—Ä—Ç —Å–ø–±–ø –ø—Ä–∏—ë–º–Ω—ã–π –∫–∞–º–ø–∞–Ω–∏—è,4,[]
3211,8,–î–æ—Ä–æ–≥–∏–µ –∞–±–∏—Ç—É—Ä–∏–µ–Ω—Ç—ã!\n\n–í—Å–µ –∏–Ω—Ç–µ—Ä–µ—Å—É—é—â–∏–µ –æ—Ç–≤–µ—Ç...,2016-06-21 14:54:19,23,4,-----,https://sun9-58.userapi.com/impf/c630916/v6309...,"['–û, –º–æ–ª–æ–¥—Ü—ã! –†–µ–∞–ª—å–Ω–æ –∫—Ä—É—Ç–æ!', '–î–æ–±—Ä—ã–π –¥–µ–Ω—å, –∞...",–¥–æ—Ä–æ–≥–æ–π –∞–±–∏—Ç—É—Ä–∏–µ–Ω—Ç –∏–Ω—Ç–µ—Ä–µ—Å–æ–≤–∞—Ç—å –æ—Ç–≤–µ—Ç –≤–æ–ø—Ä–æ—Å —Å...,103,"[('–º–æ–ª–æ–¥–µ—Ü —Ä–µ–∞–ª—å–Ω–æ –∫—Ä—É—Ç–æ', 3), ('–¥–æ–±—Ä—ã–π –¥–µ–Ω—å –≤..."
3212,7,20 –∏—é–Ω—è –≤ –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥—Å–∫–æ–º –ø–æ–ª–∏—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–æ–º ...,2016-06-20 11:32:29,19,0,-----,https://sun9-55.userapi.com/impf/c630916/v6309...,[],–∏—é–Ω—å —Å–∞–Ω–∫—Ç –ø–µ—Ç–µ—Ä–±—É—Ä–≥—Å–∫–∏–π –ø–æ–ª–∏—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π —É–Ω–∏–≤–µ...,90,[]
3213,5,–í –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥—Å–∫–æ–º –ø–æ–ª–∏—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–æ–º —É–Ω–∏–≤–µ—Ä—Å–∏...,2016-06-07 23:43:02,33,0,-----,-----,[],—Å–∞–Ω–∫—Ç –ø–µ—Ç–µ—Ä–±—É—Ä–≥—Å–∫–∏–π –ø–æ–ª–∏—Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–π —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ...,79,[]
