# Vk data preprocessing for further modelling

In [1]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from pymystem3 import Mystem
from tqdm import tqdm_notebook

from string import punctuation

import sys
sys.path.append('../')
from src.utils import remove_name_from_text
from src.log_progress import log_progress

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
raw_data =  pd.read_csv('vk_posts_adhoc.csv').loc[:, ['id', 'date', 'owner_id', 'text', 'likes', 'reposts', 'comments']]
raw_data.head()

Unnamed: 0,id,date,owner_id,text,likes,reposts,comments
0,4259,1547411913,-20762107,–í—ã—Å—à–∞—è –ª–∏–≥–∞ –ê. 9 —Ç—É—Ä. 2 –¥–µ–Ω—å / –†–µ–∑—É–ª—å—Ç–∞—Ç—ã üìã\n\...,12,0,0
1,9896,1547405952,-45668536,12-13 —è–Ω–≤–∞—Ä—è –≤ —Ä–∞–º–∫–∞—Ö 9-–≥–æ —Ç—É—Ä–∞ –ß–µ–º–ø–∏–æ–Ω–∞—Ç–∞ –†–æ—Å...,17,0,0
2,17976,1547396554,-64025511,,17,3,0
3,17974,1547396064,-64025511,–ù–∞—Å—Ç–∞–≤–Ω–∏–∫ ¬´–¢—é–º–µ–Ω–∏-–¢—é–º–ì–£¬ª –ù–∞—Ç–∞–ª—å—è –í–∞—Å–∏–ª—å—á–µ–Ω–∫–æ -...,13,3,0
4,5593,1547394772,-8775943,–Ø–Ω–∞ –ß–µ—Ä–µ–¥–Ω–∏–∫–æ–≤–∞: ¬´–ó–∞ –ø–µ—Ä–≤—É—é –ø–∞—Ä—Ç–∏—é –Ω–∞—Å —Ä—É–≥–∞—é—Ç¬ª...,6,2,0


In [3]:
print('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤:', len(raw_data))

–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤: 24622


In [4]:
#vk_posts['date'] = pd.to_datetime(vk_posts['date'], unit='s')
#vk_posts = vk_posts.sort_values(by='date')
print('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è –ø–æ—Å—Ç–æ–≤:', raw_data.duplicated(subset='text').sum())

–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è –ø–æ—Å—Ç–æ–≤: 3051


In [5]:
data = raw_data.drop_duplicates(subset='text').reset_index(drop=True)
print('–ü–æ—Å—Ç–æ–≤ –ø–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –¥—É–±–ª–∏–∫–∞—Ç–æ–≤:', len(data))

–ü–æ—Å—Ç–æ–≤ –ø–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –¥—É–±–ª–∏–∫–∞—Ç–æ–≤: 21571


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21571 entries, 0 to 21570
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        21571 non-null  int64 
 1   date      21571 non-null  int64 
 2   owner_id  21571 non-null  int64 
 3   text      21570 non-null  object
 4   likes     21571 non-null  int64 
 5   reposts   21571 non-null  int64 
 6   comments  21571 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 1.1+ MB


In [7]:
data['date'] = pd.to_datetime(data['date'], unit='s')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21571 entries, 0 to 21570
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        21571 non-null  int64         
 1   date      21571 non-null  datetime64[ns]
 2   owner_id  21571 non-null  int64         
 3   text      21570 non-null  object        
 4   likes     21571 non-null  int64         
 5   reposts   21571 non-null  int64         
 6   comments  21571 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 1.1+ MB


In [8]:
data = data.dropna().reset_index(drop=True)
print('–ü–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π –æ—Å—Ç–∞–ª–æ—Å—å', len(data), '–∑–∞–ø–∏—Å–µ–π')

–ü–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π –æ—Å—Ç–∞–ª–æ—Å—å 21570 –∑–∞–ø–∏—Å–µ–π


In [9]:
print('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è —Å—Ç—Ä–æ–∫:', data.duplicated().sum())

–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è —Å—Ç—Ä–æ–∫: 0


In [14]:
data['txt_processed'] = data['text'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x.lower()))
data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'\n+', ' ', x))
data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'\d+', '', x))
data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'id\w+', '', x))
data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'[A-Za-z]+', '', x))

data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'\s+', ' ', x))
data['txt_processed'] = data['txt_processed'].apply(lambda x: x.strip())
data['txt_processed'].tail()

21565    –Ω–æ–≤—ã–π –≥–æ–¥ –ª—é–±–∏–º—ã–π —Å –¥–µ—Ç—Å—Ç–≤–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫ —Å–∞–º—ã–π –∫—Ä–∞...
21566    _ –≥–æ–¥ —Å—Ç–∞–ª –≥–æ–¥–æ–º –∏–Ω–Ω–æ–≤–∞—Ü–∏–π —Å—Ç–∏–º—É–ª–æ–º –ø–æ–∏—Å–∫–∞ –Ω–æ–≤...
21567    –∞—Å—Å–æ—Ä—Ç–∏ –¥–æ—Å—Ç–∏–∂–µ–Ω–∏–π –¥–µ–∫–∞–±—Ä—å —á–∞—Å—Ç—å –ø—Ä–∏–±–ª–∏–∂–∞–µ—Ç—Å—è ...
21568    —Ç–æ–ø —Ä–∞–∑—Ä–∞–±–æ—Ç–æ–∫ —Å–∏–±–∏—Ä—Å–∫–∏—Ö —É—á–µ–Ω—ã—Ö –≤ –≥–æ–¥—É –Ω–∞ –ø–æ—Ä—Ç...
21569    –ø–æ–∏—Å–∫ –≤–æ–ª–æ–Ω—Ç—ë—Ä–æ–≤ —Å —è–Ω–≤–∞—Ä—è –ø–æ —Ñ–µ–≤—Ä–∞–ª—è –≤ —Ç—é–º–≥—É –ø...
Name: txt_processed, dtype: object

In [15]:
lem = Mystem()
nltk.download('stopwords')
stop = stopwords.words('russian')

def preprocess_text(x):
    word_list = []
    for word in (''.join(lem.lemmatize(x))).split(' '):
        if word not in stop and word != '':
            word_list.append(word.rstrip())
    return word_list

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\–ó–∏–º–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
stop.extend([
u'—è', u'–∞', u'–¥–∞', u'–Ω–æ', u'—Ç–µ–±–µ', u'–º–Ω–µ', u'—Ç—ã', u'–∏', u'—É', u'–Ω–∞', u'—â–∞', u'–∞–≥–∞',
u'—Ç–∞–∫', u'—Ç–∞–º', u'–∫–∞–∫–∏–µ', u'–∫–æ—Ç–æ—Ä—ã–π', u'–∫–∞–∫–∞—è', u'—Ç—É–¥–∞', u'–¥–∞–≤–∞–π', u'–∫–æ—Ä–æ—á–µ', u'–∫–∞–∂–µ—Ç—Å—è', u'–≤–æ–æ–±—â–µ',
u'–Ω—É', u'–Ω–µ', u'—á–µ—Ç', u'–Ω–µ–∞', u'—Å–≤–æ–∏', u'–Ω–∞—à–µ', u'–Ω–∞—à', u'–≤–µ—Å—å', u'—Ö–æ—Ç—è', u'—Ç–∞–∫–æ–µ', u'–Ω–∞–ø—Ä–∏–º–µ—Ä', u'–∫–∞—Ä–æ—á', u'–∫–∞–∫-—Ç–æ',
u'–Ω–∞–º', u'—Ö–º', u'–≤—Å–µ–º', u'–Ω–µ—Ç', u'–¥–∞', u'–æ–Ω–æ', u'—Å–≤–æ–µ–º', u'–ø—Ä–æ', u'–≤—ã', u'–º', u'—Ç–¥',
u'–≤—Å—è', u'–∫—Ç–æ-—Ç–æ', u'—á—Ç–æ-—Ç–æ', u'–≤–∞–º', u'—ç—Ç–æ', u'—ç—Ç–∞', u'—ç—Ç–∏', u'—ç—Ç–æ—Ç', u'–ø—Ä—è–º', u'–ª–∏–±–æ', u'–∫–∞–∫', u'–º—ã',
u'–ø—Ä–æ—Å—Ç–æ', u'–±–ª–∏–Ω', u'–æ—á–µ–Ω—å', u'—Å–∞–º—ã–µ', u'—Ç–≤–æ–µ–º', u'–≤–∞—à–∞', u'–∫—Å—Ç–∞—Ç–∏', u'–≤—Ä–æ–¥–µ', u'—Ç–∏–ø–∞', u'–ø–æ–∫–∞', u'–æ–∫',
u'–º–æ—á—å'
])

In [17]:
text_data = data['txt_processed'].to_numpy()

text_data

array(['–≤—ã—Å—à–∞—è –ª–∏–≥–∞ –∞ —Ç—É—Ä –¥–µ–Ω—å —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã —Ç—é–º–µ–Ω—å —Ç—é–º–≥—É —Ç—é–º–µ–Ω—å –ø—Ä–∏–º–æ—Ä–æ—á–∫–∞ –≤–ª–∞–¥–∏–≤–æ—Å—Ç–æ–∫ —Å–∞–º—Ä–∞—É —É–≥–Ω—Ç—É —É—Ñ–∞ —é–∑–≥—É –∞—Ç–æ–º –∫—É—Ä—Å–∫–∞—è –æ–±–ª–∞—Å—Ç—å —Å–ø–∞—Ä—Ç–∞ –Ω–∏–∂–Ω–∏–π –Ω–æ–≤–≥–æ—Ä–æ–¥ –¥–∏–Ω–∞–º–æ –∫–∞–∑–∞–Ω—å —É–æ—Ä –∫–∞–∑–∞–Ω—å –ª—É—á –º–æ—Å–∫–≤–∞ —Å–µ–≤–µ—Ä—è–Ω–∫–∞ —á–µ—Ä–µ–ø–æ–≤–µ—Ü —Ç—É–ª–∏—Ü–∞ —Ç—É–ª–∞ –ª–∏–ø–µ—Ü–∫ –ª–∏–ø–µ—Ü–∫–∞—è –æ–±–ª –æ–ª–∏–º–ø –∫—É–π–±—ã—à–µ–≤ –∏–º–ø—É–ª—å—Å –≤–æ–ª–≥–æ–¥–æ–Ω—Å–∫ —Å–≤–æ–±–æ–¥–Ω—ã–π –æ—Ç –∏–≥—Ä —Ç—É—Ä —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∏–µ –∫–æ–º–∞–Ω–¥ —Å–µ–≤–µ—Ä—è–Ω–∫–∞ —Å–ø–∞—Ä—Ç–∞ —Ç—É–ª–∏—Ü–∞ —Å–∞–º—Ä–∞—É –ø—Ä–∏–º–æ—Ä–æ—á–∫–∞ —Ç—é–º–µ–Ω—å —Ç—é–º–≥—É —é–∑–≥—É –∞—Ç–æ–º –¥–∏–Ω–∞–º–æ —É–æ—Ä –ª–∏–ø–µ—Ü–∫ –æ–ª–∏–º–ø –∏–º–ø—É–ª—å—Å –ª—É—á –≤–∫—Å–ø–∞—Ä—Ç–∞ –≤–æ–ª–µ–π–±–æ–ª –≤—ã—Å—à–∞—è–ª–∏–≥–∞–∞',
       '—è–Ω–≤–∞—Ä—è –≤ —Ä–∞–º–∫–∞—Ö –≥–æ —Ç—É—Ä–∞ —á–µ–º–ø–∏–æ–Ω–∞—Ç–∞ —Ä–æ—Å—Å–∏–∏ —Å—Ä–µ–¥–∏ –∂–µ–Ω—Å–∫–∏—Ö –∫–æ–º–∞–Ω–¥ –≤—ã—Å—à–µ–π –ª–∏–≥–∏ –∞ –≤ –Ω–∏–∂–Ω–µ–º –Ω–æ–≤–≥–æ—Ä–

In [18]:
text_data_lemmatized = []
for text in log_progress(text_data, every=1):
    text_data_lemmatized.append(preprocess_text(text))

VBox(children=(HTML(value=''), IntProgress(value=0, max=21570)))

In [19]:
text_data_lemmatized[0][:10]

['–≤—ã—Å–æ–∫–∏–π',
 '–ª–∏–≥–∞',
 '—Ç—É—Ä',
 '–¥–µ–Ω—å',
 '—Ä–µ–∑—É–ª—å—Ç–∞—Ç',
 '—Ç—é–º–µ–Ω—å',
 '—Ç—é–º–≥–∞',
 '—Ç—é–º–µ–Ω—å',
 '–ø—Ä–∏–º–æ—Ä–æ—á–∫–∞',
 '–≤–ª–∞–¥–∏–≤–æ—Å—Ç–æ–∫']

In [20]:
data['lines_lemmatized'] = text_data_lemmatized
data.head()

Unnamed: 0,id,date,owner_id,text,likes,reposts,comments,txt_processed,lines_lemmatized
0,4259,2019-01-13 20:38:33,-20762107,–í—ã—Å—à–∞—è –ª–∏–≥–∞ –ê. 9 —Ç—É—Ä. 2 –¥–µ–Ω—å / –†–µ–∑—É–ª—å—Ç–∞—Ç—ã üìã\n\...,12,0,0,–≤—ã—Å—à–∞—è –ª–∏–≥–∞ –∞ —Ç—É—Ä –¥–µ–Ω—å —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã —Ç—é–º–µ–Ω—å —Ç—é–º–≥—É...,"[–≤—ã—Å–æ–∫–∏–π, –ª–∏–≥–∞, —Ç—É—Ä, –¥–µ–Ω—å, —Ä–µ–∑—É–ª—å—Ç–∞—Ç, —Ç—é–º–µ–Ω—å, ..."
1,9896,2019-01-13 18:59:12,-45668536,12-13 —è–Ω–≤–∞—Ä—è –≤ —Ä–∞–º–∫–∞—Ö 9-–≥–æ —Ç—É—Ä–∞ –ß–µ–º–ø–∏–æ–Ω–∞—Ç–∞ –†–æ—Å...,17,0,0,—è–Ω–≤–∞—Ä—è –≤ —Ä–∞–º–∫–∞—Ö –≥–æ —Ç—É—Ä–∞ —á–µ–º–ø–∏–æ–Ω–∞—Ç–∞ —Ä–æ—Å—Å–∏–∏ —Å—Ä–µ–¥...,"[—è–Ω–≤–∞—Ä—å, —Ä–∞–º–∫–∞, –≥–æ, —Ç—É—Ä, —á–µ–º–ø–∏–æ–Ω–∞—Ç, —Ä–æ—Å—Å–∏—è, —Å—Ä..."
2,17974,2019-01-13 16:14:24,-64025511,–ù–∞—Å—Ç–∞–≤–Ω–∏–∫ ¬´–¢—é–º–µ–Ω–∏-–¢—é–º–ì–£¬ª –ù–∞—Ç–∞–ª—å—è –í–∞—Å–∏–ª—å—á–µ–Ω–∫–æ -...,13,3,0,–Ω–∞—Å—Ç–∞–≤–Ω–∏–∫ —Ç—é–º–µ–Ω–∏ —Ç—é–º–≥—É –Ω–∞—Ç–∞–ª—å—è –≤–∞—Å–∏–ª—å—á–µ–Ω–∫–æ –æ –≤...,"[–Ω–∞—Å—Ç–∞–≤–Ω–∏–∫, —Ç—é–º–µ–Ω—å, —Ç—é–º–≥–∞, –Ω–∞—Ç–∞–ª—å—è, –≤–∞—Å–∏–ª—å—á–µ–Ω–∫..."
3,5593,2019-01-13 15:52:52,-8775943,–Ø–Ω–∞ –ß–µ—Ä–µ–¥–Ω–∏–∫–æ–≤–∞: ¬´–ó–∞ –ø–µ—Ä–≤—É—é –ø–∞—Ä—Ç–∏—é –Ω–∞—Å —Ä—É–≥–∞—é—Ç¬ª...,6,2,0,—è–Ω–∞ —á–µ—Ä–µ–¥–Ω–∏–∫–æ–≤–∞ –∑–∞ –ø–µ—Ä–≤—É—é –ø–∞—Ä—Ç–∏—é –Ω–∞—Å —Ä—É–≥–∞—é—Ç –≤–æ...,"[—è–Ω–∞, —á–µ—Ä–µ–¥–Ω–∏–∫–æ–≤–∞, –ø–µ—Ä–≤—ã–π, –ø–∞—Ä—Ç–∏—è, —Ä—É–≥–∞—Ç—å, –≤–æ–ª..."
4,5591,2019-01-13 15:00:24,-8775943,"–ù–∞—Ç–∞–ª—å—è –í–∞—Å–∏–ª—å—á–µ–Ω–∫–æ: ¬´–î–µ–≤—á–æ–Ω–∫–∏ –ø–ª–∞–∫–∞–ª–∏, —è –µ–ª–µ ...",17,2,0,–Ω–∞—Ç–∞–ª—å—è –≤–∞—Å–∏–ª—å—á–µ–Ω–∫–æ –¥–µ–≤—á–æ–Ω–∫–∏ –ø–ª–∞–∫–∞–ª–∏ —è –µ–ª–µ —Å–¥–µ...,"[–Ω–∞—Ç–∞–ª—å—è, –≤–∞—Å–∏–ª—å—á–µ–Ω–∫–æ, –¥–µ–≤—á–æ–Ω–∫–∞, –ø–ª–∞–∫–∞—Ç—å, –µ–ª–µ,..."


In [21]:
data.shape

(21570, 9)

In [22]:
data.tail(10)

Unnamed: 0,id,date,owner_id,text,likes,reposts,comments,txt_processed,lines_lemmatized
21560,5649,2020-12-30 07:49:22,-45505854,–°–¢–£–î–ï–ù–¢–ö–ê –ò–®–ò–ú–°–ö–û–ì–û –ü–ï–î–ê–ì–û–ì–ò–ß–ï–°–ö–û–ì–û –ò–ù–°–¢–ò–¢–£–¢–ê ...,31,12,0,—Å—Ç—É–¥–µ–Ω—Ç–∫–∞ –∏—à–∏–º—Å–∫–æ–≥–æ –ø–µ–¥–∞–≥–æ–≥–∏—á–µ—Å–∫–æ–≥–æ –∏–Ω—Å—Ç–∏—Ç—É—Ç–∞ ...,"[—Å—Ç—É–¥–µ–Ω—Ç–∫–∞, –∏—à–∏–º—Å–∫–∏–π, –ø–µ–¥–∞–≥–æ–≥–∏—á–µ—Å–∫–∏–π, –∏–Ω—Å—Ç–∏—Ç—É—Ç..."
21561,149,2020-12-30 06:57:14,623164963,–û—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏ –æ–±—É—á–µ–Ω–∏—è –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö —Å—Ç—É–¥–µ–Ω—Ç–æ–≤ –≤ —ç...,0,1,0,–æ—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏ –æ–±—É—á–µ–Ω–∏—è –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã—Ö —Å—Ç—É–¥–µ–Ω—Ç–æ–≤ –≤ —ç...,"[–æ—Å–æ–±–µ–Ω–Ω–æ—Å—Ç—å, –æ–±—É—á–µ–Ω–∏–µ, –∏–Ω–æ—Å—Ç—Ä–∞–Ω–Ω—ã–π, —Å—Ç—É–¥–µ–Ω—Ç, ..."
21562,982,2020-12-30 06:44:52,324527683,–ü–æ–∑–∏—Ç–∏–≤–Ω—ã–π –º–æ–º–µ–Ω—Ç –≤ –∫–æ–Ω—Ü–µ –æ—Ç–Ω—é–¥—å –Ω–µ –±–∞–ª—É—é—â–µ–≥–æ ...,22,0,8,–ø–æ–∑–∏—Ç–∏–≤–Ω—ã–π –º–æ–º–µ–Ω—Ç –≤ –∫–æ–Ω—Ü–µ –æ—Ç–Ω—é–¥—å –Ω–µ –±–∞–ª—É—é—â–µ–≥–æ ...,"[–ø–æ–∑–∏—Ç–∏–≤–Ω—ã–π, –º–æ–º–µ–Ω—Ç, –∫–æ–Ω–µ—Ü, –æ—Ç–Ω—é–¥—å, –±–∞–ª–æ–≤–∞—Ç—å, ..."
21563,980,2020-12-30 06:41:09,-92088231,#–¢–ü–ò #–¢–æ–±–æ–ª—å—Å–∫–∏–π–ü–µ–¥–∞–≥–æ–≥–∏—á–µ—Å–∫–∏–π–ò–Ω—Å—Ç–∏—Ç—É—Ç #–ü—Ä–∏–µ–º–Ω...,7,3,0,—Ç–ø–∏ —Ç–æ–±–æ–ª—å—Å–∫–∏–π–ø–µ–¥–∞–≥–æ–≥–∏—á–µ—Å–∫–∏–π–∏–Ω—Å—Ç–∏—Ç—É—Ç –ø—Ä–∏–µ–º–Ω–∞—è–∫...,"[—Ç–ø–∏, —Ç–æ–±–æ–ª—å—Å–∫–∏–π–ø–µ–¥–∞–≥–æ–≥–∏—á–µ—Å–∫–∏–π–∏–Ω—Å—Ç–∏—Ç—É—Ç, –ø—Ä–∏–µ–º–Ω..."
21564,35625,2020-12-30 06:30:03,-29346,[club87603622|–û–±—ä–µ–¥–∏–Ω–µ–Ω–Ω—ã–π —Å–æ–≤–µ—Ç –æ–±—É—á–∞—é—â–∏—Ö—Å—è –¢...,88,7,0,–æ–±—ä–µ–¥–∏–Ω–µ–Ω–Ω—ã–π —Å–æ–≤–µ—Ç –æ–±—É—á–∞—é—â–∏—Ö—Å—è —Ç—é–º–≥—É –ø–æ–±–µ–¥–∏–ª –≤...,"[–æ–±—ä–µ–¥–∏–Ω—è—Ç—å, —Å–æ–≤–µ—Ç, –æ–±—É—á–∞—Ç—å—Å—è, —Ç—é–º–≥–∞, –ø–æ–±–µ–∂–¥–∞—Ç..."
21565,8938,2020-12-30 06:26:39,-23769646,"‚òÉüí´–ù–æ–≤—ã–π –≥–æ–¥ - –ª—é–±–∏–º—ã–π —Å –¥–µ—Ç—Å—Ç–≤–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫, —Å–∞–º—ã...",8,1,0,–Ω–æ–≤—ã–π –≥–æ–¥ –ª—é–±–∏–º—ã–π —Å –¥–µ—Ç—Å—Ç–≤–∞ –ø—Ä–∞–∑–¥–Ω–∏–∫ —Å–∞–º—ã–π –∫—Ä–∞...,"[–Ω–æ–≤—ã–π, –≥–æ–¥, –ª—é–±–∏–º—ã–π, –¥–µ—Ç—Å—Ç–≤–æ, –ø—Ä–∞–∑–¥–Ω–∏–∫, —Å–∞–º—ã–π..."
21566,679,2020-12-30 06:23:10,-157348679,#event@rumts_utmn \n\n‚ú®2020 –≥–æ–¥ —Å—Ç–∞–ª –≥–æ–¥–æ–º –∏–Ω–Ω...,6,0,0,_ –≥–æ–¥ —Å—Ç–∞–ª –≥–æ–¥–æ–º –∏–Ω–Ω–æ–≤–∞—Ü–∏–π —Å—Ç–∏–º—É–ª–æ–º –ø–æ–∏—Å–∫–∞ –Ω–æ–≤...,"[_, –≥–æ–¥, —Å—Ç–∞–Ω–æ–≤–∏—Ç—å—Å—è, –≥–æ–¥, –∏–Ω–Ω–æ–≤–∞—Ü–∏—è, —Å—Ç–∏–º—É–ª, ..."
21567,220,2020-12-30 06:08:00,-192630663,üç¨–ê–°–°–û–†–¢–ò –î–û–°–¢–ò–ñ–ï–ù–ò–ô: –î–ï–ö–ê–ë–†–¨ 2020üç¨ –ß–∞—Å—Ç—å 1.\n ...,10,2,0,–∞—Å—Å–æ—Ä—Ç–∏ –¥–æ—Å—Ç–∏–∂–µ–Ω–∏–π –¥–µ–∫–∞–±—Ä—å —á–∞—Å—Ç—å –ø—Ä–∏–±–ª–∏–∂–∞–µ—Ç—Å—è ...,"[–∞—Å—Å–æ—Ä—Ç–∏, –¥–æ—Å—Ç–∏–∂–µ–Ω–∏–µ, –¥–µ–∫–∞–±—Ä—å, —á–∞—Å—Ç—å, –ø—Ä–∏–±–ª–∏–∂–∞..."
21568,18598,2020-12-30 05:26:51,-135160071,–¢–æ–ø-30 —Ä–∞–∑—Ä–∞–±–æ—Ç–æ–∫ —Å–∏–±–∏—Ä—Å–∫–∏—Ö —É—á–µ–Ω—ã—Ö –≤ 2020 –≥–æ–¥—É...,1,0,0,—Ç–æ–ø —Ä–∞–∑—Ä–∞–±–æ—Ç–æ–∫ —Å–∏–±–∏—Ä—Å–∫–∏—Ö —É—á–µ–Ω—ã—Ö –≤ –≥–æ–¥—É –Ω–∞ –ø–æ—Ä—Ç...,"[—Ç–æ–ø, —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞, —Å–∏–±–∏—Ä—Å–∫–∏–π, —É—á–µ–Ω—ã–π, –≥–æ–¥, –ø–æ—Ä—Ç..."
21569,6076,2020-12-30 05:18:40,-142012747,–ü–æ–∏—Å–∫ –≤–æ–ª–æ–Ω—Ç—ë—Ä–æ–≤!\n \n–° 15 —è–Ω–≤–∞—Ä—è –ø–æ 11 —Ñ–µ–≤—Ä–∞–ª...,7,14,0,–ø–æ–∏—Å–∫ –≤–æ–ª–æ–Ω—Ç—ë—Ä–æ–≤ —Å —è–Ω–≤–∞—Ä—è –ø–æ —Ñ–µ–≤—Ä–∞–ª—è –≤ —Ç—é–º–≥—É –ø...,"[–ø–æ–∏—Å–∫, –≤–æ–ª–æ–Ω—Ç–µ—Ä, —è–Ω–≤–∞—Ä—å, —Ñ–µ–≤—Ä–∞–ª—å, —Ç—é–º–≥–∞, –ø—Ä–æ–π..."


In [24]:
data.to_csv('vk_posts_2019-2020_lemmatized.csv', encoding='utf-8')