# Vk data preprocessing for further modelling

In [15]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from pymystem3 import Mystem
from tqdm import tqdm_notebook

from string import punctuation

import sys
sys.path.append('../')
from src.utils import remove_name_from_text
from src.log_progress import log_progress

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
raw_data =  pd.read_csv('vk_posts.csv').loc[:, ['id', 'date', 'owner_id', 'text', 'likes', 'reposts', 'comments']]
raw_data.head()

Unnamed: 0,id,date,owner_id,text,likes,reposts,comments
0,66,1322907480,-1243876,"–ì–û–õ–û–°–£–ï–ú –ó–ê –ë–û–ì–£!!\n\n""–î–æ—Ä–æ–≥–∏–µ –¥—Ä—É–∑—å—è! –ì–æ–ª–æ—Å—É–π...",6,1,1
1,10588,1332165525,695803,"–ü–æ–∑–∞–≤—á–µ—Ä–∞ —è –Ω–∞–ø–∏—Å–∞–ª –ø–æ—Å—Ç –ø—Ä–æ —Ç–æ, –∫–∞–∫ –º—ã –∑–∞–µ—Ö–∞–ª...",8,3,16
2,800,1334033740,102648080,–í —ç—Ç–∏ –º–∏–Ω—É—Ç—ã –∏–¥–µ—Ç –ø—Ä—è–º–∞—è —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è –ª–µ–∫—Ü–∏–∏ –ø—Ä–æ...,2,1,0
3,5810,1336373356,70549719,12 –∞–ø—Ä–µ–ª—è 2012 –≥. –≤ –¢—é–º–µ–Ω—Å–∫–æ–º –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–æ–º ...,1,1,0
4,5003,1352718600,61866128,"–î–æ—Ä–æ–≥–∏–µ –¥—Ä—É–∑—å—è, –∂—É—Ä–Ω–∞–ª ""–ï–ì–û–†–ö–ê"" —É—á–∞—Å—Ç–≤—É–µ—Ç –≤ –∫–æ...",2,1,2


In [3]:
print('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤:', len(raw_data))

–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Å—Ç–æ–≤: 74483


In [4]:
#vk_posts['date'] = pd.to_datetime(vk_posts['date'], unit='s')
#vk_posts = vk_posts.sort_values(by='date')
print('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è –ø–æ—Å—Ç–æ–≤:', raw_data.duplicated(subset='text').sum())

–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è –ø–æ—Å—Ç–æ–≤: 17648


In [5]:
data = raw_data.drop_duplicates(subset='text').reset_index(drop=True)
print('–ü–æ—Å—Ç–æ–≤ –ø–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –¥—É–±–ª–∏–∫–∞—Ç–æ–≤:', len(data))

–ü–æ—Å—Ç–æ–≤ –ø–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –¥—É–±–ª–∏–∫–∞—Ç–æ–≤: 56835


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56835 entries, 0 to 56834
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        56835 non-null  int64 
 1   date      56835 non-null  int64 
 2   owner_id  56835 non-null  int64 
 3   text      56834 non-null  object
 4   likes     56835 non-null  int64 
 5   reposts   56835 non-null  int64 
 6   comments  56835 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 2.8+ MB


In [7]:
data['date'] = pd.to_datetime(data['date'], unit='s')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56835 entries, 0 to 56834
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   id        56835 non-null  int64         
 1   date      56835 non-null  datetime64[ns]
 2   owner_id  56835 non-null  int64         
 3   text      56834 non-null  object        
 4   likes     56835 non-null  int64         
 5   reposts   56835 non-null  int64         
 6   comments  56835 non-null  int64         
dtypes: datetime64[ns](1), int64(5), object(1)
memory usage: 2.8+ MB


In [8]:
data = data.dropna().reset_index(drop=True)
print('–ü–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π –æ—Å—Ç–∞–ª–æ—Å—å', len(data), '–∑–∞–ø–∏—Å–µ–π')

–ü–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –ø—É—Å—Ç—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π –æ—Å—Ç–∞–ª–æ—Å—å 56834 –∑–∞–ø–∏—Å–µ–π


In [9]:
print('–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è —Å—Ç—Ä–æ–∫:', data.duplicated().sum())

–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –¥—É–±–ª–∏—Ä—É—é—â–∏—Ö—Å—è —Å—Ç—Ä–æ–∫: 0


In [10]:
data['txt_processed'] = data['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))
data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'\n+', ' ', x))
data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'\d+', '', x))
data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'id\w+', '', x))

data['txt_processed'] = data['txt_processed'].apply(lambda x: re.sub(r'\s+', ' ', x))
data['txt_processed'] = data['txt_processed'].apply(lambda x: x.strip())
data['txt_processed'].tail()

56829    —Å—Å—ã–ª–∫–∞ –Ω–∞ —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—é —Å–µ–≥–æ–¥–Ω—è—à–Ω–µ–≥–æ –º–∞—Ç—á–∞ —Ç—é–º–µ–Ω—å...
56830    –µ–≤–≥–µ–Ω–∏—è –±–∞—Å–∞–∫–æ–≤–∞ —Ä–∞–Ω–æ –ø–æ–≤–µ—Ä–∏–ª–∏ –≤ –ø–æ–±–µ–¥—É –≤ –æ—á–µ—Ä...
56831    –∞–Ω–Ω–∞ –ø–æ—Å–ø–µ–ª–æ–≤–∞ –±—ã–ª–∞ –∫–∞–∫–∞—è—Ç–æ –ø–∞–Ω–∏–∫–∞ –ø—Ä–∏–º–æ—Ä–æ—á–∫–∞ ...
56832    –ø—Ä–∏–º–æ—Ä–æ—á–∫–∞ –ø—Ä–æ–≤–µ–ª–∞ –ø–µ—Ä–≤—É—é –≤ —ç—Ç–æ–º –≥–æ–¥—É –≤—Å—Ç—Ä–µ—á—É ...
56833    –Ω–∞—Ç–∞–ª—å—è –≤–∞—Å–∏–ª—å—á–µ–Ω–∫–æ –Ω–∞—Å –ø–æ–¥–≤–µ–ª–∏ —Ç–æ–Ω–∫–æ—Å—Ç–∏ –ø—Å–∏—Ö–æ...
Name: txt_processed, dtype: object

In [11]:
lem = Mystem()
nltk.download('stopwords')
stop = stopwords.words('russian')

def preprocess_text(x):
    word_list = []
    for word in (''.join(lem.lemmatize(x))).split(' '):
        if word not in stop and word != '':
            word_list.append(word.rstrip())
    return word_list

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\–ó–∏–º–∞\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
stop.extend([
u'—è', u'–∞', u'–¥–∞', u'–Ω–æ', u'—Ç–µ–±–µ', u'–º–Ω–µ', u'—Ç—ã', u'–∏', u'—É', u'–Ω–∞', u'—â–∞', u'–∞–≥–∞',
u'—Ç–∞–∫', u'—Ç–∞–º', u'–∫–∞–∫–∏–µ', u'–∫–æ—Ç–æ—Ä—ã–π', u'–∫–∞–∫–∞—è', u'—Ç—É–¥–∞', u'–¥–∞–≤–∞–π', u'–∫–æ—Ä–æ—á–µ', u'–∫–∞–∂–µ—Ç—Å—è', u'–≤–æ–æ–±—â–µ',
u'–Ω—É', u'–Ω–µ', u'—á–µ—Ç', u'–Ω–µ–∞', u'—Å–≤–æ–∏', u'–Ω–∞—à–µ', u'–Ω–∞—à', u'–≤–µ—Å—å', u'—Ö–æ—Ç—è', u'—Ç–∞–∫–æ–µ', u'–Ω–∞–ø—Ä–∏–º–µ—Ä', u'–∫–∞—Ä–æ—á', u'–∫–∞–∫-—Ç–æ',
u'–Ω–∞–º', u'—Ö–º', u'–≤—Å–µ–º', u'–Ω–µ—Ç', u'–¥–∞', u'–æ–Ω–æ', u'—Å–≤–æ–µ–º', u'–ø—Ä–æ', u'–≤—ã', u'–º', u'—Ç–¥',
u'–≤—Å—è', u'–∫—Ç–æ-—Ç–æ', u'—á—Ç–æ-—Ç–æ', u'–≤–∞–º', u'—ç—Ç–æ', u'—ç—Ç–∞', u'—ç—Ç–∏', u'—ç—Ç–æ—Ç', u'–ø—Ä—è–º', u'–ª–∏–±–æ', u'–∫–∞–∫', u'–º—ã',
u'–ø—Ä–æ—Å—Ç–æ', u'–±–ª–∏–Ω', u'–æ—á–µ–Ω—å', u'—Å–∞–º—ã–µ', u'—Ç–≤–æ–µ–º', u'–≤–∞—à–∞', u'–∫—Å—Ç–∞—Ç–∏', u'–≤—Ä–æ–¥–µ', u'—Ç–∏–ø–∞', u'–ø–æ–∫–∞', u'–æ–∫',
u'–º–æ—á—å'
])

In [13]:
text_data = data['txt_processed'].to_numpy()

text_data

array(['–≥–æ–ª–æ—Å—É–µ–º –∑–∞ –±–æ–≥—É –¥–æ—Ä–æ–≥–∏–µ –¥—Ä—É–∑—å—è –≥–æ–ª–æ—Å—É–π—Ç–µ –∑–∞ –º–æ–π —Å–±–æ—Ä–Ω–∏–∫ —è –∑–∞–ø—É—â—É –≤–∞—Å –≤ –Ω–µ–±–µ—Å–∞ –∏ –∫–Ω–∏–≥—É –º–∏—Ä–æ—Å–ª–∞–≤–∞ –±–∞–∫—É–ª–∏–Ω–∞ –∑—É–±—ã –≥—Ä–µ—à–Ω–∏–∫–æ–≤ –Ω–∞ –∫–Ω–∏–≥–µ –≥–æ–¥–∞ –∫–∞–∫ –≥–æ–ª–æ—Å–æ–≤–∞—Ç—å –∑–¥–µ—Å—å –≤—Å—ë –Ω–∞–ø–∏—Å–∞–Ω–æ',
       '–ø–æ–∑–∞–≤—á–µ—Ä–∞ —è –Ω–∞–ø–∏—Å–∞–ª –ø–æ—Å—Ç –ø—Ä–æ —Ç–æ –∫–∞–∫ –º—ã –∑–∞–µ—Ö–∞–ª–∏ –∑–∞ –ø–æ–¥—Ä—É–≥–æ–π –≤ –æ–±—â–µ–∂–∏—Ç–∏–µ —Ç—é–º–≥—É –∞ –µ—ë –∏–∑ –Ω–µ–≥–æ –Ω–µ –≤—ã–ø—É—Å—Ç–∏–ª–∏ –º–Ω–µ –ø—Ä–∏–¥—ë—Ç—Å—è —Å–µ–π—á–∞—Å –µ–≥–æ –ø—Ä–æ–¥—É–±–ª–∏—Ä–æ–≤–∞—Ç—å —Ç–µ–∫—Å—Ç –≤–µ–¥—å –æ—Ä–∏–≥–∏–Ω–∞–ª –ø—Ä–∏—à–ª–æ—Å—å —É–¥–∞–ª–∏—Ç—å –∞ –æ—Ä–∏–≥–∏–Ω–∞–ª –ø—Ä–∏—à–ª–æ—Å—å —É–¥–∞–ª–∏—Ç—å –ø–æ—Ç–æ–º—É —á—Ç–æ –≤ –Ω—ë–º –±—ã–ª —É–ø–æ–º—è–Ω—É—Ç –º–æ–π –∑–Ω–∞–∫–æ–º—ã–π –ø–æ —Å–æ–≤–º–µ—Å—Ç–∏—Ç–µ–ª—å—Å—Ç–≤—É –ø–æ–º–æ—â–Ω–∏–∫ –º–µ—Å—Ç–Ω–æ–≥–æ –¥–µ–ø—É—Ç–∞—Ç–∞–µ–¥–∏–Ω–æ—Ä–æ—Å—Å–∞ –∏–º—è –Ω–∞–∑—ã–≤–∞—Ç—å –Ω–µ –±—É–¥—É –∏ —Ö–æ—Ç—è —á–µ–ª–æ–≤–µ–∫ –Ω–µ –ø—Ä–∏–Ω–∏–º–∞–ª —É—á–∞—Å—Ç–∏—è –≤ 

In [16]:
text_data_lemmatized = []
for text in log_progress(text_data, every=1):
    text_data_lemmatized.append(preprocess_text(text))
    #print('done')

VBox(children=(HTML(value=''), IntProgress(value=0, max=56834)))

In [17]:
text_data_lemmatized[0][:10]

['iv',
 '–º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–π',
 '—Ñ–µ—Å—Ç–∏–≤–∞–ª—å',
 '–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π',
 '–∫–∏–Ω–æ',
 '–∫–∏–Ω–∑–∞',
 '—Ç—é–º–µ–Ω—å',
 '–æ–∫—Ç—è–±—Ä—å',
 '–≥–æ–¥',
 'i']

In [18]:
data['lines_lemmatized'] = text_data_lemmatized
data.head()

Unnamed: 0,id,date,owner_id,text,likes,reposts,comments,txt_processed,lines_lemmatized
0,66,2011-12-03 10:18:00,-1243876,"–ì–û–õ–û–°–£–ï–ú –ó–ê –ë–û–ì–£!!\n\n""–î–æ—Ä–æ–≥–∏–µ –¥—Ä—É–∑—å—è! –ì–æ–ª–æ—Å—É–π...",6,1,1,–≥–æ–ª–æ—Å—É–µ–º –∑–∞ –±–æ–≥—É –¥–æ—Ä–æ–≥–∏–µ –¥—Ä—É–∑—å—è –≥–æ–ª–æ—Å—É–π—Ç–µ –∑–∞ –º...,"[iv, –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω—ã–π, —Ñ–µ—Å—Ç–∏–≤–∞–ª—å, –¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π,..."
1,10588,2012-03-19 13:58:45,695803,"–ü–æ–∑–∞–≤—á–µ—Ä–∞ —è –Ω–∞–ø–∏—Å–∞–ª –ø–æ—Å—Ç –ø—Ä–æ —Ç–æ, –∫–∞–∫ –º—ã –∑–∞–µ—Ö–∞–ª...",8,3,16,–ø–æ–∑–∞–≤—á–µ—Ä–∞ —è –Ω–∞–ø–∏—Å–∞–ª –ø–æ—Å—Ç –ø—Ä–æ —Ç–æ –∫–∞–∫ –º—ã –∑–∞–µ—Ö–∞–ª–∏...,"[–ø–æ–∑–∞–≤—á–µ—Ä–∞, –Ω–∞–ø–∏—Å–∞—Ç—å, –ø–æ—Å—Ç, –∑–∞–µ–∑–∂–∞—Ç—å, –ø–æ–¥—Ä—É–≥–∞,..."
2,800,2012-04-10 04:55:40,102648080,–í —ç—Ç–∏ –º–∏–Ω—É—Ç—ã –∏–¥–µ—Ç –ø—Ä—è–º–∞—è —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è –ª–µ–∫—Ü–∏–∏ –ø—Ä–æ...,2,1,0,–≤ —ç—Ç–∏ –º–∏–Ω—É—Ç—ã –∏–¥–µ—Ç –ø—Ä—è–º–∞—è —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è –ª–µ–∫—Ü–∏–∏ –ø—Ä–æ...,"[–º–∏–Ω—É—Ç–∞, –∏–¥—Ç–∏, –ø—Ä—è–º–æ–π, —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è, –ª–µ–∫—Ü–∏—è, –ø—Ä–æ..."
3,5810,2012-05-07 06:49:16,70549719,12 –∞–ø—Ä–µ–ª—è 2012 –≥. –≤ –¢—é–º–µ–Ω—Å–∫–æ–º –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–æ–º ...,1,1,0,–∞–ø—Ä–µ–ª—è –≥ –≤ —Ç—é–º–µ–Ω—Å–∫–æ–º –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω–æ–º —É–Ω–∏–≤–µ—Ä—Å–∏—Ç...,"[–∞–ø—Ä–µ–ª—å, –≥, —Ç—é–º–µ–Ω—Å–∫–∏–π, –≥–æ—Å—É–¥–∞—Ä—Å—Ç–≤–µ–Ω–Ω—ã–π, —É–Ω–∏–≤–µ—Ä..."
4,5003,2012-11-12 11:10:00,61866128,"–î–æ—Ä–æ–≥–∏–µ –¥—Ä—É–∑—å—è, –∂—É—Ä–Ω–∞–ª ""–ï–ì–û–†–ö–ê"" —É—á–∞—Å—Ç–≤—É–µ—Ç –≤ –∫–æ...",2,1,2,–¥–æ—Ä–æ–≥–∏–µ –¥—Ä—É–∑—å—è –∂—É—Ä–Ω–∞–ª –µ–≥–æ—Ä–∫–∞ —É—á–∞—Å—Ç–≤—É–µ—Ç –≤ –∫–æ–Ω–∫—É...,"[–¥–æ—Ä–æ–≥–æ–π, –¥—Ä—É–≥, –∂—É—Ä–Ω–∞–ª, –µ–≥–æ—Ä–∫–∞, —É—á–∞—Å—Ç–≤–æ–≤–∞—Ç—å, –∫..."


In [19]:
data.shape

(56834, 9)

In [20]:
data.tail(10)

Unnamed: 0,id,date,owner_id,text,likes,reposts,comments,txt_processed,lines_lemmatized
56824,35794,2019-01-13 10:12:30,-66172360,–ï–≤–≥–µ–Ω–∏—è –ë–∞—Å–∞–∫–æ–≤–∞: ¬´–†–∞–Ω–æ –ø–æ–≤–µ—Ä–∏–ª–∏ –≤ –ø–æ–±–µ–¥—É¬ª\n\n...,0,0,0,–µ–≤–≥–µ–Ω–∏—è –±–∞—Å–∞–∫–æ–≤–∞ —Ä–∞–Ω–æ –ø–æ–≤–µ—Ä–∏–ª–∏ –≤ –ø–æ–±–µ–¥—É –≤ –æ—á–µ—Ä...,"[–µ–≤–≥–µ–Ω–∏—è, –±–∞—Å–∞–∫–æ–≤, —Ä–∞–Ω–æ, –ø–æ–≤–µ—Ä—è—Ç—å, –ø–æ–±–µ–¥–∞, –æ—á–µ..."
56825,52647,2019-01-13 08:54:44,-103247095,üî•üî•üî•Maxline –í–ï–†–ù–Å–¢üî•üî•üî• (–ß–∞—Å—Ç—å 4) \n \n–ù–µ —Å—ã–≥—Ä–∞–ª–∞...,81,15,39,maxline –≤–µ—Ä–Ω—ë—Ç —á–∞—Å—Ç—å –Ω–µ —Å—ã–≥—Ä–∞–ª–∞ —Å—Ç–∞–≤–∫–∞ maxline...,"[maxline, –≤–µ—Ä–Ω—É—Ç—å, —á–∞—Å—Ç—å, —Å—ã–≥—Ä–∞—Ç—å, —Å—Ç–∞–≤–∫–∞, max..."
56826,2730,2019-01-13 07:57:56,-77960765,"[id185696918|–¢–∞—Ç—å—è–Ω–∞], –¥–∏–ø–ª–æ–º—ã –∏ –ø—Ä–∏–∑—ã –¥–ª—è –ø–æ–±...",0,0,0,–¥–∏–ø–ª–æ–º—ã –∏ –ø—Ä–∏–∑—ã –¥–ª—è –ø–æ–±–µ–¥–∏—Ç–µ–ª–µ–π –∏ –ø—Ä–∏–∑–µ—Ä–æ–≤ –∫–æ–Ω...,"[–¥–∏–ø–ª–æ–º, –ø—Ä–∏–∑, –ø–æ–±–µ–¥–∏—Ç–µ–ª—å, –ø—Ä–∏–∑–µ—Ä, –∫–æ–Ω–∫—É—Ä—Å, –ø–æ..."
56827,544,2019-01-13 07:52:23,-53025229,"–†–µ–±—è—Ç, —Å–µ—Å—Å–∏—è.",2,0,0,—Ä–µ–±—è—Ç —Å–µ—Å—Å–∏—è,"[—Ä–µ–±—è—Ç–∞, —Å–µ—Å—Å–∏—è]"
56828,440384,2019-01-13 07:47:20,-59042614,"–¢–æ—Ç –∫—Ç–æ –ö–∞—Ä—Ç—ã –Ω–∞—à—ë–ª ,–µ—Å–ª–∏ –º–æ–∂–Ω–æ –æ—Å—Ç–∞–≤—å –ø–æ–∂–∞–ª—É–π...",0,0,5,—Ç–æ—Ç –∫—Ç–æ –∫–∞—Ä—Ç—ã –Ω–∞—à—ë–ª –µ—Å–ª–∏ –º–æ–∂–Ω–æ –æ—Å—Ç–∞–≤—å –ø–æ–∂–∞–ª—É–π—Å...,"[–∫–∞—Ä—Ç–∞, –Ω–∞—Ö–æ–¥–∏—Ç—å, –æ—Å—Ç–∞–≤–ª—è—Ç—å, –ø–æ–∂–∞–ª—É–π—Å—Ç–∞, –ª—é–±–æ–π..."
56829,5579,2019-01-13 07:03:56,-8775943,"–°—Å—ã–ª–∫–∞ –Ω–∞ —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—é —Å–µ–≥–æ–¥–Ω—è—à–Ω–µ–≥–æ –º–∞—Ç—á–∞ ""–¢—é–º–µ–Ω...",7,2,1,—Å—Å—ã–ª–∫–∞ –Ω–∞ —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—é —Å–µ–≥–æ–¥–Ω—è—à–Ω–µ–≥–æ –º–∞—Ç—á–∞ —Ç—é–º–µ–Ω—å...,"[—Å—Å—ã–ª–∫–∞, —Ç—Ä–∞–Ω—Å–ª—è—Ü–∏—è, —Å–µ–≥–æ–¥–Ω—è—à–Ω–∏–π, –º–∞—Ç—á, —Ç—é–º–µ–Ω—å..."
56830,5578,2019-01-13 06:49:43,-8775943,–ï–≤–≥–µ–Ω–∏—è –ë–∞—Å–∞–∫–æ–≤–∞: ¬´–†–∞–Ω–æ –ø–æ–≤–µ—Ä–∏–ª–∏ –≤ –ø–æ–±–µ–¥—É¬ª \n\...,9,2,0,–µ–≤–≥–µ–Ω–∏—è –±–∞—Å–∞–∫–æ–≤–∞ —Ä–∞–Ω–æ –ø–æ–≤–µ—Ä–∏–ª–∏ –≤ –ø–æ–±–µ–¥—É –≤ –æ—á–µ—Ä...,"[–µ–≤–≥–µ–Ω–∏—è, –±–∞—Å–∞–∫–æ–≤, —Ä–∞–Ω–æ, –ø–æ–≤–µ—Ä—è—Ç—å, –ø–æ–±–µ–¥–∞, –æ—á–µ..."
56831,5575,2019-01-13 06:16:13,-8775943,–ê–Ω–Ω–∞ –ü–æ—Å–ø–µ–ª–æ–≤–∞: ¬´–ë—ã–ª–∞ –∫–∞–∫–∞—è-—Ç–æ –ø–∞–Ω–∏–∫–∞¬ª \n\n¬´–ü—Ä...,14,1,2,–∞–Ω–Ω–∞ –ø–æ—Å–ø–µ–ª–æ–≤–∞ –±—ã–ª–∞ –∫–∞–∫–∞—è—Ç–æ –ø–∞–Ω–∏–∫–∞ –ø—Ä–∏–º–æ—Ä–æ—á–∫–∞ ...,"[–∞–Ω–Ω–∞, –ø–æ—Å–ø–µ–ª–æ–≤, –∫–∞–∫–∞—è—Ç–æ, –ø–∞–Ω–∏–∫–∞, –ø—Ä–∏–º–æ—Ä–æ—á–∫–∞, ..."
56832,2446,2019-01-13 04:13:14,-174469560,¬´–ü—Ä–∏–º–æ—Ä–æ—á–∫–∞¬ª –ø—Ä–æ–≤–µ–ª–∞ –ø–µ—Ä–≤—É—é –≤ —ç—Ç–æ–º –≥–æ–¥—É –≤—Å—Ç—Ä–µ—á...,0,0,0,–ø—Ä–∏–º–æ—Ä–æ—á–∫–∞ –ø—Ä–æ–≤–µ–ª–∞ –ø–µ—Ä–≤—É—é –≤ —ç—Ç–æ–º –≥–æ–¥—É –≤—Å—Ç—Ä–µ—á—É ...,"[–ø—Ä–∏–º–æ—Ä–æ—á–∫–∞, –ø—Ä–æ–≤–æ–¥–∏—Ç—å, –ø–µ—Ä–≤—ã–π, –≥–æ–¥, –≤—Å—Ç—Ä–µ—á–∞, ..."
56833,35786,2019-01-12 21:11:32,-66172360,–ù–∞—Ç–∞–ª—å—è –í–∞—Å–∏–ª—å—á–µ–Ω–∫–æ: ¬´–ù–∞—Å –ø–æ–¥–≤–µ–ª–∏ —Ç–æ–Ω–∫–æ—Å—Ç–∏ –ø—Å–∏...,0,0,0,–Ω–∞—Ç–∞–ª—å—è –≤–∞—Å–∏–ª—å—á–µ–Ω–∫–æ –Ω–∞—Å –ø–æ–¥–≤–µ–ª–∏ —Ç–æ–Ω–∫–æ—Å—Ç–∏ –ø—Å–∏—Ö–æ...,"[–Ω–∞—Ç–∞–ª—å—è, –≤–∞—Å–∏–ª—å—á–µ–Ω–∫–æ, –ø–æ–¥–≤–æ–¥–∏—Ç—å, —Ç–æ–Ω–∫–æ—Å—Ç—å, –ø—Å..."


In [21]:
data.to_csv('vk_posts_2011-2020_lemmatized.csv', encoding='utf-8')