In [7]:
import gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel

## Download data

Groups full data

In [8]:
groups_df = pd.read_csv('data/groups.csv', low_memory=False)
groups_df = groups_df.drop(columns=['Unnamed: 0', 'name', 'status', 'description', 'activity', 'members_count'])

In [9]:
groups_df

Unnamed: 0,user_id,group_id
0,145,34168005
1,145,34483558
2,145,63133420
3,145,62524274
4,145,212382
...,...,...
6749302,638374890,301419447
6749303,638438887,29425083
6749304,638438887,25882042
6749305,638438887,122152852


Groups texts with nouns only

In [10]:
processed_groups = pd.read_csv('data/processed/bspb_groups_nouns.csv')
processed_groups['clear_texts'] = processed_groups['nouns_name'] + ' ' + processed_groups['nouns_description']

# Remove NaN values
processed_groups = processed_groups.loc[processed_groups['clear_texts'].notna()]

In [11]:
processed_groups

Unnamed: 0.1,Unnamed: 0,id,name,status,description,activity,members_count,clear_name,clear_description,nouns_name,nouns_description,clear_texts
0,0,1,–í–ö–æ–Ω—Ç–∞–∫—Ç–µ API,–ü–ª–∞—Ç—Ñ–æ—Ä–º–∞ –¥–ª—è —Ä–∞–∑—Ä–∞–±–æ—Ç—á–∏–∫–æ–≤ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π,–ì—Ä—É–ø–ø–∞ –ø–æ—Å–≤—è—â–µ–Ω–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–µ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–π –Ω–∞ –æ—Å–Ω–æ...,Open group,383414.0,–≤–∫–æ–Ω—Ç–∞–∫—Ç–µ api,–≥—Ä—É–ø–ø–∞ –ø–æ—Å–≤—è—â–∞—Ç—å —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –æ—Å–Ω–æ–≤–∞ ...,–≤–∫–æ–Ω—Ç–∞–∫—Ç–µ,–≥—Ä—É–ø–ø–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –æ—Å–Ω–æ–≤–∞ –ø–ª–∞—Ç—Ñ–æ—Ä–º–∞ ...,–≤–∫–æ–Ω—Ç–∞–∫—Ç–µ –≥—Ä—É–ø–ø–∞ —Ä–∞–∑—Ä–∞–±–æ—Ç–∫–∞ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–µ –æ—Å–Ω–æ–≤–∞ ...
1,1,113246211,"–°–∞–π—Ç, –≤–∏–¥–µ–æ–º–æ–Ω—Ç–∞–∂, –ø–æ—Ä—Ç—Ñ–æ–ª–∏–æ –¥–ª—è —É—á–∏—Ç–µ–ª–µ–π, –≤–æ—Å–ø.","–ó–∞–∫–∞–∑ –ü–û–†–¢–§–û–õ–ò–û, –≤–∏–¥–µ–æ–ø–æ–∑–¥—Ä–∞–≤–ª–µ–Ω–∏–π, –≤–∏–¥–µ–æ–ø—Ä–µ–∑–µ...","–í—ã–ø–æ–ª–Ω—è—é —Ä–∞–±–æ—Ç—ã –ø–æ —Å–æ–∑–¥–∞–Ω–∏—é —Å–∞–π—Ç–æ–≤, –ø–æ—Ä—Ç—Ñ–æ–ª–∏–æ,...",Other services,225.0,—Å–∞–π—Ç –≤–∏–¥–µ–æ–º–æ–Ω—Ç–∞–∂ –ø–æ—Ä—Ç—Ñ–æ–ª–∏–æ —É—á–∏—Ç–µ–ª—å –≤–æ—Å–ø–∞,–≤—ã–ø–æ–ª–Ω—è—Ç—å —Ä–∞–±–æ—Ç–∞ —Å–æ–∑–¥–∞–Ω–∏–µ —Å–∞–π—Ç –ø–æ—Ä—Ç—Ñ–æ–ª–∏–æ –≤–∏–¥–µ–æ...,—Å–∞–π—Ç –≤–∏–¥–µ–æ–º–æ–Ω—Ç–∞–∂ –ø–æ—Ä—Ç—Ñ–æ–ª–∏–æ —É—á–∏—Ç–µ–ª—å –≤–æ—Å–ø–∞,—Ä–∞–±–æ—Ç–∞ —Å–æ–∑–¥–∞–Ω–∏–µ —Å–∞–π—Ç –ø–æ—Ä—Ç—Ñ–æ–ª–∏–æ –≤–∏–¥–µ–æ—Ä–æ–ª–∏–∫ –≤–∏–¥–µ...,—Å–∞–π—Ç –≤–∏–¥–µ–æ–º–æ–Ω—Ç–∞–∂ –ø–æ—Ä—Ç—Ñ–æ–ª–∏–æ —É—á–∏—Ç–µ–ª—å –≤–æ—Å–ø–∞ —Ä–∞–±–æ—Ç...
2,2,180355074,–û—Ä–µ—à–∫–∏–Ω –ª–µ—Å.–£–∫—Ä–∞—à–µ–Ω–∏—è –∏–∑ –¥–µ—Ä–µ–≤–∞ –æ—Ç –ê–ª–µ–∫—Å–∞–Ω–¥—Ä–∏–Ω—ã.,–ü—Ä–∏–≤–µ—Ç.\t–ú–µ–Ω—è –∑–æ–≤—É—Ç –ê–ª–µ–∫—Å–∞–Ω–¥—Ä–∏–Ω–∞ .–Ø —Å–æ–∑–¥–∞—é —É–∫—Ä...,"–ñ–∏—Ç–µ–ª–∏ —ç—Ç–æ–≥–æ –ª–µ—Å–∞ - —É–∫—Ä–∞—à–µ–Ω–∏—è ,–æ–Ω–∏ —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞—é...",Artist,1929.0,–æ—Ä–µ—à–∫–∏–Ω –ª–µ—Å —É–∫—Ä–∞—à–µ–Ω–∏–µ –¥–µ—Ä–µ–≤–æ –∞–ª–µ–∫—Å–∞–Ω–¥—Ä–∏–Ω–∞,–∂–∏—Ç–µ–ª—å –ª–µ—Å —É–∫—Ä–∞—à–µ–Ω–∏–µ —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞—Ç—å –∏—Å—Ç–æ—Ä–∏—è –ª–µ–≥–µ...,–æ—Ä–µ—à–∫–∏–Ω –ª–µ—Å —É–∫—Ä–∞—à–µ–Ω–∏–µ –¥–µ—Ä–µ–≤–æ –∞–ª–µ–∫—Å–∞–Ω–¥—Ä–∏–Ω–∞,–∂–∏—Ç–µ–ª—å –ª–µ—Å —É–∫—Ä–∞—à–µ–Ω–∏–µ –∏—Å—Ç–æ—Ä–∏—è –ª–µ–≥–µ–Ω–¥–∞ –ª—é–±–æ–≤—å —Ç–µ...,–æ—Ä–µ—à–∫–∏–Ω –ª–µ—Å —É–∫—Ä–∞—à–µ–Ω–∏–µ –¥–µ—Ä–µ–≤–æ –∞–ª–µ–∫—Å–∞–Ω–¥—Ä–∏–Ω–∞ –∂–∏—Ç–µ...
9,9,192937987,–ì—Ä–∏–Ω–≤—ç–π –ö—Ä–æ–Ω—à—Ç–∞–¥—Ç,,‚òòÔ∏è GreenWay - —ç—Ç–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–µ –∏ –¥–æ—Å—Ç—É–ø–Ω—ã–µ –≠–ö–û -...,Household Products,105.0,–≥—Ä–∏–Ω–≤—ç–π –∫—Ä–æ–Ω—à—Ç–∞–¥—Ç,__u2618__ greenway —ç—Ç–æ —É–Ω–∏–∫–∞–ª—å–Ω—ã–π –¥–æ—Å—Ç—É–ø–Ω—ã–π —ç–∫...,–≥—Ä–∏–Ω–≤—ç–π –∫—Ä–æ–Ω—à—Ç–∞–¥—Ç,—ç–∫–æ –ø—Ä–æ–¥—É–∫—Ç –∂–∏–∑–Ω—å –ø–æ–º–æ—â—å –Ω–∞–Ω–æ—Ç–µ—Ö–Ω–æ–ª–æ–≥–∏—è —á–∏—Å—Ç–æ—Ç...,–≥—Ä–∏–Ω–≤—ç–π –∫—Ä–æ–Ω—à—Ç–∞–¥—Ç —ç–∫–æ –ø—Ä–æ–¥—É–∫—Ç –∂–∏–∑–Ω—å –ø–æ–º–æ—â—å –Ω–∞–Ω...
11,11,14,–î–æ–º –ö—É–ª—å—Ç—É—Ä—ã,–î–æ–º –∫—É–ª—å—Ç—É—Ä—ã,–ê–Ω–æ–Ω—Å—ã –∫—É–ª—å—Ç—É—Ä–Ω—ã—Ö —Å–æ–±—ã—Ç–∏–π –ü–µ—Ç–µ—Ä–±—É—Ä–≥–∞ (–∏ –Ω–µ–º–Ω–æ–∂...,Community Center,5991.0,–¥–æ–º –∫—É–ª—å—Ç—É—Ä–∞,–∞–Ω–æ–Ω—Å –∫—É–ª—å—Ç—É—Ä–Ω—ã–π —Å–æ–±—ã—Ç–∏–µ –ø–µ—Ç–µ—Ä–±—É—Ä–≥ –Ω–µ–º–Ω–æ–∂–∫–æ –º–æ...,–¥–æ–º –∫—É–ª—å—Ç—É—Ä–∞,–∞–Ω–æ–Ω—Å —Å–æ–±—ã—Ç–∏–µ –ø–µ—Ç–µ—Ä–±—É—Ä–≥ –º–æ—Å–∫–≤–∞,–¥–æ–º –∫—É–ª—å—Ç—É—Ä–∞ –∞–Ω–æ–Ω—Å —Å–æ–±—ã—Ç–∏–µ –ø–µ—Ç–µ—Ä–±—É—Ä–≥ –º–æ—Å–∫–≤–∞
...,...,...,...,...,...,...,...,...,...,...,...,...
912471,912471,64449308,–ú—É–∑–µ–π–Ω—ã–π —Ü–µ–Ω—Ç—Ä –í–∞–ø—Ä–∏–∏–∫–∫–∏ | –¢–∞–º–ø–µ—Ä–µ | –§–∏–Ω–ª—è–Ω–¥–∏—è,–ú—É–∑–µ–π | –°–∞–π—Ç –º—É–∑–µ—è –Ω–∞ —Ä—É—Å—Å–∫–æ–º —è–∑—ã–∫–µ: www.vapri...,–ú—É–∑–µ–π–Ω—ã–π —Ü–µ–Ω—Ç—Ä –í–∞–ø—Ä–∏–∏–∫–∫–∏ –≤ —Ü–µ–Ω—Ç—Ä–µ –≥–æ—Ä–æ–¥–∞ –¢–∞–º–ø–µ...,"Museum, gallery, exhibition",183.0,–º—É–∑–µ–π–Ω—ã–π —Ü–µ–Ω—Ç—Ä –≤–∞–ø—Ä–∏–∏–∫–∫–∏ —Ç–∞–º–ø–µ—Ä–µ —Ñ–∏–Ω–ª—è–Ω–¥–∏—è,–º—É–∑–µ–π–Ω—ã–π —Ü–µ–Ω—Ç—Ä –≤–∞–ø—Ä–∏–∏–∫–∫–∏ —Ü–µ–Ω—Ç—Ä –≥–æ—Ä–æ–¥ —Ç–∞–º–ø–µ—Ä–µ –ø...,—Ü–µ–Ω—Ç—Ä –≤–∞–ø—Ä–∏–∏–∫–∫–∏ —Ç–∞–º–ø–µ—Ä–µ —Ñ–∏–Ω–ª—è–Ω–¥–∏—è,—Ü–µ–Ω—Ç—Ä –≤–∞–ø—Ä–∏–∏–∫–∫–∏ —Ü–µ–Ω—Ç—Ä –≥–æ—Ä–æ–¥ —Ç–∞–º–ø–µ—Ä–µ —Å–µ–º—å—è –±–∏–ª–µ...,—Ü–µ–Ω—Ç—Ä –≤–∞–ø—Ä–∏–∏–∫–∫–∏ —Ç–∞–º–ø–µ—Ä–µ —Ñ–∏–Ω–ª—è–Ω–¥–∏—è —Ü–µ–Ω—Ç—Ä –≤–∞–ø—Ä–∏–∏...
912476,912476,104857566,"–ü–†–û–î–ê–ñ–ê,–ü–û–ö–£–ü–ö–ê –∏ –û–ë–ú–ï–ù –ê–í–¢–û–ú–û–ë–ò–õ–ï–ô .(AUTOHOUSE)",–¢–µ–ª. 8-953-231-48-00,"- –û—Ñ–æ—Ä–º–ª–µ–Ω–∏–µ, –¥–æ–≥–æ–≤–æ—Ä–∞ –∫—É–ø–ª–∏-–ø—Ä–æ–¥–∞–∂–∏ –≤ –ª—é–±–æ–µ –≤...",Transport,135.0,–ø—Ä–æ–¥–∞–∂–∞ –ø–æ–∫—É–ø–∫–∞ –æ–±–º–µ–Ω –∞–≤—Ç–æ–º–æ–±–∏–ª—å autohouse,–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏–µ –¥–æ–≥–æ–≤–æ—Ä –∫—É–ø–ª—è –ø—Ä–æ–¥–∞–∂–∞ –ª—é–±–æ–π –≤—Ä–µ–º—è –ø...,–ø—Ä–æ–¥–∞–∂–∞ –ø–æ–∫—É–ø–∫–∞ –æ–±–º–µ–Ω –∞–≤—Ç–æ–º–æ–±–∏–ª—å,–æ—Ñ–æ—Ä–º–ª–µ–Ω–∏–µ –¥–æ–≥–æ–≤–æ—Ä –∫—É–ø–ª—è –ø—Ä–æ–¥–∞–∂–∞ –≤—Ä–µ–º—è –ø—Ä–æ–¥–∞–∂–∞...,–ø—Ä–æ–¥–∞–∂–∞ –ø–æ–∫—É–ø–∫–∞ –æ–±–º–µ–Ω –∞–≤—Ç–æ–º–æ–±–∏–ª—å –æ—Ñ–æ—Ä–º–ª–µ–Ω–∏–µ –¥–æ...
912480,912480,83908177,–ü—Ä–æ–¥—É–∫—Ç—ã —Å —Ñ–µ—Ä–º—ã,–§–µ—Ä–º–µ—Ä—Å–∫–∏–µ –ø—Ä–æ–¥—É–∫—Ç—ã,–ù–∞—Ç—É—Ä–∞–ª—å–Ω—ã–µ –ø—Ä–æ–¥—É–∫—Ç—ã —Å —Ñ–µ—Ä–º—ã\n\n–ú—ã –æ–±–µ—Å–ø–µ—á–∏–≤–∞–µ...,Internet store,847.0,–ø—Ä–æ–¥—É–∫—Ç —Ñ–µ—Ä–º–∞,–Ω–∞—Ç—É—Ä–∞–ª—å–Ω—ã–π –ø—Ä–æ–¥—É–∫—Ç —Ñ–µ—Ä–º–∞ –æ–±–µ—Å–ø–µ—á–∏–≤–∞—Ç—å —Å–≤–æ–π –∫–ª...,–ø—Ä–æ–¥—É–∫—Ç —Ñ–µ—Ä–º–∞,–ø—Ä–æ–¥—É–∫—Ç —Ñ–µ—Ä–º–∞ –∫–ª–∏–µ–Ω—Ç –ø—Ä–æ–¥—É–∫—Ü–∏—è —Å—ã—Ä —Å—É–ª—É–≥—É–Ω–∏ —Å—ã...,–ø—Ä–æ–¥—É–∫—Ç —Ñ–µ—Ä–º–∞ –ø—Ä–æ–¥—É–∫—Ç —Ñ–µ—Ä–º–∞ –∫–ª–∏–µ–Ω—Ç –ø—Ä–æ–¥—É–∫—Ü–∏—è —Å...
912485,912485,54525929,–ë–∞–π–∫–æ–Ω—É—Ä | –ù–æ–≤–æ—Å—Ç–∏ | –û–±—ä—è–≤–ª–µ–Ω–∏—è - 94info,–í—Å–µ –æ–±—ä—è–≤–ª–µ–Ω–∏—è –ë–∞–π–∫–æ–Ω—É—Ä–∞ –≤ –æ–¥–Ω–æ–º –º–µ—Å—Ç–µ!,–î–ª—è —Ä–∞–∑–º–µ—â–µ–Ω–∏—è –æ–±—ä—è–≤–ª–µ–Ω–∏—è –≤ –≥—Ä—É–ø–ø–µ –Ω–∞–∂–∏–º–∞–µ–º ''...,City community,47036.0,–±–∞–π–∫–æ–Ω—É—Ä –Ω–æ–≤–æ—Å—Ç—å –æ–±—ä—è–≤–ª–µ–Ω–∏–µ,—Ä–∞–∑–º–µ—â–µ–Ω–∏–µ –æ–±—ä—è–≤–ª–µ–Ω–∏–µ –≥—Ä—É–ø–ø–∞ –Ω–∞–∂–∏–º–∞—Ç—å __u0001f...,–±–∞–π–∫–æ–Ω—É—Ä –Ω–æ–≤–æ—Å—Ç—å –æ–±—ä—è–≤–ª–µ–Ω–∏–µ,—Ä–∞–∑–º–µ—â–µ–Ω–∏–µ –æ–±—ä—è–≤–ª–µ–Ω–∏–µ –≥—Ä—É–ø–ø–∞ –∑–∞–ø–∏—Å—å –ø—Ä–∞–≤–∏–ª–æ —Ä–µ...,–±–∞–π–∫–æ–Ω—É—Ä –Ω–æ–≤–æ—Å—Ç—å –æ–±—ä—è–≤–ª–µ–Ω–∏–µ —Ä–∞–∑–º–µ—â–µ–Ω–∏–µ –æ–±—ä—è–≤–ª–µ...


Posts and reposts

In [12]:
processed_reposts = pd.read_csv('data/processed/bspb_reposts_nouns.csv')
processed_posts = pd.read_csv('data/processed/bspb_posts_nouns.csv')

# Remove NaN values
processed_reposts = processed_reposts.loc[processed_reposts['nouns'].notna()]
processed_posts = processed_posts.loc[processed_posts['nouns'].notna()]

In [13]:
processed_reposts

Unnamed: 0.1,Unnamed: 0,owner_id,post_id,text,clear_text,nouns
0,0,273694109,826,–†–û–°–°–ò–Ø: –û–°–¢–ê–ù–û–í–ò–¢–¨ –ó–ê–ö–û–ù–û–ü–†–û–ï–ö–¢ –û –ü–†–ò–ù–£–î–ò–¢–ï–õ–¨–ù...,—Ä–æ—Å—Å–∏—è –æ—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞—Ç—å –∑–∞–∫–æ–Ω–æ–ø—Ä–æ–µ–∫—Ç –ø—Ä–∏–Ω—É–¥–∏—Ç–µ–ª—å–Ω...,—Ä–æ—Å—Å–∏—è –∑–∞–∫–æ–Ω–æ–ø—Ä–æ–µ–∫—Ç –≤–∞–∫—Ü–∏–Ω–∞—Ü–∏—è –Ω–æ—è–±—Ä—å –≥–æ—Å –¥—É–º–∞...
1,1,116524602,10482,–°–ø–∏co–∫ ca–º—ã—Ö —ç—Ñ—Ñe–∫—Ç–∏–≤–Ω—ã—Ö –ªe–∫a—Äc—Ç–≤ –Ωa —Äa–∑–Ω—ãe c–ª...,—Å–ø–∏—Å–æ–∫ —Å–∞–º—ã–π —ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω—ã–π –ª–µ–∫–∞—Ä—Å—Ç–≤–æ —Ä–∞–∑–Ω—ã–π —Å–ª—É—á...,—Å–ø–∏—Å–æ–∫ –ª–µ–∫–∞—Ä—Å—Ç–≤–æ —Å–ª—É—á–∞–π –∂–∏–∑–Ω—å
2,2,15835662,1127,"–ö–æ–º—É –º–µ–¥–∞? –µ—â–µ –æ—Å—Ç–∞–ª—Å—è, –ø—Ä—è–º–∏–∫–æ–º –∏–∑ –¢–∞–π–≥–∏.\n–ó–∞...",–º–µ–¥ –æ—Å—Ç–∞–≤–∞—Ç—å—Å—è –ø—Ä—è–º–∏–∫–æ–º —Ç–∞–π–≥–∞ –∫–∞—á–µ—Å—Ç–≤–æ —Ä—É—á–∞—Ç—å—Å...,–º–µ–¥ —Ç–∞–π–≥–∞ –∫–∞—á–µ—Å—Ç–≤–æ –ª–∏—á–∫–∞
3,3,31868444,6676,22 –ú–ê–Ø | 20:00\n–¢–∞–Ω—Ü–µ–≤–∞–ª—å–Ω—ã–π —Å–ø–µ–∫—Ç–∞–∫–ª—å ¬´–ø–ª–∞—á/—å...,–º–∞–π —Ç–∞–Ω—Ü–µ–≤–∞–ª—å–Ω—ã–π —Å–ø–µ–∫—Ç–∞–∫–ª—å –ø–ª–∞—á —å –±–∏–ª–µ—Ç dancek...,–º–∞–π —Å–ø–µ–∫—Ç–∞–∫–ª—å –ø–ª–∞—á –±–∏–ª–µ—Ç —Å—á–∞—Å—Ç–∏–µ —Å—á–∞—Å—Ç–∏–µ —Å–≤–æ–±–æ...
4,4,31868444,6673,"–°–µ–≥–æ–¥–Ω—è –¥–µ–Ω—å —Ä–æ–∂–¥–µ–Ω–∏—è [id31868444|–î–∞—à–∏], –Ω–∞—à–µ–≥...",—Å–µ–≥–æ–¥–Ω—è –¥–µ–Ω—å —Ä–æ–∂–¥–µ–Ω–∏–µ –¥–∞—à–∞ –Ω–∞—à —Ñ–∏—Ç–Ω–µ—Å —Ç—Ä–µ–Ω–µ—Ä –¥...,–¥–µ–Ω—å —Ä–æ–∂–¥–µ–Ω–∏–µ –¥–∞—à–∞ —Ñ–∏—Ç–Ω–µ—Å —Ç—Ä–µ–Ω–µ—Ä —Ä–∞—Å—Ç—è–∂–∫–∞ —É—Ç—Ä–æ...
...,...,...,...,...,...,...
6390,6390,554424,5189,–î–∞–≤–∞–π—Ç–µ —Å–ø–∞—Å–∞—Ç—å –ø–ª–∞–Ω–µ—Ç—É!,–¥–∞–≤–∞—Ç—å —Å–ø–∞—Å–∞—Ç—å –ø–ª–∞–Ω–µ—Ç–∞,–ø–ª–∞–Ω–µ—Ç–∞
6393,6393,1824199,3202,–ü–æ–¥–∞—Ä–æ—á–Ω—ã–π —Å–µ—Ä—Ç–∏—Ñ–∏–∫–∞—ÇüèÉ‚ÄçüèÉ‚ÄçüèÉ‚Äç\n\n22 –¥–µ–∫–∞–±—Ä—è ‚Äì –∫–∞...,–ø–æ–¥–∞—Ä–æ—á–Ω—ã–π —Å–µ—Ä—Ç–∏—Ñ–∏–∫–∞—Ç __u0001f3c3__ __u0001f3c...,—Å–µ—Ä—Ç–∏—Ñ–∏–∫–∞—Ç –¥–µ–∫–∞–±—Ä—å –∫–∞—Ç–∞—Å—Ç—Ä–æ—Ñ–∞ –≥–æ—Ä–æ–¥ —Å—É–≤–µ–Ω–∏—Ä –ø–∞...
6394,6394,267284787,396,–£—á–∏—Å—å —Å—Ç—É–¥–µ–Ω—Ç,—É—á–∏—Ç—å—Å—è —Å—Ç—É–¥–µ–Ω—Ç,—Å—Ç—É–¥–µ–Ω—Ç
6395,6395,17665225,1333,–ë–µ–¥–∞ –Ω–µ –ø—Ä–∏—Ö–æ–¥–∏—Ç –æ–¥–Ω–∞ üòÜ,–±–µ–¥–∞ –ø—Ä–∏—Ö–æ–¥–∏—Ç—å __u0001f606__,–±–µ–¥–∞


In [14]:
processed_posts

Unnamed: 0.1,Unnamed: 0,from_id,post_id,text,clear_text,nouns
1,1,145,2292,"–°–∫–æ–ª—å–∫–æ –∂–µ –≤—Å–µ–≥–æ –∏–∑–º–µ–Ω–∏–ª–æ—Å—å –∑–∞ —ç—Ç–æ –≤—Ä–µ–º—è, 7 –ª–µ...",—Å–∫–æ–ª—å–∫–æ –∏–∑–º–µ–Ω—è—Ç—å—Å—è –≤—Ä–µ–º—è –≥–æ–¥ –ø—Ä–æ—Ö–æ–¥–∏—Ç—å —Ä–æ–∂–¥–∞—Ç—å...,–≤—Ä–µ–º—è –≥–æ–¥ —Å—ã–Ω –±–∏–∑–Ω–µ—Å –≤–µ—â—å —á–µ–ª—Å–∏ —Å–µ—Ä–¥—Ü–µ –∫–ª—É–± –∏–≥...
2,2,145,2292,"–ñ–∞–ª—å, —á—Ç–æ –∏–∑-–∑–∞ –ø–æ–ª–∏—Ç–∏–∫–∏ –ê–±—Ä–∞–º–æ–≤–∏—á–∞ –∫–ª—É–± —Ç–∞–∫ –∏...",–∂–∞–ª—å –ø–æ–ª–∏—Ç–∏–∫–∞ –∞–±—Ä–∞–º–æ–≤–∏—á –∫–ª—É–± –Ω–∞—á–∏–Ω–∞—Ç—å –∏–≥—Ä–∞—Ç—å —Å...,–ø–æ–ª–∏—Ç–∏–∫–∞ –∞–±—Ä–∞–º–æ–≤–∏—á –∫–ª—É–± –Ω–∞–¥–µ–∂–¥–∞ —á–µ–ª—Å–∏ —Å–ª–∞–≤–∞
3,3,145,2290,–ü–æ–ª—Ç–æ—Ä–∞ –≥–æ–¥–∞ –Ω–∞–∑–∞–¥ –∏—Å–∫–∞–ª —Ä–µ—à–µ–Ω–∏–µ –Ω–µ–∫–æ—Ç–æ—Ä—ã—Ö —Å–≤–æ...,–ø–æ–ª—Ç–æ—Ä–∞ –≥–æ–¥ –Ω–∞–∑–∞–¥ –∏—Å–∫–∞—Ç—å —Ä–µ—à–µ–Ω–∏–µ –Ω–µ–∫–æ—Ç–æ—Ä—ã–π —Å–≤–æ...,–≥–æ–¥ —Ä–µ—à–µ–Ω–∏–µ –ø—Ä–æ–±–ª–µ–º–∞ –ø—Ä–æ—Å—Ç–æ—Ä –∏–Ω—Ç–µ—Ä–Ω–µ—Ç –≤–∏–∫—Ç–æ—Ä –ø...
4,4,145,2290,–ù–∞ –¥–∞–Ω–Ω—ã–π –º–æ–º–µ–Ω—Ç –¥—Ä—É–∂–∏–º —Å–µ–º—å—è–º–∏ –∏ –±–µ–∑—É–º–Ω–æ –±–ª–∞–≥...,–¥–∞–Ω–Ω—ã–π –º–æ–º–µ–Ω—Ç –¥—Ä—É–∂–∏—Ç—å —Å–µ–º—å—è –±–µ–∑—É–º–Ω–æ –±–ª–∞–≥–æ–¥–∞—Ä–Ω—ã...,–º–æ–º–µ–Ω—Ç —Å–µ–º—å—è —Å—É–¥—å–±–∞ —á–µ–ª–æ–≤–µ–∫
5,5,145,2290,–ö —Å–æ–∂–∞–ª–µ–Ω–∏—é –Ω–µ –∏–º–µ—é –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç–∏ –ø–æ–∑–¥—Ä–∞–≤–∏—Ç—å –ª–∏—á...,—Å–æ–∂–∞–ª–µ–Ω–∏–µ –∏–º–µ—Ç—å –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –ø–æ–∑–¥—Ä–∞–≤–ª—è—Ç—å –ª–∏—á–Ω–æ ...,—Å–æ–∂–∞–ª–µ–Ω–∏–µ –≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –≥–æ–¥ –∂–∏–∑–Ω—å —Ä–µ–±–µ–Ω–æ–∫ —É—Å–ø–µ—Ö
...,...,...,...,...,...,...
810835,810835,619951213,49,–æ–Ω–∞ –≤–º–µ—Å—Ç–∏—Ç —Å–µ–±–µ –≤ –≥–ª–æ—Ç–∫—É —Ä–µ–≤–æ–ª—å–≤–µ—Ä,–≤–º–µ—â–∞—Ç—å –≥–ª–æ—Ç–∫–∞ —Ä–µ–≤–æ–ª—å–≤–µ—Ä,–≥–ª–æ—Ç–∫–∞ —Ä–µ–≤–æ–ª—å–≤–µ—Ä
810836,810836,619951213,47,–æ–≥–æ–Ω—å –¥–∞ –º—è—Å–æ,–æ–≥–æ–Ω—å –º—è—Å–æ,–æ–≥–æ–Ω—å –º—è—Å–æ
810837,810837,619951213,35,–¢–≤–æ–π –∞—Ä–æ–º–∞—Ç.,—Ç–≤–æ–π –∞—Ä–æ–º–∞—Ç,–∞—Ä–æ–º–∞—Ç
810838,810838,620178523,449,–í–≤–æ–¥–∏ –∫–æ–¥ 6s2jq8 –∏ –ø–æ–ª—É—á–∞–π 5 —Ä—É–±!,–≤–≤–æ–¥–∏—Ç—å –∫–æ–¥ –ø–æ–ª—É—á–∞—Ç—å —Ä—É–±,–∫–æ–¥ —Ä—É–±


## Analyze users

In [17]:
lda_model_groups = gensim.models.ldamodel.LdaModel.load('models/lda_model_groups_notna_13.model')
lda_model_posts_reposts = gensim.models.ldamodel.LdaModel.load('models/lda_model_posts_reposts_nouns_notna_12.model')

In [18]:
# An auxiliary function for getting corpus from DF
def get_corpus(series):
    texts_list = [str(values).split() for values in series]
    id2word = corpora.Dictionary(texts_list)
    corpus = [id2word.doc2bow(text) for text in texts_list]
    return corpus

In [19]:
# Function for getting vectors of user's activities and opinions (posts and reposts) and interests (groups)
def analyze_user(user_id):
    
    user_posts = processed_posts.loc[processed_posts['from_id'] == int(user_id)]
    user_reposts = processed_reposts.loc[processed_reposts['owner_id'] == int(user_id)]
    user_posts_reposts = pd.concat([user_posts['nouns'], user_reposts['nouns']])
    if len(user_posts_reposts) > 0:
        posts_reposts_corpus = get_corpus(user_posts_reposts)
        lda_model_activities = lda_model_posts_reposts
        posts_reposts_topics = lda_model_activities[posts_reposts_corpus]
        posts_reposts_topics = posts_reposts_topics[0]
        
    else:
        posts_reposts_topics = [(0, None),
                                (1, None),
                                (2, None),
                                (3, None),
                                (4, None),
                                (5, None),
                                (6, None),
                                (7, None),
                                (8, None),
                                (9, None),
                                (10, None),
                                (11, None)]
    
    user_groups_list = list(groups_df.loc[groups_df['user_id'] == str(user_id)]['group_id'].values)
    user_groups_list = [int(group) for group in user_groups_list]
    user_groups = processed_groups.loc[processed_groups['id'].isin(user_groups_list)]
    if len(user_groups) > 0:    
        groups_corpus = get_corpus(user_groups['clear_texts'])
        lda_model_interests = lda_model_groups
        groups_topics = lda_model_interests[groups_corpus]
        groups_topics = groups_topics[0]
    else:
        groups_topics = [(0, None),
                                (1, None),
                                (2, None),
                                (3, None),
                                (4, None),
                                (5, None),
                                (6, None),
                                (7, None),
                                (8, None),
                                (9, None),
                                (10, None),
                                (11, None),
                                (12, None)]
    return posts_reposts_topics, groups_topics

In [20]:
analyze_user(17039363)[1][0]

[(0, 0.020074721),
 (1, 0.019182287),
 (2, 0.30670038),
 (4, 0.5829292),
 (9, 0.018949088),
 (12, 0.036696296)]

### Collect set of users

In [21]:
users_with_posts = set(processed_posts['from_id'])

# Convert ids into int
users_with_posts = [int(user_id) for user_id in users_with_posts]
len(users_with_posts)

12546

In [22]:
users_with_reposts = set(processed_reposts['owner_id'])

# Convert ids into int
users_with_reposts = [int(user_id) for user_id in users_with_reposts]
len(users_with_reposts)

2580

In [38]:
users_with_groups = set(groups_df['user_id'])
users_with_groups = list(users_with_groups)

# Remove nan
users_with_groups = users_with_groups[1:]
# Remove user_id
user_id_ind = users_with_groups.index('user_id')
users_with_groups.pop(10228)

# Convert ids into int
users_with_groups = [int(user_id) for user_id in users_with_groups]
len(users_with_groups)

28970

Final set of users

In [39]:
users = (set(users_with_posts).union(set(users_with_reposts))).union(set(users_with_groups))
len(users)

28997

In [271]:
topics_posts_reposts = []
topics_groups = []

In [None]:
for u in users:  
    t_posts_reposts, t_groups = analyze_user(u)
    topics_posts_reposts.append(t_posts_reposts)
    topics_groups.append(t_groups)

In [282]:
POSTS_REPOSTS_MODEL_TOPICS_NUM = 12
GROUPS_MODEL_TOPICS_NUM = 13

In [285]:
final_df = pd.DataFrame()
final_df['user_id'] = users

for topic_n in range(POSTS_REPOSTS_MODEL_TOPICS_NUM):
    tops_n = []
    for topic in topics_posts_reposts:
        flag = False
        
        if len(topic) != POSTS_REPOSTS_MODEL_TOPICS_NUM:
            topic = topic[0]
        for item in topic:
            if item[0] == topic_n:
                tops_n.append(item[1])
                flag = True
        
        if not flag:
            tops_n.append(None)

    final_df[f'posts_reposts_{topic_n}'] = tops_n

for topic_n in range(GROUPS_MODEL_TOPICS_NUM):
    tops_n = []
    for topic in topics_groups:
        flag = False
        
        if len(topic) != GROUPS_MODEL_TOPICS_NUM:
            topic = topic[0]
        for item in topic:
            if item[0] == topic_n:
                tops_n.append(item[1])
                flag = True
        
        if not flag:
            tops_n.append(None)

    final_df[f'groups_{topic_n}'] = tops_n

In [286]:
final_df

Unnamed: 0,user_id,posts_reposts_0,posts_reposts_1,posts_reposts_2,posts_reposts_3,posts_reposts_4,posts_reposts_5,posts_reposts_6,posts_reposts_7,posts_reposts_8,...,groups_3,groups_4,groups_5,groups_6,groups_7,groups_8,groups_9,groups_10,groups_11,groups_12
0,17489295,0.053998,0.040320,0.061113,0.062024,0.053151,0.067862,0.314564,0.059733,0.077446,...,,0.456763,,,,0.017124,,,,
1,91020,0.058680,0.043816,0.066413,0.067403,0.057760,0.073747,0.255121,0.064913,0.084162,...,,0.629385,,,,,0.041771,,,
2,17039363,0.064255,0.047979,0.072723,0.073806,0.063248,0.080753,0.184357,0.071080,0.092158,...,,0.581425,,,,,0.018878,,,0.038319
3,2430764,0.064255,0.047979,0.072723,0.073806,0.063248,0.080753,0.184357,0.071080,0.092158,...,,0.647599,,,,0.011934,,,,
4,76546053,0.064255,0.047979,0.072723,0.073806,0.063248,0.080753,0.184357,0.071080,0.092158,...,0.064162,0.354159,,,,,0.049147,,,0.058909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41631,177864684,0.053998,0.040320,0.061113,0.062024,0.053151,0.067862,0.314564,0.059733,0.077446,...,0.014069,0.296296,0.012418,0.013902,0.013982,0.030362,0.017184,,0.017058,0.016408
41632,18117179,0.064255,0.047979,0.072723,0.073806,0.063248,0.080753,0.184357,0.071080,0.092158,...,0.020940,0.029172,0.018496,0.020703,0.020822,0.044191,0.025595,0.014819,0.025332,0.024431
41633,24248311,0.064255,0.047979,0.072723,0.073806,0.063248,0.080753,0.184357,0.071080,0.092158,...,,0.647638,,,,0.011934,,,,
41634,1216919,0.064255,0.047979,0.072723,0.073806,0.063248,0.080753,0.184357,0.071080,0.092158,...,,0.662201,,,,0.012472,,,,


In [287]:
final_df.to_csv('users_activities_groups.csv')