<a href="https://colab.research.google.com/github/Newborn1937/zdetect/blob/main/vk_group_members_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Initialization

In [1]:
# Install vk api
!pip -q install vk==2.0.2

import gdown

# Load utility for high-level interaction with vk
url = 'https://raw.githubusercontent.com/Newborn1937/zdetect/main/vk_utils.py'
gdown.download(url, quiet=True)

# Patch the installed vk/utils.py to import from collections.abc
!sed -i 's/from collections import Iterable/from collections.abc import Iterable/' /usr/local/lib/python3.12/dist-packages/vk/utils.py

import importlib, sys
importlib.invalidate_caches()

import vk_utils

# Initialize vk api
vk = vk_utils.VkAPI()

import datetime
import os
from pathlib import Path
import pickle
import re

from google.colab import drive
from google.colab import files
import numpy as np
import pandas as pd
import plotly.express as px
import urllib
from tqdm import tqdm

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for vk (setup.py) ... [?25l[?25hdone


  templ_str = 'ya:created dc:date="([\d]+-[\d]+-[\d]+)T'


#### Settings

In [2]:
# Google Drive folder link with the source data, passed to `gdown --folder`
# NOTE(review): 'GDRIVE_LINK' looks like a placeholder - replace it with the real link
SRC_DATA_URL = 'GDRIVE_LINK'
# Local directory the source data is downloaded into (the `-O` target of gdown)
DIRPATH_ROOT_LOCAL = '/content/data/vk_group_members'

In [3]:
# Download the Google Drive folder SRC_DATA_URL into DIRPATH_ROOT_LOCAL (quiet mode)
!gdown -q --fuzzy --folder "$SRC_DATA_URL" -O "$DIRPATH_ROOT_LOCAL"

#### Groups to analyze

In [4]:
# Put group names or indices here.
# Each entry is passed to vk.load_group_members() and is also used to build
# the '<group>_members.pkl' file name.
#group_idx = ['russia', 'zogolovok']
group_idx = ['putin_z', 'vladimir_vladimirovichp', 'putin_govorit', 'putintoday',
             'putin_vs_veteran', 'moskvaputinu', 'putin_2014', 'putin_lider',
             'moy_putin', 'putineveryday', 'zogolovok']

# Directory the member pickles and the registration-date CSV are read from / written to.
# The commented-out alternative points at a mounted Google Drive folder.
#dirpath_out = '/content/gdrive/MyDrive/vk_group_members'
dirpath_out = DIRPATH_ROOT_LOCAL

#### Mount gdrive

In [None]:
#drive.mount('/content/gdrive')

#### Get group members from VK or load them from gdrive

In [5]:
# Set to True to re-fetch the member lists from vk (otherwise this cell does nothing)
need_get_members_vk = False

if need_get_members_vk:
  members = dict()
  for group_id in group_idx:
    print(group_id)
    # Request the group's members from vk
    group_members = vk.load_group_members(
        group_id, ntoload='all', offset=0, sort_type='id_desc', fields=None)
    members[group_id] = group_members
    # Cache the group's members as <dirpath_out>/<group_id>_members.pkl
    fname_out = f'{group_id}_members.pkl'
    fpath_out = os.path.join(dirpath_out, fname_out)
    with open(fpath_out, 'wb') as fid:
      pickle.dump(group_members, fid)

In [6]:
# Load group members from the per-group pickle files in dirpath_out
members = {}
for group_id in group_idx:
  print(group_id)
  fname_out = f'{group_id}_members.pkl'
  fpath_out = os.path.join(dirpath_out, fname_out)
  # NOTE: pickle.load can execute arbitrary code - only load files from a trusted source
  with open(fpath_out, 'rb') as fid:
    members_ = pickle.load(fid)
  # Two file layouts exist:
  #  - older files hold a dict keyed by group id ({group_id: members});
  #  - files written by the fetch cell hold the bare member collection.
  # Accept both, so no line has to be deleted by hand after reloading from vk.
  if isinstance(members_, dict) and group_id in members_:
    members[group_id] = members_[group_id]
  else:
    members[group_id] = members_

putin_z
vladimir_vladimirovichp
putin_govorit
putintoday
putin_vs_veteran
moskvaputinu
putin_2014
putin_lider
moy_putin
putineveryday
zogolovok


#### Functions for getting registration date by user ID

In [7]:
import re
import subprocess
import datetime as dt

# Russian month names (genitive) -> month number
# Keys are lower-case; the parser lower-cases the matched month name before the lookup.
_RU_MONTH = {
    'января':1, 'февраля':2, 'марта':3, 'апреля':4, 'мая':5, 'июня':6,
    'июля':7, 'августа':8, 'сентября':9, 'октября':10, 'ноября':11, 'декабря':12
}

# Parse "Дата регистрации: 2 июля 2007" (optionally with "года")
# Capture groups: 1 = day (1-2 digits), 2 = month name (Cyrillic letters), 3 = year (4 digits)
_DATE_RE = re.compile(r'Дата регистрации:\s*([0-9]{1,2})\s+([А-Яа-я]+)\s+([0-9]{4})(?:\s*года)?')

# Current A record for regvk.com; update if it changes
# Passed to curl's --resolve option so the request goes to this fixed address
_REGVK_IP = "81.177.139.247"

def _parse_reg_date_from_html(html: str):
    """Extract the registration date from a response page.

    Searches `html` with `_DATE_RE` and converts the matched day, Russian
    month name and year into a date.

    Returns:
        datetime.date, or None when the pattern is not found, the month name
        is not in `_RU_MONTH`, or the day/month/year do not form a valid date.
    """
    match = _DATE_RE.search(html)
    if match is None:
        return None

    day_str, month_name, year_str = match.groups()
    day = int(day_str)
    year = int(year_str)

    # Month names are matched case-insensitively via the lower-case lookup table
    month = _RU_MONTH.get(month_name.lower())
    if not month:
        return None

    try:
        return dt.date(year, month, day)
    except ValueError:
        # e.g. a day number that does not exist in the given month
        return None

def _get_user_reg_date(user_id: int, timeout: float = 30.0):
    """Get exact registration date from regvk.com.

    Sends a form request for `user_id` through the `curl` command-line tool
    (with DNS pinned to `_REGVK_IP`) and parses the returned page.

    Args:
        user_id: numeric vk user ID, sent as the `link` form field.
        timeout: maximum number of seconds allowed for the request. Without a
            limit a single stalled connection would block the caller forever.

    Returns:
        datetime.date, or None when the request fails, times out, or the page
        does not contain a parsable registration date.
    """
    cmd = [
        "curl", "-s", "--compressed",
        "--max-time", str(timeout),
        "--resolve", f"regvk.com:443:{_REGVK_IP}",
        "https://regvk.com/",
        "--data-urlencode", f"link={user_id}",
        "--data-urlencode", "button=Определить дату регистрации",
    ]
    try:
        # The subprocess timeout is a safety net slightly above curl's own limit
        raw = subprocess.check_output(cmd, timeout=timeout + 5)
    except (subprocess.SubprocessError, OSError):
        # Non-zero curl exit, timeout, or curl not being available: best-effort -> None
        return None
    return _parse_reg_date_from_html(raw.decode("utf-8", "ignore"))

def get_user_reg_date(user_id: int):
    """Get a registration date for `user_id`, falling back to neighbouring IDs.

    The exact ID is tried first. If no date is available, the IDs at offsets
    +1, -1, +2, -2, ..., +4, -4 are probed in that order and the first date
    found is returned.

    Returns:
        datetime.date, or None when none of the probed IDs yields a date.
    """
    # Probe order: the ID itself, then alternating above/below at growing distance
    candidates = [user_id]
    for offset in range(1, 5):
        candidates.extend((user_id + offset, user_id - offset))

    # Requests are issued lazily, one at a time, stopping at the first hit
    for candidate in candidates:
        reg_date = _get_user_reg_date(candidate)
        if reg_date is not None:
            return reg_date
    return None

# Test
#print('Test: ', get_user_reg_date(895252))

#### Main processing

In [8]:
nbins = 100
hbins = None
h = {}

# Only user IDs above this threshold are analyzed
# NOTE(review): presumably restricts the analysis to recently registered accounts - confirm
MIN_USER_ID = 6.8e8

# Calculate normalized histograms of member IDs, one per group.
# The bin edges are taken from the first group and reused for all others,
# so every histogram is defined on the same ID axis.
for group_id in group_idx:
  member_idx = np.array(members[group_id])
  member_idx = member_idx[member_idx > MIN_USER_ID]
  if hbins is None:
    hh, hbins = np.histogram(member_idx, nbins)
  else:
    hh = np.histogram(member_idx, hbins)[0]
  # Normalize to fractions of the group's counted members; a group without
  # any member in range keeps an all-zero histogram instead of 0/0 = NaN
  total = np.sum(hh)
  h[group_id] = hh / total if total > 0 else hh.astype(float)


In [9]:
# Set to True to re-query the registration dates (otherwise this cell does nothing)
need_get_reg_dates = False

if need_get_reg_dates:
  # Ask the external service for the registration date at every bin edge
  hbin_dates = [get_user_reg_date(int(hbin)) for hbin in tqdm(hbins)]

  # Fill an unknown interior date with the midpoint of its two neighbours,
  # provided both neighbours are known (earlier fills are visible to later steps)
  for n in range(1, len(hbin_dates) - 1):
    if hbin_dates[n] is not None:
      continue
    dprev, dnext = hbin_dates[n - 1], hbin_dates[n + 1]
    if dprev is None or dnext is None:
      continue
    hbin_dates[n] = dprev + (dnext - dprev) / 2

  # Save the bin-edge IDs together with their dates as CSV in dirpath_out
  fpath_reg_dates = Path(dirpath_out) / f'hbin_reg_dates_{nbins}.csv'
  df = pd.DataFrame({'ID': hbins.astype(int), 'reg_date': hbin_dates})
  df.to_csv(fpath_reg_dates, index=False)

In [10]:
# Load registration dates for the user ID bins from the CSV in dirpath_out
fpath_reg_dates = Path(dirpath_out) / f'hbin_reg_dates_{nbins}.csv'
df = pd.read_csv(fpath_reg_dates)
# Dates that could not be determined are stored as empty cells, which read_csv
# returns as NaN (a float). fromisoformat() raises TypeError on non-strings, so
# such entries are mapped to None.
hbin_dates = np.array(
    [datetime.date.fromisoformat(d) if isinstance(d, str) else None
     for d in df['reg_date']],
    dtype=object)
#hbin_dates


In [11]:
def get_hbin_centers(hbin_vec):
  '''Compute the midpoints between consecutive bin edges.

  The returned object array has the same length as hbin_vec: element n is the
  midpoint of edges n and n + 1 (None if either edge is None), and the last
  element is always None because it has no right-hand neighbour.
  '''
  num_edges = len(hbin_vec)
  centers = np.full(num_edges, None, dtype=object)
  for idx in range(num_edges - 1):
    left, right = hbin_vec[idx], hbin_vec[idx + 1]
    if left is not None and right is not None:
      centers[idx] = left + (right - left) / 2
  return centers

def make_users_df(dates, users_data, group_idx_used):
  '''Build a DataFrame with a 'Date' column followed by one column per group.

  dates          -- sequence of values for the 'Date' column
  users_data     -- mapping from group id to that group's sequence of values
  group_idx_used -- group ids to include, in column order

  Rows are formed by zipping the columns, so the table is as long as the
  shortest input sequence.
  '''
  col_names = ['Date', *group_idx_used]
  columns = [dates, *(users_data[group_id] for group_id in group_idx_used)]
  # Turn the list of columns into a list of rows
  rows = [list(row) for row in zip(*columns)]
  return pd.DataFrame(data=rows, columns=col_names)

In [12]:
# Plotly config, passed to fig.show(config=...).
# 'toImageButtonOptions' holds the settings for the image-export button of the figure.
px_config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'custom_image',
    'height': 400,
    'width': 1000,
    'scale':4 # Multiply title/legend/axis/canvas sizes by this factor
  }
}

In [13]:
# Centers of the time bins; the last element returned by get_hbin_centers
# has no right-hand edge and is dropped
hbin_dates_vis = get_hbin_centers(hbin_dates)[:-1]
#hbin_dates_vis = get_hbin_centers(hbins)[:-1]

# Groups to visualize
group_idx_vis = [
    'putin_z', 'vladimir_vladimirovichp', 'putin_govorit', 'putintoday',
    'moskvaputinu', 'putin_lider', 'moy_putin', 'putineveryday', 'zogolovok',
]
#group_idx_vis = ['putin_vs_veteran', 'putin_2014']

# Table with a 'Date' column followed by one histogram column per group
df = make_users_df(hbin_dates_vis, h, group_idx_vis)

# Optional baseline subtraction: the per-group mean over the baseline
# interval (both ends inclusive) is removed from every group column
need_subtract_baseline = False
baseline = ['2021-11-01', '2022-02-01']
if need_subtract_baseline:
  baseline = list(map(datetime.date.fromisoformat, baseline))
  mask = (df['Date'] >= baseline[0]) & (df['Date'] <= baseline[1])
  baseline_data = df.loc[mask].iloc[:, 1:].mean()
  df.iloc[:, 1:] -= baseline_data

# Time interval to visualize (only used by the commented-out axis range below)
dates_vis = list(map(datetime.date.fromisoformat, ['2022-01-20', '2022-05-20']))

# Figure title, with a note on whether a baseline was subtracted
title_str = 'Number of members by registration date  '
#title_str = 'Number of members by User ID  '
title_str += (
    f'(Baseline: {baseline[0]} - {baseline[1]})'
    if need_subtract_baseline
    else '(No baseline)'
)

fig = px.line(df, x='Date', y=df.columns[1:], title=title_str)
#fig.update_xaxes(range=dates_vis, row=1, col=1)
fig.show(config=px_config)


In [14]:
# Pre-/post-war periods (both ends inclusive)
tlim_pre = [datetime.date.fromisoformat(d) for d in ('2021-12-26', '2022-02-23')]
tlim_post = [datetime.date.fromisoformat(d) for d in ('2022-02-24', '2022-04-24')]

# Rows of the table whose bin date falls into each period
mask_pre = (df['Date'] >= tlim_pre[0]) & (df['Date'] <= tlim_pre[1])
mask_post = (df['Date'] >= tlim_post[0]) & (df['Date'] <= tlim_post[1])

# Mean histogram value of each group over the bins of the pre-/post-war period
data_pre = df.loc[mask_pre].iloc[:, 1:].mean()
data_post = df.loc[mask_post].iloc[:, 1:].mean()

# Ratio of the post-war to the pre-war value per group (users registered
# in the post-war period relative to the pre-war period)
Q = (data_post / data_pre).reset_index()
Q.columns = ['group_name', 'post/pre']
display(Q)

Unnamed: 0,group_name,post/pre
0,putin_z,2.950794
1,vladimir_vladimirovichp,3.415996
2,putin_govorit,2.34927
3,putintoday,3.693759
4,moskvaputinu,2.463181
5,putin_lider,2.815712
6,moy_putin,2.858974
7,putineveryday,1.697531
8,zogolovok,0.880663
