<a href="https://colab.research.google.com/github/Lednik7/visualise-audience-telegram/blob/main/visualise_audience_tg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dependency Installation

In [None]:
#@title Package Download
%%capture
# telethon is the Telegram client imported by run.py; transformers provides
# AutoTokenizer/AutoModel for the embedding step (sentencepiece is presumably
# required by that model's tokenizer - confirm).
!pip install telethon
!pip install transformers sentencepiece

In [None]:
#@title Imports

import torch
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA, TruncatedSVD
import pandas as pd
import json
from sklearn.cluster import DBSCAN, KMeans
import altair as alt
from IPython.display import clear_output

In [None]:
#@title Default Settings
#@markdown To obtain API credentials, open the [Telegram apps page](https://my.telegram.org/auth?to=apps), create an application, and enter its `api_id` and `api_hash` in the fields below

%%writefile run.py
import configparser
import json

from telethon.tl.functions.users import GetFullUserRequest

from telethon.sync import TelegramClient
from telethon import connection
from tqdm import tqdm

# for correctly serializing message timestamps to JSON
from datetime import date, datetime

# classes for working with channels
from telethon.tl.functions.channels import GetParticipantsRequest
from telethon.tl.types import ChannelParticipantsSearch

# class for working with messages
from telethon.tl.functions.messages import GetHistoryRequest

# Read credentials from config.ini
# NOTE(review): nothing below reads `config`; the credentials actually used
# are the literals assigned next - confirm whether config.ini is still needed.
config = configparser.ConfigParser()
config.read("config.ini")

# Assign the values to internal variables
# (the #@param markers expose api_id / api_hash as Colab form fields)
api_id = 123 #@param {type:"integer"}
api_hash = "hash" #@param {type:"string"}
username = "worker"

# `username` is passed as the first TelegramClient argument (the session name
# in Telethon's API), followed by the API credentials.
client = TelegramClient(username, api_id, api_hash)

# Start the client before any request is made.
client.start()


async def users_details(channel):
    """Collect profile details of every participant of *channel* and save them.

    For each participant a full-profile request is issued to obtain the bio
    ("about" text). The resulting list of dicts is written to
    ``channel_users.json`` (UTF-8, non-ASCII characters kept verbatim).

    Args:
        channel: The channel entity whose participants are enumerated.

    Returns:
        None. The result is the ``channel_users.json`` file.
    """
    all_users_details = []  # list of dicts with the fields of interest for each participant

    # Use the bar as a context manager so it is closed even if a request fails.
    with tqdm() as pbar:
        async for participant in client.iter_participants(channel, aggressive=True):
            full = await client(GetFullUserRequest(participant))
            # Current Telethon releases return `users.UserFull`, which keeps the
            # bio at `.full_user.about`; older releases return `UserFull` with
            # `.about` directly. Support both so an unpinned install keeps working.
            about = getattr(full, "full_user", full).about
            all_users_details.append({"id": participant.id,
                                      "first_name": participant.first_name,
                                      "last_name": participant.last_name,
                                      "user": participant.username,
                                      "phone": participant.phone,
                                      "is_bot": participant.bot,
                                      "about": about})
            pbar.set_description(about)
            pbar.update(1)

    with open('channel_users.json', 'w', encoding='utf8') as outfile:
        json.dump(all_users_details, outfile, ensure_ascii=False)


async def main():
    """Resolve the configured channel link and export its participants."""
    channel_link = "https://t.me/gradientdip" #@param {type:"string"}
    # Resolve the link to an entity first, then hand it straight to the exporter.
    await users_details(await client.get_entity(channel_link))


# Script entry point: inside the client's context manager, drive main()
# to completion on the client's own event loop.
with client:
    client.loop.run_until_complete(main())

In [None]:
# Execute run.py through the shell, then clear this cell's output.
# NOTE(review): clear_output() also removes any error printed by run.py -
# confirm that hiding failures here is intended.
!python run.py
clear_output()

## Chart Generation

In [None]:
#@title Vector Representation

with open("channel_users.json", "r") as read_file:
    channel_users = json.load(read_file)

descriptions = [i["about"] for i in channel_users if i["about"]]

%%capture
tokenizer = AutoTokenizer.from_pretrained("symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli")
model = AutoModel.from_pretrained("symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli")


def gen_batch(inputs, batch_size):
    """Yield consecutive slices of *inputs*, each at most *batch_size* long.

    Args:
        inputs: A sliceable sequence with a length (list, tuple, str, ...).
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        Slices of ``inputs`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    if batch_size < 1:
        # A zero or negative step never advances past len(inputs), which
        # previously made the loop spin forever.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for batch_start in range(0, len(inputs), batch_size):
        yield inputs[batch_start: batch_start + batch_size]


# Mean pooling - average the token embeddings, weighted by the attention mask
# so that padding positions do not contribute.
def mean_pooling(text, model, tokenizer):
    """Embed *text* by mask-aware averaging of the model's token embeddings.

    Args:
        text: A string or list of strings passed to ``tokenizer``.
        model: Callable model exposing ``.device``; its first output must hold
            the per-token embeddings.
        tokenizer: Callable returning a mapping of tensors that includes
            ``"attention_mask"``.

    Returns:
        A NumPy array with one pooled vector per input text.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Move every input to the model's device once and reuse the moved tensors;
    # pooling with the original CPU mask would fail when the model is on a GPU.
    inputs = {k: v.to(model.device) for k, v in encoded.items()}
    with torch.no_grad():
        model_output = model(**inputs)
    token_embeddings = model_output[0]  # first element holds all token embeddings
    mask = inputs["attention_mask"].unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp the token count so an all-padding row cannot divide by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return (summed / counts).cpu().numpy()


# Materialise the batches up front so tqdm knows the total and can show progress.
batches = tuple(gen_batch(descriptions, batch_size=32))

# Flat list with one pooled vector per description, in input order.
embeddings = []
for batch in tqdm(batches):
    embeddings.extend(mean_pooling(batch, model, tokenizer))

In [None]:
#@title Transform embeddings to 2D

def get_data_frame(n_clusters: int = 4) -> pd.DataFrame:
    """Cluster the 2-D projection and return it as a plotting-ready table.

    Reads the module-level ``transformed`` array and ``descriptions`` list.

    Args:
        n_clusters: Number of K-Means clusters to fit.

    Returns:
        A DataFrame with columns ``y`` (first projected component), ``x``
        (second projected component), ``description`` and ``labels``
        (the K-Means cluster index of each row).
    """
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    clusterer.fit(transformed)

    return pd.DataFrame({
        "y": transformed[:, 0],
        "x": transformed[:, 1],
        "description": descriptions,
        "labels": clusterer.labels_,
    })


# Reduce the embeddings to two PCA components (seeded for reproducibility);
# `transformed` is the array read by get_data_frame().
pca = PCA(n_components=2, random_state=42)
transformed = pca.fit_transform(embeddings)

In [None]:
#@title Chart Settings

n_clusters = 4 #@param {type:"integer"}
mark_circle_size = 160 #@param {type:"integer"}
width = 720 #@param {type:"integer"}
height = 720 #@param {type:"integer"}

# Cluster the projected points with the number of clusters chosen in the form.
df = get_data_frame(n_clusters=n_clusters)

brush = alt.selection(type='interval', resolve='global')
rng = ['red', 'green', 'black']

# Build the scatter plot step by step: marks, encodings, then selection and size.
color_encoding = alt.Color('labels', scale=alt.Scale(range=rng))
points = alt.Chart(df).mark_circle(size=mark_circle_size)
scatter = points.encode(
    x='x', y='y', color=color_encoding,
    tooltip=['description']
)
scatter = scatter.add_selection(brush).properties(width=width, height=height)

# Keep the chart as the cell's final expression so the notebook renders it.
scatter.interactive()