In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from pyvis.network import Network
import pandas as pd
from tqdm.auto import tqdm

import nltk
import pickle
import numpy as np
import pandas as pd
import gradio as gr
from bertopic import BERTopic
from nltk.corpus import stopwords
# from topic_modelling import dim_reduction_texts_topics

# from viz import (
#     viz_topic_bubbles,
#     viz_scatter_texts,
#     viz_word_scores,
#     viz_topic_heatmap,
#     viz_classes_corpus,
#     viz_classes_per_topic,
#     viz_ner_per_topic,
#     viz_n_grams_per_topic
# )


In [2]:
import gradio as gr


In [3]:
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP


def extract_topics(
        texts,
        embedder_name="all-MiniLM-L6-v2",
        n_neighbors=15,
        n_components=10,
        umap_metric='cosine',
        random_state=42,
        min_cluster_size=50,
        hdbscan_metric='euclidean'
):
    """Embed ``texts`` and fit a BERTopic model on them.

    Args:
        texts: Documents passed to ``SentenceTransformer.encode`` and to
            ``BERTopic.fit_transform``.
        embedder_name: Name handed to ``SentenceTransformer``.
        n_neighbors, n_components, umap_metric, random_state: Forwarded to
            the ``UMAP`` reducer (``min_dist`` is fixed at 0.0).
        min_cluster_size, hdbscan_metric: Forwarded to the ``HDBSCAN``
            clusterer (``cluster_selection_method='eom'``,
            ``prediction_data=True``).

    Returns:
        Tuple ``(topics, topic_model, embeddings)``: the first element of the
        ``fit_transform`` result, the fitted model, and the embeddings
        computed for ``texts``.
    """
    sentence_encoder = SentenceTransformer(embedder_name)
    embeddings = sentence_encoder.encode(texts)

    reducer_options = dict(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=0.0,
        metric=umap_metric,
        random_state=random_state,
    )
    clusterer_options = dict(
        min_cluster_size=min_cluster_size,
        metric=hdbscan_metric,
        cluster_selection_method='eom',
        prediction_data=True,
    )

    # Keyword arguments are evaluated top to bottom, so the sub-models are
    # built in the order reducer -> clusterer -> representation.
    topic_model = BERTopic(
        embedding_model=sentence_encoder,
        umap_model=UMAP(**reducer_options),
        hdbscan_model=HDBSCAN(**clusterer_options),
        representation_model=KeyBERTInspired(),
        top_n_words=10,
        verbose=False,
    )

    # The embeddings are precomputed above and passed in alongside the texts.
    topics, _second_output = topic_model.fit_transform(texts, embeddings)

    return topics, topic_model, embeddings


def dim_reduction_texts_topics(textual_embeddings, topic_embeddings):
    """Project document and topic embeddings to 2 components with one UMAP model.

    The reducer is fitted on ``textual_embeddings`` only; ``topic_embeddings``
    are then mapped with the already-fitted reducer.

    Returns:
        Tuple ``(projected_texts, projected_topics)``.
    """
    projector = UMAP(**{
        'n_neighbors': 15,
        'n_components': 2,
        'min_dist': 0.0,
        'metric': 'cosine',
        'random_state': 42,
    })

    # Tuple elements are evaluated left to right: fit first, then transform.
    return (
        projector.fit_transform(textual_embeddings),
        projector.transform(topic_embeddings),
    )


In [4]:
import itertools
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK "stopwords" corpus (the recorded output shows it was already up to date).
nltk.download('stopwords')
# NOTE(review): this rebinds `stopwords` from the imported nltk corpus object to a plain
# set of English stop words. `clean_doc` below tests membership against this set, and the
# cell only re-runs cleanly because it re-imports `stopwords` a few lines above.
stopwords = set(stopwords.words('english'))


def join_docs(doc):
    """Glue the strings in ``doc`` together, separated by a blank line."""
    separator = '\n\n'
    return separator.join(doc)


def clean_doc(doc):
    """Normalise one document for n-gram counting.

    Lower-cases the text, removes every character that is not a letter,
    digit, space or hyphen, and drops words found in the module-level
    ``stopwords`` collection. Returns the remaining words joined by single
    spaces.
    """
    lowered = doc.lower()
    letters_only = re.sub('[^a-z A-Z 0-9-]+', '', lowered)
    kept_words = (word for word in letters_only.split() if word not in stopwords)
    return " ".join(kept_words)


def viz_topic_bubbles(
        topic_model,
        projected_topics,
        texts
        ):
    """Bubble chart with one marker per topic, sized by the topic's ``Count``.

    Args:
        topic_model: Object providing ``get_topic_freq()`` and
            ``get_document_info(texts)``.
        projected_topics: 2-D array; the first column is used as x and the
            remaining column(s) as y.
            NOTE(review): assumes exactly two columns and one row per topic,
            in the ascending ``Topic`` order produced by the groupby below —
            confirm against the caller.
        texts: Documents passed to ``get_document_info``.

    Returns:
        The Plotly figure.
    """
    x_coords = projected_topics[:, :1]
    y_coords = projected_topics[:, 1:]

    merged = topic_model.get_topic_freq().merge(
        topic_model.get_document_info(texts),
        on='Topic',
        how='left',
    )
    # Collapse to one row per topic, averaging the document probabilities.
    per_topic = (
        merged
        .groupby(['Topic', 'Top_n_words', 'Count', 'Name'])
        .agg({'Probability': 'mean'})
        .reset_index()
    )
    per_topic['x'] = x_coords
    per_topic['y'] = y_coords

    hover_columns = {
        "Topic": True,
        "Top_n_words": True,
        "Count": True,
        "x": False,
        "y": False
    }
    fig = px.scatter(
        per_topic,
        x='x',
        y='y',
        hover_data=hover_columns,
        text='Topic',
        size='Count',
        color='Name',
        size_max=100,
        template='plotly_white',
    )

    marker_outline = dict(line=dict(width=1, color='Gray'))
    fig.update_traces(marker=marker_outline)

    # The projected coordinates carry no meaning of their own, so hide both axes.
    for hide_axis in (fig.update_xaxes, fig.update_yaxes):
        hide_axis(visible=False)

    return fig


def viz_scatter_texts(
        topic_model,
        texts,
        projected_texts
        ):
    """Scatter plot with one marker per document, coloured by topic name.

    Args:
        topic_model: Object providing ``get_topic_freq()`` and
            ``get_document_info(texts)``.
        texts: Documents passed to ``get_document_info``.
        projected_texts: 2-D array of per-document coordinates; column 0 is
            x and column 1 is y.
            NOTE(review): assumes rows are in the same order as ``texts`` —
            confirm against the caller.

    Returns:
        The Plotly figure.
    """
    topic_freq = topic_model.get_topic_freq()
    doc_info = topic_model.get_document_info(texts)

    # Keep the per-document frame on the LEFT of the merge. A left merge
    # follows the key order of its left operand, so merging from `topic_freq`
    # regrouped the rows by topic and the coordinates assigned below ended up
    # on the wrong documents.
    df = doc_info.merge(topic_freq, on='Topic', how='left')

    # 1-D columns: assign plain vectors instead of (n, 1) slices.
    df['x'] = projected_texts[:, 0]
    df['y'] = projected_texts[:, 1]

    # Number of distinct documents per topic (kept as a column, hidden in hover).
    texts_c = df.groupby(['Topic']).agg({'Document': 'nunique'}).reset_index()
    texts_c = texts_c.rename(columns={'Document': 'Document_qty'})
    df = df.merge(texts_c, on='Topic', how='left')

    # Shorten the document text used as the hover title.
    df['Document'] = df['Document'].apply(lambda doc: doc[:100] + '...')

    fig = px.scatter(
        df,
        x='x',
        y='y',
        hover_data={
            "Topic": False,
            "Name": True,
            "Document": False,
            "Document_qty": False,
            "x": False,
            "y": False
        },
        hover_name='Document',
        color='Name',
        size_max=60,
        template='plotly_white',
    )

    fig.update_traces(marker=dict(line=dict(width=1, color='Gray')))
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)

    return fig


def viz_word_scores(
        topic_model,
        top_n_topics=8,
        n_words=5,
        custom_labels=False,
        title="<b>Вероятности слов по темам</b>",
        width=250,
        height=250
):
    """Grid of horizontal bar charts showing the top word scores per topic.

    Args:
        topic_model: Object providing ``get_topic_freq()``, ``get_topic()``
            and, depending on ``custom_labels``, ``topic_aspects_`` /
            ``custom_labels_`` / ``_outliers``.
        top_n_topics: How many leading rows of the topic-frequency table to
            plot (topic -1 is excluded); ``None`` plots the first six.
        n_words: Number of words shown per topic.
        custom_labels: A string selects ``topic_aspects_[custom_labels]`` for
            the subplot titles; a truthy non-string uses ``custom_labels_``
            when it is set; otherwise titles are ``"Тема <id>"``.
        title: Figure title.
        width: Width per grid column; the figure is ``width * 4`` wide.
        height: Height per grid row.

    Returns:
        The Plotly figure.
    """
    palette = itertools.cycle(["#D55E00", "#0072B2", "#CC79A7", "#E69F00", "#56B4E9", "#009E73", "#F0E442"])

    topic_freq = topic_model.get_topic_freq()
    topic_freq = topic_freq.loc[topic_freq.Topic != -1, :]
    ranked_topics = topic_freq.Topic.to_list()
    how_many = top_n_topics if top_n_topics is not None else 6
    topics = sorted(ranked_topics[:how_many])

    if isinstance(custom_labels, str):
        subplot_titles = []
        for topic in topics:
            parts = [[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic]
            joined = "_".join([part[0] for part in parts[:4]])
            subplot_titles.append(joined if len(joined) < 30 else joined[:27] + "...")
    elif topic_model.custom_labels_ is not None and custom_labels:
        subplot_titles = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topics]
    else:
        subplot_titles = [f"Тема {topic}" for topic in topics]

    columns = 4
    rows = int(np.ceil(len(topics) / columns))
    fig = make_subplots(
        rows=rows,
        cols=columns,
        shared_xaxes=False,
        horizontal_spacing=.1,
        vertical_spacing=.4 / rows if rows > 1 else 0,
        subplot_titles=subplot_titles
    )

    for position, topic in enumerate(topics):
        # Fill the grid row by row; plotly rows and columns are 1-based.
        grid_row, grid_col = divmod(position, columns)
        word_scores = topic_model.get_topic(topic)
        # Reverse so the highest-ranked word is drawn at the top of the bars.
        words = [word + "  " for word, _ in word_scores][:n_words][::-1]
        scores = [score for _, score in word_scores][:n_words][::-1]

        bars = go.Bar(
            x=scores,
            y=words,
            orientation='h',
            marker_color=next(palette),
        )
        fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

    title_style = {
        'text': f"{title}",
        'x': .5,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            size=22,
            color="Black")
    }
    fig.update_layout(
        template="plotly_white",
        showlegend=False,
        title=title_style,
        width=width * 4,
        height=height * rows if rows > 1 else height * 1.3,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
    )

    fig.update_xaxes(showgrid=True)
    fig.update_yaxes(showgrid=True)

    return fig


def viz_topic_heatmap(
        topic_model,
        topics=None,
        top_n_topics=None,
        n_clusters=None,
        custom_labels=False,
        title="<b>Матрица семантической близости тем</b>",
        width=800,
        height=800
):
    """Heatmap of pairwise cosine similarity between topic representations.

    Args:
        topic_model: Object providing ``topic_embeddings_`` (or ``c_tf_idf_``
            as a fallback), ``_outliers``, ``get_topic_freq()``,
            ``get_topic()`` and, depending on ``custom_labels``,
            ``topic_aspects_`` / ``custom_labels_``.
        topics: Explicit topic ids to show. Takes precedence over
            ``top_n_topics``. Previously this argument was silently
            overwritten.
        top_n_topics: Use only this many leading rows of the topic-frequency
            table (topic -1 excluded). ``None`` uses all of them.
        n_clusters: When truthy, topics are grouped by Ward-linkage
            clustering into at most this many clusters and the heatmap is
            ordered cluster by cluster.
        custom_labels: A string selects ``topic_aspects_[custom_labels]`` for
            the axis labels; a truthy non-string uses ``custom_labels_`` when
            it is set; otherwise labels come from ``get_topic``.
        title, width, height: Figure title and size.

    Returns:
        The Plotly figure.
    """
    # Rows before `_outliers` are skipped, so row i of `embeddings` is
    # addressed by topic id i below (presumably this drops the outlier
    # topic -1 — confirm).
    if topic_model.topic_embeddings_ is not None:
        embeddings = np.array(topic_model.topic_embeddings_)[topic_model._outliers:]
    else:
        embeddings = topic_model.c_tf_idf_[topic_model._outliers:]

    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]

    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topics = sorted(freq_df.Topic.to_list())

    sorted_topics = topics

    if n_clusters:
        similarity = cosine_similarity(embeddings[topics])
        Z = linkage(similarity, 'ward')
        clusters = fcluster(Z, t=n_clusters, criterion='maxclust')

        # Group topics by cluster, keeping clusters in order of first appearance.
        members_by_cluster = {}
        for topic, cluster in zip(topics, clusters):
            members_by_cluster.setdefault(cluster, []).append(topic)
        sorted_topics = [topic for members in members_by_cluster.values() for topic in members]

    # Select rows by topic id, consistent with `embeddings[topics]` above.
    # The previous positional lookup (`topics.index(...)`) picked the wrong
    # rows whenever the shown topics were not exactly 0..k-1.
    embeddings = embeddings[np.array(sorted_topics, dtype=int)]
    similarity_matrix = cosine_similarity(embeddings)

    def _short_label(topic, words):
        # "<id>_<w1>_<w2>_<w3>", truncated to 30 characters including "...".
        parts = [[str(topic), None]] + words
        label = "_".join([part[0] for part in parts[:4]])
        return label if len(label) < 30 else label[:27] + "..."

    if isinstance(custom_labels, str):
        new_labels = [
            _short_label(topic, topic_model.topic_aspects_[custom_labels][topic])
            for topic in sorted_topics
        ]
    elif topic_model.custom_labels_ is not None and custom_labels:
        new_labels = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in sorted_topics]
    else:
        new_labels = [_short_label(topic, topic_model.get_topic(topic)) for topic in sorted_topics]

    fig = px.imshow(
        similarity_matrix,
        labels=dict(color="Оценка близости"),
        x=new_labels,
        y=new_labels,
        color_continuous_scale='GnBu'
    )

    fig.update_layout(
        title={
            'text': f"{title}",
            'y': .95,
            'x': 0.55,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black"
            )
        },
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
    )

    fig.update_layout(showlegend=True)
    fig.update_layout(legend_title_text='Trend')

    return fig


def viz_classes_corpus(classes):
    """Bar chart of how often each value occurs in ``classes``."""
    class_counts = (
        pd.DataFrame({'classes': classes})
        .value_counts()
        .rename_axis('classes')
        .reset_index(name='counts')
    )
    return px.bar(class_counts, x='classes', y='counts', color='classes')


def viz_classes_per_topic(classes, topics,topic=1):
    """Bar chart of class frequencies among the rows assigned to ``topic``.

    ``classes`` and ``topics`` are combined column-wise into one frame, so
    they are expected to be parallel sequences.
    """
    labelled = pd.DataFrame({'classes': classes, 'topics': topics})
    in_topic = labelled.loc[labelled['topics'] == topic].drop(['topics'], axis=1)
    class_counts = (
        in_topic
        .value_counts()
        .rename_axis('classes')
        .reset_index(name='counts')
    )
    return px.bar(class_counts, x='classes', y='counts', color='classes')


def viz_ner_per_topic(ents, ner_topics, topic=1):
    """Bar chart of the ten most frequent entities among rows assigned to ``topic``.

    Args:
        ents: Entity strings; surrounding whitespace is stripped before counting.
        ner_topics: Topic id for each entry of ``ents`` (parallel sequence).
        topic: Topic id to keep.

    Returns:
        The Plotly figure.
    """
    entities = pd.DataFrame({'ents': ents, 'topics': ner_topics})

    # Select and clean in one expression that yields a new object. The
    # previous in-place drop and column assignment on a filtered slice
    # triggered SettingWithCopyWarning.
    in_topic = entities.loc[entities['topics'] == topic, 'ents'].map(lambda ent: ent.strip())

    top_entities = (
        in_topic.to_frame()
        .value_counts()
        .rename_axis('entity')
        .reset_index(name='counts')
        .head(10)
    )
    fig = px.bar(top_entities, x='entity', y='counts')

    return fig


def viz_n_grams_per_topic(texts, topic_model, topic=1, n=3):
    """Horizontal bar chart of the ten most frequent ``n``-grams in one topic.

    Args:
        texts: Documents passed to ``topic_model.get_document_info``.
        topic_model: Object providing ``get_document_info(texts)``.
        topic: Topic id whose documents are analysed.
        n: n-gram length (``CountVectorizer(ngram_range=(n, n))``).

    Returns:
        The Plotly figure.

    Raises:
        ValueError: If no document is assigned to ``topic``.
    """
    doc_info = topic_model.get_document_info(texts)
    topic_docs = doc_info.loc[doc_info['Topic'] == topic]
    if topic_docs.empty:
        # Fail with a clear message instead of the vectorizer's
        # empty-vocabulary error (also a ValueError).
        raise ValueError(f"No documents are assigned to topic {topic}")

    # Clean into a separate Series instead of writing into the filtered slice,
    # which triggered SettingWithCopyWarning.
    cleaned_docs = topic_docs['Document'].apply(clean_doc)

    vectorizer = CountVectorizer(ngram_range=(n, n))
    ngram_matrix = vectorizer.fit_transform(cleaned_docs)
    # Sum the sparse matrix directly instead of densifying it with .toarray().
    count_values = np.asarray(ngram_matrix.sum(axis=0)).ravel()

    ngram_freq = pd.DataFrame(
        sorted(
            ((count_values[i], k) for k, i in vectorizer.vocabulary_.items()),
            reverse=True,
        ),
        columns=["частота", "n-gram"]
    )
    top_ngram = ngram_freq.sort_values(by='частота', ascending=False).head(10)

    fig = px.bar(
        top_ngram,
        x='частота',
        y='n-gram',
        orientation='h',
        title=f'Top-10 {n}-грамм для темы "{topic_docs.Name.iloc[0]}"'
        )

    return fig


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kirillanosov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Load the tokenizer and seq2seq model for the "Babelscape/rebel-large" checkpoint.
# Both names are read as module-level globals by `from_text_to_kb` below.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")


In [6]:
def extract_relations_from_model_output(text):
    """Parse one decoded model output string into relation dicts.

    The string is split on whitespace after removing ``<s>``, ``<pad>`` and
    ``</s>``. The marker tokens switch which field the following words are
    appended to:

    * ``<triplet>`` starts a new head,
    * ``<subj>`` starts a new tail,
    * ``<obj>`` starts a new relation type.

    Returns:
        List of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts with
        whitespace-stripped values.
    """
    relations = []
    head, tail, rel_type = '', '', ''
    target = None  # field that plain words are currently appended to

    def snapshot():
        # Reads the enclosing variables at call time.
        return {
            'head': head.strip(),
            'type': rel_type.strip(),
            'tail': tail.strip()
        }

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    for token in cleaned.split():
        if token == "<triplet>":
            target = 'head'
            if rel_type:
                relations.append(snapshot())
                rel_type = ''
            head = ''
        elif token == "<subj>":
            target = 'tail'
            # The relation type is intentionally left untouched here; it is
            # only cleared by the next "<obj>" or "<triplet>".
            if rel_type:
                relations.append(snapshot())
            tail = ''
        elif token == "<obj>":
            target = 'type'
            rel_type = ''
        elif target == 'head':
            head += ' ' + token
        elif target == 'tail':
            tail += ' ' + token
        elif target == 'type':
            rel_type += ' ' + token

    # Flush the relation still being built when the text ends.
    if head and rel_type and tail:
        relations.append(snapshot())
    return relations


In [7]:
class KB():
    """In-memory knowledge base of Wikipedia-resolved entities and relations.

    A relation is a dict with the keys ``"head"``, ``"type"``, ``"tail"`` and
    ``"meta"``. ``"meta"`` maps a source text to ``{"spans": [...]}``.
    """

    def __init__(self):
        # Maps entity title -> remaining entity fields (e.g. "url", "summary").
        self.entities = {}
        # List of relation dicts, unique by (head, type, tail).
        self.relations = []


    def merge_with_kb(self, kb2):
        """Add every relation of ``kb2`` to this KB via ``add_relation``."""
        for rel in kb2.relations:
            self.add_relation(rel)


    def are_relations_equal(self, rel1, rel2):
        """Return True when both relations share head, type and tail."""
        return all(rel1[attr] == rel2[attr] for attr in ["head", "type", "tail"])


    def exists_relation(self, rel1):
        """Return True when an equal relation is already stored."""
        return any(self.are_relations_equal(rel1, rel2) for rel2 in self.relations)


    def merge_relations(self, rel2):
        """Fold the ``meta`` of ``rel2`` into the stored relation equal to it.

        Every source text in ``rel2["meta"]`` is merged (previously only the
        first one was). A text not seen before is added as is; for a known
        text only the spans not yet recorded are appended.

        Raises:
            IndexError: If no stored relation equals ``rel2``.
        """
        rel1 = next(
            (rel for rel in self.relations if self.are_relations_equal(rel2, rel)),
            None,
        )
        if rel1 is None:
            raise IndexError("merge_relations called for a relation that is not in the KB")

        for text, incoming in rel2["meta"].items():
            if text not in rel1["meta"]:
                rel1["meta"][text] = incoming
                continue
            known_spans = rel1["meta"][text]["spans"]
            known_spans += [span for span in incoming["spans"] if span not in known_spans]


    def get_wikipedia_data(self, candidate_entity):
        """Look up ``candidate_entity`` on Wikipedia.

        Returns:
            ``{"title", "url", "summary"}`` for the page, or ``None`` when the
            lookup raises for any reason.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Best effort: an unresolved entity just means the relation is
            # skipped. `Exception` (not a bare except) lets KeyboardInterrupt
            # and SystemExit through, so long runs can still be stopped.
            return None


    def add_entity(self, ent):
        """Store ``ent`` under its ``"title"``, keeping all other fields."""
        self.entities[ent["title"]] = {k:v for k,v in ent.items() if k != "title"}


    def add_relation(self, rel):
        """Resolve both ends of ``rel`` on Wikipedia and store the relation.

        The relation is dropped silently when either end cannot be resolved.
        Otherwise ``rel["head"]`` and ``rel["tail"]`` are rewritten in place
        to the resolved page titles, both entities are stored, and the
        relation is appended or merged into the equal one already present.
        """
        candidate_entities = [rel["head"], rel["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        if any(ent is None for ent in entities):
            return

        for ent in entities:
            self.add_entity(ent)

        rel["head"] = entities[0]["title"]
        rel["tail"] = entities[1]["title"]

        if not self.exists_relation(rel):
            self.relations.append(rel)
        else:
            self.merge_relations(rel)


    def print(self):
        """Print all entities and relations to stdout."""
        print("Entities:")
        for ent in self.entities.items():
            print(f"  {ent}")

        print("Relations:")
        for rel in self.relations:
            print(f"  {rel}")


In [8]:
def from_text_to_kb(text, span_length=128):
    """Extract relations from ``text`` with the global model and return them as a ``KB``.

    The text is tokenized with the module-level ``tokenizer`` and cut into
    windows of ``span_length`` tokens that overlap just enough to cover the
    whole token sequence. The module-level ``model`` generates three
    sequences per window; each is parsed with
    ``extract_relations_from_model_output`` and the relations are added to a
    fresh ``KB``, tagged with the window boundaries they came from.

    Args:
        text: Source document; also used as the key of each relation's ``meta``.
        span_length: Window size in tokens.

    Returns:
        The populated ``KB``.
    """
    encoded = tokenizer([text], return_tensors="pt")
    token_ids = encoded["input_ids"][0]
    attention = encoded["attention_mask"][0]

    num_tokens = len(token_ids)
    num_spans = math.ceil(num_tokens / span_length)
    overlap = math.ceil((num_spans * span_length - num_tokens) / max(num_spans - 1, 1))

    # Each window starts `span_length - overlap` tokens after the previous one.
    stride = span_length - overlap
    spans_boundaries = [
        [stride * i, stride * i + span_length] for i in range(num_spans)
    ]

    model_inputs = {
        "input_ids": torch.stack([token_ids[lo:hi] for lo, hi in spans_boundaries]),
        "attention_mask": torch.stack([attention[lo:hi] for lo, hi in spans_boundaries]),
    }

    num_return_sequences = 3
    generated_tokens = model.generate(
        **model_inputs,
        max_length=256,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=num_return_sequences,
    )

    # Special tokens are kept: the parser relies on the <triplet>/<subj>/<obj> markers.
    decoded_preds = tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=False
    )

    kb = KB()
    for idx, sentence_pred in enumerate(decoded_preds):
        # Outputs come in groups of `num_return_sequences` per input window.
        span = spans_boundaries[idx // num_return_sequences]
        for relation in extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {text: {"spans": [span]}}
            kb.add_relation(relation)

    return kb


In [9]:
def from_corpus_to_kb(corpus):
    """Build one ``KB`` from every document in ``corpus``.

    Each document is processed with ``from_text_to_kb`` and merged into the
    result. A document that raises is reported with ``print`` and skipped, so
    one failure does not abort the run.
    """
    corpus_kb = KB()
    for document in tqdm(corpus):
        try:
            corpus_kb.merge_with_kb(from_text_to_kb(document))
        except Exception as error:
            print(error)
    return corpus_kb


In [10]:
# Load the tab-separated BBC news file and keep its 'content' column as a list of strings.
# NOTE(review): absolute, machine-specific path — this cell only runs on the author's machine.
X = list(pd.read_csv('/Users/kirillanosov/Downloads/serendipity/data/bbc-news-data.csv', sep='\t')['content'].apply(str))


In [11]:
%%time
# Build the knowledge base from the first two documents only
# (the recorded run below took about 2 minutes of wall time).
kb = from_corpus_to_kb(X[:2])


  0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 20.2 s, sys: 3.2 s, total: 23.4 s
Wall time: 2min 8s


In [12]:
# Display the KB object. KB defines no __repr__, so only the default object repr is shown.
kb

<__main__.KB at 0x106381b10>

In [None]:
# Environment debugging: print the file the `gradio` package is imported from.
import gradio
print(gradio.__file__)

In [14]:
# Render the module-level `kb` as a directed pyvis graph and write it to ./100.html.
net = Network(
    directed=True,
    width="700px",
    height="700px",
    bgcolor="#eeeeee"
    )

# One green circular node per entity; the entity title (dict key) is the node id.
color_entity = "#00FF00"
for e in kb.entities:
    net.add_node(e, shape="circle", color=color_entity)

# One edge per relation, with the relation type as both hover title and visible label.
for rel in kb.relations:
    net.add_edge(
        rel["head"],
        rel["tail"],
        title=rel["type"],
        label=rel["type"]
        )

# Layout settings for the repulsion solver.
net.repulsion(
    node_distance=200,
    central_gravity=0.2,
    spring_length=200,
    spring_strength=0.05,
    damping=0.09
)
net.set_edge_smooth('dynamic')
# notebook=False: write the HTML file to disk (the recorded output echoes the path).
net.show('./100.html', notebook=False)


./100.html
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
import gradio as gr
from pyvis.network import Network

def _table_rows(table):
    """Return the non-blank rows of a table value as lists of stripped strings.

    Accepts ``None``, an object with ``columns``/``values`` (e.g. a pandas
    DataFrame), or any iterable of rows. ``None`` and NaN cells become ``""``.
    """
    if table is None:
        return []
    raw_rows = table.values.tolist() if hasattr(table, "columns") else list(table)

    rows = []
    for raw in raw_rows:
        # `cell != cell` is only true for NaN.
        cells = ["" if cell is None or cell != cell else str(cell).strip() for cell in raw]
        if any(cells):
            rows.append(cells)
    return rows


def generate_html(kb_entities, kb_relations):
    """Render a knowledge graph with pyvis and return it wrapped in an ``<iframe>``.

    Args:
        kb_entities: Table with one entity name per row (first column).
        kb_relations: Table with rows of (head, tail, type).

    When both tables are empty the module-level ``kb`` is drawn instead.
    That is what this function always did before, when it ignored its
    arguments.

    Returns:
        HTML string: an ``<iframe>`` whose ``srcdoc`` holds the pyvis page.
    """
    # Local import: `html` is not imported at file level.
    from html import escape

    entity_names = [row[0] for row in _table_rows(kb_entities) if row and row[0]]
    triples = [
        (row[0], row[1], row[2])
        for row in _table_rows(kb_relations)
        if len(row) >= 3 and row[0] and row[1]
    ]
    if not entity_names and not triples:
        entity_names = list(kb.entities)
        triples = [(rel["head"], rel["tail"], rel["type"]) for rel in kb.relations]

    net = Network(
        directed=True,
        width="700px",
        height="700px",
        bgcolor="#eeeeee"
    )

    color_entity = "#00FF00"
    known_nodes = set()

    def ensure_node(name):
        # An edge needs both of its endpoints to exist as nodes.
        if name not in known_nodes:
            known_nodes.add(name)
            net.add_node(name, shape="circle", color=color_entity)

    for name in entity_names:
        ensure_node(name)

    for head, tail, rel_type in triples:
        ensure_node(head)
        ensure_node(tail)
        net.add_edge(
            head,
            tail,
            title=rel_type,
            label=rel_type
        )

    net.repulsion(
        node_distance=200,
        central_gravity=0.2,
        spring_length=200,
        spring_strength=0.05,
        damping=0.09
    )

    net.set_edge_smooth('dynamic')
    page = net.generate_html()

    # Escape the page for use as an attribute value. The old `'` -> `"`
    # replacement corrupted the embedded data whenever a name contained an
    # apostrophe.
    srcdoc = escape(page, quote=True)
    iframe_html = f"""<iframe style="width: 100%; height: 600px;margin:0 auto" name="result" allow="midi; geolocation; microphone; camera; display-capture; encrypted-media;" sandbox="allow-modals allow-forms allow-scripts allow-same-origin allow-popups allow-top-navigation-by-user-activation allow-downloads" allowfullscreen="" allowpaymentrequest="" frameborder="0" srcdoc="{srcdoc}"></iframe>"""
    return iframe_html

# Define your Gradio interface with direct component imports
# Two table inputs (entities; head/tail/type relations) feed `generate_html`;
# the "html" output renders the markup it returns.
demo = gr.Interface(
    fn=generate_html,
    inputs=[
        gr.Dataframe(headers=["Entities"]), 
        gr.Dataframe(headers=["Head", "Tail", "Type"])
    ],
    outputs="html",
    title="Network Visualization with Pyvis in Gradio",
    allow_flagging='never'
)

# Launch the Gradio app
# NOTE(review): share=True also exposes the app on a temporary public URL
# (see the "Running on public URL" line in the recorded output).
demo.launch(share=True)


Running on local URL:  http://127.0.0.1:7862
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Running on public URL: https://d22c2e2b07c5f58076.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [27]:
import gradio.outputs as outputs

ModuleNotFoundError: No module named 'gradio.outputs'

In [None]:
# Environment debugging (IPython shell escape): list the conda environments.
!conda info --envs

In [None]:
/Users/kirillanosov/opt/anaconda3/lib/python3.8/site-packages/gradio