In [3]:
import gensim
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from bokeh.io import output_notebook
from ipysigma import Sigma
from scipy.spatial import ConvexHull
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP

from lib.constant import *
from lib.figures import *
from lib.utils import *

# Fixed random seed; passed as `random_state` to the UMAP projection further down.
seed = 42

# Load BokehJS so Bokeh figures render inline, without the "loaded" banner.
output_notebook(hide_banner=True)

In [4]:
# Deputy roster (semicolon-separated export; per the file name, deputies in
# office as of 2023-08-02).
df_deputy = pd.read_csv(
    "data/nosdeputes.fr_deputes_en_mandat_2023-08-02.csv",
    sep=";",
)
# Lookup table: deputy slug -> parliamentary group acronym.
slug2groupe = dict(zip(df_deputy["slug"], df_deputy["groupe_sigle"]))

In [5]:
# Load the tweet dataset and attach a human-readable ("Oui"/"Non") version of
# the keyword flag.
dataset_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
dataset_df["is_keywords_label"] = [
    "Oui" if flag else "Non" for flag in dataset_df.is_keywords
]

# Keep one row per distinct text, then drop retweets (text starting with "RT"),
# tweets from the "GOV" group, and tweets without any detected keyword.
dataset_df = (
    dataset_df
    .drop_duplicates("full_text")
    .loc[lambda df: ~df.full_text.str.startswith("RT")]
    .loc[lambda df: df.groupe_sigle != "GOV"]
    .loc[lambda df: df.is_keywords]
)
dataset_df.head()

Unnamed: 0,username,full_text,date,in_reply_to_screen_name,in_reply_to_status_id_str,in_reply_to_user_id_str,retweet_id,retweet_username,retweet_user_id,is_quote_status,quoted_status_id_str,groupe_sigle,hashtag,is_hashtag,lemmatization,keywords_detected,is_keywords,is_keywords_label
375,mathieu-lefevre,Plus les impôts baissent et plus les recettes ...,2023-02-01 06:32:02+00:00,,,,,,,False,,REN,[],False,plus le impôt baisser et plus le recette de l’...,"[travail, recette, courage, impôt]",True,Oui
682,frederic-boccaletti,"Mme , ""apparemment il y a une partie du foncti...",2023-02-01 06:50:58+00:00,,,,,,,False,,RN,[#motionreferendaire],True,"mme , "" apparemment il y avoir un partie de fo...",[pouvoir],True,Oui
488,philippe-brun,Le prix de l'énergie est un élément essentiel ...,2023-02-01 07:02:50+00:00,,,,,,,False,,SOC,[],False,le prix de le énergie être un élément essentie...,"[entreprise, compétitivité, patron]",True,Oui
326,kevin-mauvieux,Comprenez : « jamais nous ne défendrons la Fra...,2023-02-01 07:10:59+00:00,,,,,,,True,1.6204937755158282e+18,RN,[#NonALaReformeDesRetraites],True,Comprenez : « jamais nous ne défendre le Franc...,"[vote, retraite, retrait, ratio, français, Fra...",True,Oui
453,gregoire-de-fournas,Des centaines d'amendements de la NUPES ont ét...,2023-02-01 07:14:30+00:00,,,,,,,True,1.6204937755158282e+18,RN,[],False,un centaine de amendement de le NUPES avoir êt...,"[vote, ratio]",True,Oui


In [6]:
def _to_tagged_document(row):
    """Tokenise one deputy's concatenated lemmas and tag them with the username."""
    tokens = gensim.utils.simple_preprocess(row.lemmatization)
    return gensim.models.doc2vec.TaggedDocument(tokens, [row.username])


# Number of retained tweets per deputy.
tweets_per_user = dataset_df.groupby("username").size()
count_tweet = dict(tweets_per_user)

# Deputy -> group, as recorded on the tweets themselves.
user2groupe = dict(zip(dataset_df["username"], dataset_df["groupe_sigle"]))

# One document per deputy: all of their lemmatised tweets joined with spaces.
df_corpus = dataset_df.groupby("username", as_index=False).agg({"lemmatization": " ".join})
# Group label for the per-deputy corpus, looked up in the deputy roster.
df_corpus["groupe_sigle"] = df_corpus["username"].map(slug2groupe)

# Doc2Vec training input: one TaggedDocument per deputy, tagged by username.
corpus = df_corpus.apply(_to_tagged_document, axis=1)

In [7]:
# Train one Doc2Vec embedding per deputy (documents are tagged with usernames).
# dm=1 selects the "distributed memory" (PV-DM) training algorithm.
#
# Reproducibility: `seed` fixes the random initialisation, and `workers=1`
# removes the ordering jitter introduced by multi-threaded training. Without
# both, the embeddings (and every figure derived from them) differ on each run.
# NOTE: reproducibility across interpreter launches additionally requires a
# fixed PYTHONHASHSEED environment variable.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=32,
    min_count=2,
    epochs=40,
    dm=1,
    seed=seed,
    workers=1,
)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [8]:
# Learned document vectors; the plots below pair row i with
# model.dv.index_to_key[i].
user_vec = model.dv.vectors

# Seeded UMAP projection of the embeddings; the first two output columns are
# used as plot coordinates.
reducer = UMAP(random_state=seed)
user_vec_reduced = reducer.fit_transform(user_vec)

In [9]:
# Scatter plot of the projected deputies: colour = group, marker size = number
# of retained tweets, text label = username.
deputy_keys = model.dv.index_to_key
fig = px.scatter(
    x=user_vec_reduced[:, 0],
    y=user_vec_reduced[:, 1],
    color=[user2groupe[key] for key in deputy_keys],
    size=[count_tweet[key] for key in deputy_keys],
    color_discrete_map=gp_politique_color,
    text=deputy_keys,
    height=1000,
    opacity=0.9,
    size_max=40,
)
fig

In [10]:
# Density contours of the projected deputies, one colour per group.
group_per_deputy = [user2groupe[key] for key in model.dv.index_to_key]
fig = px.density_contour(
    x=user_vec_reduced[:, 0],
    y=user_vec_reduced[:, 1],
    color=group_per_deputy,
    color_discrete_map=gp_politique_color,
    height=700,
)

fig.show()

In [11]:
# Convex hull of each group's deputies in the projected (UMAP) plane.
hulls = {}
groups_without_hull = []
for group, data in df_corpus.groupby("groupe_sigle"):
    deputies = data.username.values
    # Boolean mask selecting this group's rows among the embedded deputies.
    idx = np.isin(model.dv.index_to_key, deputies)
    points = user_vec_reduced[idx]
    # A polygon in the plane needs at least three points.
    if len(points) < 3:
        groups_without_hull.append(group)
        continue
    try:
        hull = ConvexHull(points).vertices
    except (RuntimeError, ValueError):
        # scipy's QhullError derives from RuntimeError; it is raised for
        # degenerate input such as collinear points.
        groups_without_hull.append(group)
        continue
    hulls[group] = points[hull]

# Report skipped groups instead of silently dropping them.
if groups_without_hull:
    print(f"No convex hull could be built for groups: {groups_without_hull}")

fig = go.Figure()
for group in df_corpus.groupe_sigle.unique():
    # Skip groups without a hull (this includes a missing/NaN group label) and
    # groups that have no colour defined in the palette.
    if group not in hulls or group not in gp_politique_color:
        continue
    polygon = hulls[group]
    fig.add_trace(
        go.Scatter(
            x=polygon[:, 0],
            y=polygon[:, 1],
            fill="toself",
            line=dict(width=0),
            fillcolor=gp_politique_color[group],
            opacity=0.4,
            hoveron=None,
            name=group,
            mode="lines",
        )
    )
fig.update_layout(height=1000)
fig

In [12]:
# Pairwise cosine similarity between the deputies' document vectors, turned
# into a weighted adjacency matrix.
min_similarity = 0.75
adj_sim_matrix = cosine_similarity(user_vec)

# Similarities below the threshold become 0.
weak_links = adj_sim_matrix < min_similarity
adj_sim_matrix[weak_links] = 0

# Zero the diagonal (each deputy's similarity with itself).
np.fill_diagonal(adj_sim_matrix, 0)

In [13]:
# Build the similarity graph, then drop deputies left without any neighbour.
G = nx.from_numpy_array(adj_sim_matrix)
isolated_nodes = [node for node, degree in G.degree() if degree < 1]
G.remove_nodes_from(isolated_nodes)

In [14]:
# Replace integer node ids (matrix row indices) with the matching usernames,
# restricted to the nodes still present in the graph.
index_to_username = {
    ix: label
    for ix, label in enumerate(model.dv.index_to_key)
    if ix in G
}
G = nx.relabel_nodes(G, index_to_username)

In [15]:
# networkx interprets the `weight` passed to betweenness_centrality as a path
# *distance* (smaller = closer). The "weight" edge attribute here is a cosine
# similarity (larger = closer), so using it directly would make the most
# similar deputies look the farthest apart. Shortest paths are therefore
# computed on a derived "distance" attribute instead.
# 1 / similarity is strictly positive as long as every edge weight is
# positive, as networkx requires for weighted betweenness.
for _, _, edge_attrs in G.edges(data=True):
    edge_attrs["distance"] = 1.0 / edge_attrs["weight"]

betweeness = nx.betweenness_centrality(G, weight="distance")

In [16]:
def size_func(node):
    """Return the betweenness centrality of `node` (drives node and label size)."""
    return betweeness[node]


def edge_weight(u, v):
    """Return the "weight" attribute of edge (u, v)."""
    return G.edges[u, v]["weight"]


# Interactive network view: node colour = group, node/label size = betweenness,
# edge thickness = edge weight.
Sigma(
    G,
    node_size=size_func,  # alternative: G.degree
    node_label_size=size_func,
    node_size_range=[5, 32],
    node_color=slug2groupe,
    node_color_palette=gp_politique_color,
    default_node_border_color="#ffffff",
    edge_size=edge_weight,
    start_layout=5,
    layout_settings=dict(
        adjustSize=True,
        linLogMode=False,
        barnesHutOptimize=False,
        scalingRatio=10,
        gravity=0.1,
        edgeWeightInfluence=2,
    ),
    height=1080,
    hide_info_panel=True,
    hide_search=True,
)

Sigma(nx.Graph with 377 nodes and 2,448 edges)