(file-types:notebooks)=
# Les parlementaires sur le réseau social Twitter

## Analyse des données textuelles 

In [73]:
import pandas as pd
from lib.figures import *
from lib.constant import *
from lib.utils import *

from bokeh.io import output_notebook
output_notebook(hide_banner=True)

seed = 42

twitter_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')


In [74]:
intervention_frequency_per_group(twitter_df)

In [75]:
from bokeh.models import TabPanel, Tabs

df = getCountDataframe(twitter_df,top_n=10)

tab1 = TabPanel(child=occurrenceDistributionPerGroupePolitique(df[df.num_words == 1]), title="1 mot")
tab2 = TabPanel(child=occurrenceDistributionPerGroupePolitique(df[df.num_words == 2]), title="2 mots")
tab3 = TabPanel(child=occurrenceDistributionPerGroupePolitique(df[df.num_words == 3]), title="3 mots")

show(Tabs(tabs=[tab1, tab2,tab3],sizing_mode ="stretch_width"))

In [76]:
from bokeh.models import TabPanel, Tabs

df = getCountDataframe(twitter_df,top_n=10)

tab1 = TabPanel(child=occurrenceDistributionPerPolitiqueOrientation(df[df.num_words == 1]), title="1 mot")
tab2 = TabPanel(child=occurrenceDistributionPerPolitiqueOrientation(df[df.num_words == 2]), title="2 mots")
tab3 = TabPanel(child=occurrenceDistributionPerPolitiqueOrientation(df[df.num_words == 3]), title="3 mots")

show(Tabs(tabs=[tab1, tab2,tab3],sizing_mode ="stretch_width"))

## Network Data

In [77]:

twitter_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
twitter_df = twitter_df[~(twitter_df.retweet_id == None)]
twitter_df= twitter_df[twitter_df.is_keywords]
deputy_df = pd.read_csv("data/nosdeputes.fr_deputes_en_mandat_2023-08-02.csv",sep=";")
slug2twitterat = dict(deputy_df["slug twitter".split()].values)
twitter_df["twitter_at"] = twitter_df.username.map(slug2twitterat)
twitter_df.head(2)

Unnamed: 0,username,full_text,date,in_reply_to_screen_name,in_reply_to_status_id_str,in_reply_to_user_id_str,retweet_id,retweet_username,retweet_user_id,is_quote_status,quoted_status_id_str,groupe_sigle,hashtag,is_hashtag,lemmatization,keywords_detected,is_keywords,twitter_at
464,jean-luc-fugit,RT : La réforme des retraites soulève la quest...,2023-02-01 00:00:59+00:00,,,,1620569649967681542,StanGuerini,1911591212,False,,REN,[#retraites],True,rt : le réforme de retraite soulever le questi...,"[retraite, retrait, réforme, réforme de retrai...",True,Jean_LucFUGIT
453,laure-lavalette,"RT : . (RN) interpelle (LFI) : ""On ne comprend...",2023-02-01 00:19:24+00:00,,,,1620535420223213569,LCP,85362553,False,,RN,[],False,"rt : . ( RN ) interpelle ( LFI ) : "" on ne com...","[obstruction, majorité]",True,LaureLavalette


In [78]:
deputy_df["color"] = deputy_df.groupe_sigle.map(gp_politique_color)
dep2color = dict(deputy_df["twitter color".split()].values)
dep2sigle = dict(deputy_df["twitter groupe_sigle".split()].values)

In [79]:
def color(node):
    if not node in dep2color:
        return "#aaa"
    return dep2color[node]

def gp_legend(node):
    if not node in dep2sigle:
        return "NA"
    return dep2sigle[node]

In [80]:
import networkx as nx
graph_df = twitter_df["twitter_at retweet_username groupe_sigle".split()].astype(str)
graph_df = graph_df[~(graph_df.isna())]
#graph_df = graph_df[graph_df.retweet_username.isin(deputy_df.twitter.values)]
graph_df = graph_df.groupby("twitter_at retweet_username".split(),as_index=False).size()
G = nx.from_pandas_edgelist(graph_df,source="twitter_at",target="retweet_username",edge_attr="size",create_using=nx.DiGraph)
for node in list(G.nodes()):
    if G.degree(node)<4:
        G.remove_node(node)
if "None" in G: G.remove_node("None")

In [81]:
from ipysigma import SigmaGrid

In [82]:
betweeness = nx.betweenness_centrality(G)
page_rank = nx.pagerank(G)


In [83]:
SigmaGrid(G,hide_search=False,columns=2).add(node_size=G.in_degree,name="In Degree",
      node_color=dep2sigle,
      default_node_border_color="#ffffff",
      node_color_palette=gp_politique_color,
      node_label_size=G.degree,
      node_size_range=[3,20],
      start_layout=10, default_edge_type="curve",
      label_font="Arial",
      edge_size_range=[0.1,1])\
    .add(node_size=lambda x:betweeness[x],
      node_color=dep2sigle,
      default_node_border_color="#ffffff",
      node_color_palette=gp_politique_color,
      node_label_size=lambda x:betweeness[x],
      start_layout=10, default_edge_type="curve",
      label_font="Arial",
      edge_size_range=[1,5],node_size_range=[3,20],name="Betweeness")\
      .add(node_size=lambda x:page_rank[x],
      node_color=dep2sigle,
      default_node_border_color="#ffffff",
      node_color_palette=gp_politique_color,
      node_label_size=lambda x:page_rank[x],
      start_layout=10, default_edge_type="curve",
      label_font="Arial",
      edge_size_range=[1,5],node_size_range=[3,20],name="Page Rank")

VBox(children=(HBox(children=(Sigma(nx.DiGraph with 863 nodes and 8,761 edges), Sigma(nx.DiGraph with 863 node…

### Content similarity

In [84]:
dataset_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
dataset_df = dataset_df.drop_duplicates("full_text")
dataset_df = dataset_df[~dataset_df.full_text.apply(lambda x : x.startswith("RT"))]
dataset_df = dataset_df[dataset_df.is_keywords]
dataset_df.head()

Unnamed: 0,username,full_text,date,in_reply_to_screen_name,in_reply_to_status_id_str,in_reply_to_user_id_str,retweet_id,retweet_username,retweet_user_id,is_quote_status,quoted_status_id_str,groupe_sigle,hashtag,is_hashtag,lemmatization,keywords_detected,is_keywords
375,mathieu-lefevre,Plus les impôts baissent et plus les recettes ...,2023-02-01 06:32:02+00:00,,,,,,,False,,REN,[],False,plus le impôt baisser et plus le recette de l’...,"[travail, recette, courage, impôt]",True
682,frederic-boccaletti,"Mme , ""apparemment il y a une partie du foncti...",2023-02-01 06:50:58+00:00,,,,,,,False,,RN,[#motionreferendaire],True,"mme , "" apparemment il y avoir un partie de fo...",[pouvoir],True
488,philippe-brun,Le prix de l'énergie est un élément essentiel ...,2023-02-01 07:02:50+00:00,,,,,,,False,,SOC,[],False,le prix de le énergie être un élément essentie...,"[entreprise, compétitivité, patron]",True
326,kevin-mauvieux,Comprenez : « jamais nous ne défendrons la Fra...,2023-02-01 07:10:59+00:00,,,,,,,True,1.6204937755158282e+18,RN,[#NonALaReformeDesRetraites],True,Comprenez : « jamais nous ne défendre le Franc...,"[vote, retraite, retrait, ratio, français, Fra...",True
453,gregoire-de-fournas,Des centaines d'amendements de la NUPES ont ét...,2023-02-01 07:14:30+00:00,,,,,,,True,1.6204937755158282e+18,RN,[],False,un centaine de amendement de le NUPES avoir êt...,"[vote, ratio]",True


In [85]:
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from umap import UMAP
from ipysigma import Sigma

In [86]:
count_tweet = dict(dataset_df.groupby("username").size())
user2groupe = dict(dataset_df["username groupe_sigle".split()].values)

In [87]:
df_corpus = dataset_df.groupby("username",as_index=False).agg({"lemmatization":lambda x : " ".join(x)})
df_corpus["groupe_sigle"] = df_corpus["username"].map(dep2sigle)
corpus = df_corpus.apply(lambda doc: gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(doc.lemmatization), [doc.username]),axis=1)

In [88]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=64, min_count=2, epochs=40,dm=1)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [93]:
user_vec = model.dv.vectors
user_vec_reduced = UMAP(random_state=seed).fit_transform(user_vec)
# fig = px.scatter(x=user_vec_reduced[:,0],y=user_vec_reduced[:,1],color=[user2groupe[user] for user in model.dv.index_to_key],size=[count_tweet[user] for user in model.dv.index_to_key],color_discrete_map=gp_politique_color,
#            text= model.dv.index_to_key, height=1000,opacity=0.9,size_max=40)
# fig

In [None]:
G = nx.Graph()
for ix,node in enumerate(model.dv.index_to_key):
    G.add_node(node)
size_func = lambda x:count_tweet[x]
Sigma(graph=G,layout={node:{"x":user_vec_reduced[ix,0],"y":user_vec_reduced[ix,1]}for ix,node in enumerate(model.dv.index_to_key)},
      node_size=size_func,node_color_palette=gp_politique_color,
      node_color=user2groupe,default_node_border_color="#efefef",hide_search=True,
      node_label_size=size_func)