<a href="https://colab.research.google.com/github/Jaimemorillo/electoral-prediction-madrid/blob/main/comunidades_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive (Colab only) so files under /content/gdrive can be read.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
pip install python-igraph



In [5]:
pip install leidenalg



In [6]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names, so use whichever name this
# installation actually provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available
              else 'seaborn-white')

import seaborn as sns
sns.set_theme()
sns.set_context("notebook")
import community
import igraph as ig
import leidenalg as la

# Leemos los datos

In [7]:
# Load the '~'-separated tweet export from the mounted Drive.
# error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines='warn' keeps the previous behaviour (malformed rows are
# skipped and reported as warnings).
df = pd.read_csv('/content/gdrive/MyDrive/Tweets_elecciones/resultados_TFG_complete_1.csv',
                 sep='~', encoding='utf-8', on_bad_lines='warn')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Coerce user_id and followers_count to numbers. Rows whose user_id cannot be
# parsed are dropped before the integer cast; unparseable follower counts
# simply become NaN.
df = (
    df.assign(user_id=lambda frame: pd.to_numeric(frame['user_id'], errors='coerce'))
      .dropna(subset=['user_id'])
      .assign(
          user_id=lambda frame: frame['user_id'].astype(int),
          followers_count=lambda frame: pd.to_numeric(frame['followers_count'], errors='coerce'),
      )
)

In [9]:
# Keep only the first 10 characters of both timestamps (presumably the
# YYYY-MM-DD date part). The .str accessor leaves missing values as NaN,
# whereas the previous x[0:10] lambda raised a TypeError on them.
df['created_at'] = df['created_at'].str[:10]
df['account_created_at'] = df['account_created_at'].apply(
    lambda x: str(x)[0:10] if str(x) != 'nan' else x)
# na=False drops rows without a creation date instead of putting NaN in the
# boolean mask; .copy() hands later cells an independent frame to assign on.
df = df[df['created_at'].str.startswith('2021', na=False)].copy()

In [10]:
def string_as_array(x):
  """Split a comma-separated cell value into a list of strings.

  Args:
    x: A scalar cell value (typically a string, or a missing value).

  Returns:
    ``np.nan`` when ``x`` is missing (None, NaN, pd.NA, NaT); otherwise
    ``str(x).split(",")``.
  """
  # pd.isna instead of `x is not np.nan`: the identity test only matches the
  # np.nan singleton, so float('nan') or np.float64('nan') used to come back
  # as the list ['nan']. is_scalar guards against array-like input, for which
  # pd.isna would return an array.
  if pd.api.types.is_scalar(x) and pd.isna(x):
    return np.nan
  return str(x).split(",")

In [11]:
# Turn the comma-separated text columns into Python lists (missing stays NaN).
for list_col in ('mentions_user_id', 'mentions_screen_name', 'hashtags'):
  df[list_col] = df[list_col].apply(string_as_array)

In [12]:
# Compute the edges and the nodes.
df_temp = df.loc[:, ['screen_name', 'retweet_screen_name']]
# Discard rows where an account retweets itself.
df_temp = df_temp.loc[df_temp['screen_name'].ne(df_temp['retweet_screen_name'])]

# Rows without a retweet target. TODO: decide what to do with isolated tweets.
nodes_no_conection = df_temp[df_temp['retweet_screen_name'].isna()]

# One row per (screen_name, retweet_screen_name) pair; 'weight' is the number
# of rows in df_temp for that pair. 'width' is a plain copy of 'weight'.
df_edge_list = (
    df_temp.groupby(['screen_name', 'retweet_screen_name'], as_index=False)
           .size()
           .rename(columns={'size': 'weight'})
)
df_edge_list['width'] = df_edge_list['weight']

# Optional sub-sampling, left disabled:
#df_edge_list = df_edge_list.sample(2000, random_state=9)
#df_edge_list = df_edge_list.head(20000)

# Number of edges arriving at each retweeted account, rescaled to [5, 50].
nodes_size = df_edge_list.groupby('retweet_screen_name', as_index=False).size()
size_scaler = MinMaxScaler(feature_range=(5, 50))
nodes_size['size'] = size_scaler.fit_transform(nodes_size[['size']].to_numpy())

#df_edge_list = df_edge_list.join(nodes.set_index('retweet_screen_name'), on='retweet_screen_name')
#df_edge_list = df_edge_list.sort_values('size', ascending=False)

In [13]:
# Compute the node attributes: one row per account, gathered from both the
# tweet authors and the accounts they retweeted.
df_temp1 = df[['screen_name','statuses_count','followers_count','friends_count',
               'created_at','favourites_count','account_created_at']]
# .copy() so the assignments below write to an independent frame rather than
# a slice of df (this is what triggered SettingWithCopyWarning).
df_temp2 = df[['retweet_screen_name','retweet_statuses_count',
               'retweet_followers_count','retweet_friends_count','created_at']].copy()
# These two fields are not selected for retweeted accounts, so fill them
# with NaN to match the author layout.
df_temp2['favourites_count'] = np.nan
df_temp2['account_created_at'] = np.nan
# Column order matches df_temp1 position by position, so a rename is enough.
df_temp2.columns = df_temp1.columns
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
df_temp = pd.concat([df_temp2, df_temp1]).reset_index(drop=True).dropna(subset=['screen_name'])
df_temp = df_temp.dropna(subset=['followers_count'])
# Author rows come last in the concatenation, so keep='last' prefers them
# over retweeted-account rows for the same screen name.
nodes = df_temp.drop_duplicates(subset=['screen_name'], keep='last').reset_index(drop=True)
# Days between the account creation date and the date of the retained tweet row.
nodes['days_old'] = (pd.to_datetime(nodes['created_at']) - pd.to_datetime(nodes['account_created_at'])).dt.days

# NOTE(review): only followers_count has had its NaN rows dropped; the integer
# casts below assume statuses_count and friends_count have no missing values.
nodes['statuses_count'] = nodes['statuses_count'].astype(int)
nodes['followers_count'] = nodes['followers_count'].astype(int)
nodes['friends_count'] = nodes['friends_count'].astype(int)
nodes['favourites_count'] = nodes['favourites_count'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [325]:
# Directed graph whose edges run from retweet_screen_name to screen_name.
G = nx.from_pandas_edgelist(
    df_edge_list,
    source='retweet_screen_name',
    target='screen_name',
    edge_attr=['weight', 'width'],
    create_using=nx.DiGraph(),
)

# 'size' is only available for accounts that appear as a retweet target.
nx.set_node_attributes(G, nodes_size.set_index('retweet_screen_name')['size'].to_dict(), 'size')

# Copy the per-account columns of `nodes` onto the graph, keyed by screen name.
nodes_by_name = nodes.set_index('screen_name')
for attr in ('statuses_count', 'followers_count', 'friends_count',
             'favourites_count', 'days_old'):
  nx.set_node_attributes(G, nodes_by_name[attr].to_dict(), attr)
# networkx.DiGraph.reverse()

In [326]:
# Node attributes as stored on the networkx graph, indexed by node name.
vertex_df_ini = pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index')

# The same edges as (source, target, weight) tuples for igraph.
tuples = list(map(tuple, df_edge_list[['retweet_screen_name', 'screen_name', 'weight']].to_numpy()))
g = ig.Graph.TupleList(tuples, directed=True, edge_attrs=['weight'])

# Align the attributes with igraph's vertex order before attaching them.
vertex_df_ini = g.get_vertex_dataframe().join(vertex_df_ini, on='name')
for attr in ('statuses_count', 'followers_count', 'friends_count',
             'favourites_count', 'days_old'):
  g.vs[attr] = vertex_df_ini[attr].values

# Comunidades

## Louvain

In [327]:
# One row per igraph vertex, in vertex order; community labels are added to
# this frame by the cells below.
vertex_df = g.get_vertex_dataframe()

In [328]:
# Undirected copy of the retweet graph, used as input for Louvain.
# NOTE(review): where both A->B and B->A exist, confirm which edge's
# 'weight' the undirected edge ends up with.
G_un = G.to_undirected()

In [329]:
# Louvain community detection; random_state is fixed so the run is repeatable.
partition = community.best_partition(G_un, random_state=10)

In [330]:
# Number of communities — assumes labels are contiguous integers starting at 0; TODO confirm.
max(partition.values()) + 1

793

In [331]:
# Attach the Louvain label of each vertex by looking its name up in `partition`.
# assign + map overwrites an existing 'community_lou' column, so the cell can
# be re-run; the previous join raised "columns overlap" on a second execution.
vertex_df = vertex_df.assign(community_lou=vertex_df['name'].map(partition))
nx.set_node_attributes(G, partition, 'community_lou')

In [332]:
# Share of vertices in each of the 8 largest Louvain communities.
vertex_df['community_lou'].value_counts(normalize=True).iloc[:8]

3     0.326982
2     0.238421
0     0.148968
4     0.059730
46    0.036041
6     0.032560
9     0.028774
12    0.018829
Name: community_lou, dtype: float64

In [312]:
# Fraction of all vertices covered by the 8 largest Louvain communities.
vertex_df['community_lou'].value_counts(normalize=True).iloc[:8].sum()

0.8903053728441943

In [316]:
# Look up a single account to see which community it was assigned to.
vertex_df.loc[vertex_df['name'] == 'Tonicanto1']

Unnamed: 0_level_0,name,statuses_count,followers_count,friends_count,favourites_count,days_old,community_lou
vertex ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
67,Tonicanto1,76849,393446,3064,10613.0,3732.0,0


In [334]:
# Most-followed accounts of Louvain community 0.
(
    vertex_df.loc[vertex_df['community_lou'] == 0]
             .sort_values('followers_count', ascending=False)
             .head(30)
)

Unnamed: 0_level_0,name,statuses_count,followers_count,friends_count,favourites_count,days_old,community_lou
vertex ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3862,EFEnoticias,261256,1909742,60,166.0,4112.0,0
155,europapress,441902,1401072,1277,3639.0,4067.0,0
73459,manuelvalls,17858,1052395,3132,3105.0,4258.0,0
30621,LucioQuincioC,1107231,869720,1216,192.0,3984.0,0
12271,informativost5,208735,795456,1246,241.0,4594.0,0
57966,colominaM,122750,736830,11638,494.0,3473.0,0
9905,OndaCero_es,171292,566023,794,378.0,3863.0,0
1586,elespanolcom,249543,430343,962,15513.0,2336.0,0
67,Tonicanto1,76849,393446,3064,10613.0,3732.0,0
55,GirautaOficial,36697,294705,1942,39327.0,2692.0,0


In [226]:
# Keep only the six largest Louvain communities in G.
# Note: this removes nodes from G in place, so G has to be rebuilt to get the
# full graph back.
keys = vertex_df['community_lou'].value_counts(normalize=True).index[:6].to_list()
nodes_to_delete = vertex_df.loc[~vertex_df['community_lou'].isin(keys), 'name'].values
G.remove_nodes_from(nodes_to_delete)

In [227]:
""" Write to GEXF """
# Use version 1.2draft so that Gephi does not show a deprecation warning
# when opening the file.
nx.write_gexf(G, "louvain.gexf", version="1.2draft")

## Leiden





In [277]:
# Leiden community detection on the directed igraph graph, with a fixed seed.
# NOTE(review): no `weights=` argument is passed here — confirm whether the
# edge 'weight' attribute is meant to be used.
partition = la.find_partition(g, la.ModularityVertexPartition, seed=96)

In [278]:
# Element and cluster counts of the Leiden partition.
partition.summary()

'Clustering with 88482 elements and 863 clusters'

In [279]:
# Positional assignment: relies on vertex_df rows being in the same order as
# the vertices of g (vertex_df was created from g.get_vertex_dataframe()).
vertex_df['community_leid'] = partition.membership[:]

In [280]:
# Share of vertices in each of the 8 largest Leiden communities.
vertex_df['community_leid'].value_counts(normalize=True).iloc[:8]

0    0.328135
1    0.308978
2    0.073484
3    0.065335
4    0.037228
5    0.036595
6    0.034188
7    0.022276
Name: community_leid, dtype: float64

In [281]:
# Fraction of all vertices covered by the 8 largest Leiden communities.
vertex_df['community_leid'].value_counts(normalize=True).iloc[:8].sum()

0.9062182138740084

In [290]:
# Most-followed accounts of Leiden community 7.
(
    vertex_df.loc[vertex_df['community_leid'] == 7]
             .sort_values('followers_count', ascending=False)
             .head(20)
)

Unnamed: 0_level_0,name,statuses_count,followers_count,friends_count,favourites_count,days_old,community_lou,community_leid
vertex ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1202,InesArrimadas,7123,695472,1516,1356.0,3302.0,12,7
188,CiudadanosCs,176192,520879,89192,42859.0,4485.0,12,7
24973,diariosevilla,168799,205380,589,3736.0,4388.0,9,7
12871,begonavillacis,18347,192664,1958,,,12,7
3766,diariolaopinion,187647,139839,1698,4959.0,4041.0,1,7
838,lugaricano,22739,86215,413,12860.0,4071.0,12,7
3018,jordi_canyas,47772,83109,5407,3926.0,3915.0,12,7
2806,ignacioaguado,13002,72723,747,7575.0,4002.0,12,7
19214,Felisuco_,31302,59550,1397,7411.0,3756.0,12,7
20034,diariodeburgos,67456,57401,839,2655.0,4393.0,12,7


## Clauset-Newman-Moore (fast greedy)

In [335]:
# Work on a copy because to_undirected() is applied to the graph object itself
# (its return value is not used).
# NOTE(review): check whether edge attributes such as 'weight' survive the
# conversion with the default arguments.
g_un = g.copy()
g_un.to_undirected()

In [336]:
# Fast-greedy community detection on the undirected copy. The result is
# converted with as_clustering() in the cells below before it is used.
partition = g_un.community_fastgreedy()

In [337]:
# Element and cluster counts of the fast-greedy clustering.
partition.as_clustering().summary()

'Clustering with 88482 elements and 1187 clusters'

In [338]:
# Positional assignment: relies on vertex_df rows being in the same order as
# the vertices of g_un, which is a copy of g.
vertex_df['community_fast'] = partition.as_clustering().membership[:]

In [348]:
# Share of vertices in each of the 4 largest fast-greedy communities.
vertex_df['community_fast'].value_counts(normalize=True).iloc[:4]

1    0.349133
0    0.332056
6    0.068545
4    0.066047
Name: community_fast, dtype: float64

In [349]:
# Fraction of all vertices covered by the 4 largest fast-greedy communities.
vertex_df['community_fast'].value_counts(normalize=True).iloc[:4].sum()

0.815781740919057

In [350]:
# Most-followed accounts of fast-greedy community 4.
(
    vertex_df.loc[vertex_df['community_fast'] == 4]
             .sort_values('followers_count', ascending=False)
             .head(20)
)

Unnamed: 0_level_0,name,statuses_count,followers_count,friends_count,favourites_count,days_old,community_lou,community_fast
vertex ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5893,A3Noticias,323983,1985459,578,786.0,3994.0,6,4
12271,informativost5,208735,795456,1246,241.0,4594.0,0,4
15666,DiarioSUR,275490,267532,6529,17334.0,4460.0,0,4
31910,MadridCFyB,43432,238500,59021,650.0,4060.0,0,4
75079,mejoreszasca,18341,211512,475,18099.0,2249.0,0,4
7211,TheObjective_es,319301,80709,437,48913.0,2840.0,6,4
806,hazteoir,68176,56006,1476,24612.0,4662.0,2,4
29842,LibertadSurja,64258,53620,493,33596.0,3618.0,0,4
66210,Sr_Fonseca,11869,52068,1635,,,0,4
45,pons_sabate,4294,48633,1323,18726.0,706.0,0,4
