Héctor Asorey

# TFM: Impacto en Redes Sociales de ChatGPT

## Importar librerías

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

#nltk.download('stopwords')

## Leer datos

In [None]:
df = pd.read_csv('TwitterChatGPT.csv')

In [None]:
df.head(10)

In [None]:
df.shape

In [None]:
cantidad_na_por_columna = df.isna().sum()

# Mostrar los resultados
print(cantidad_na_por_columna)

In [None]:
df = df.dropna()

In [None]:
df.shape

## Cálculo métricas adicionales

### Número de emoticonos

In [None]:
import emoji

def countEmojis(text):
    """Return how many characters of ``text`` are keys of ``emoji.EMOJI_DATA``.

    The text is inspected one character at a time, so every character that is
    itself listed in ``emoji.EMOJI_DATA`` adds one to the total.
    """
    return sum(1 for symbol in text if symbol in emoji.EMOJI_DATA)

In [None]:
df['number_emojis'] = list(map(countEmojis, df['content']))

### Número de símbolos

In [None]:
import string

def contar_simbolos_de_puntuacion(texto):
    """Count the punctuation characters contained in ``texto``.

    A character counts when it belongs to ``string.punctuation`` or is one of
    the Spanish opening marks '¿' and '¡'.
    """
    simbolos = string.punctuation + '¿¡'
    total = 0
    for caracter in texto:
        if caracter in simbolos:
            total += 1
    return total


In [None]:
# Count punctuation with the punctuation helper; the previous version called
# countEmojis here, so this column merely duplicated 'number_emojis'.
df['number_punctuation'] = list(map(contar_simbolos_de_puntuacion, df['content']))

### Número de tweets de cada usuario

In [None]:
# Attach to every tweet the total number of tweets written by its author.
tweets_por_usuario = df['username'].value_counts()
df['num_tweets'] = df['username'].map(tweets_por_usuario)

### Menciones de cada tweet

In [None]:
def extraer_menciones(texto):
    """Return every ``@handle`` found in ``texto``, in order, '@' included."""
    return re.findall(r'@\w+', texto)

In [None]:
df['mentions'] = df['content'].apply(extraer_menciones)
df['mentions'] = df['mentions'].apply(lambda mentions: [mention[1:] for mention in mentions])

### Número de menciones de cada tweet

In [None]:
# Count the mentions actually extracted into df['mentions'] (mirrors how
# 'num_hashtags' is derived) instead of every raw '@' character, which also
# counted a lone '@' or an e-mail address as a mention.
df['num_mentions'] = df['mentions'].apply(len)

### Hashtags de cada tweet

In [None]:
def extraer_hashtags(texto):
    """Return every ``#hashtag`` found in ``texto``, in order, '#' included."""
    return re.findall(r'#\w+', texto)

In [None]:
df['hashtags'] = df['content'].apply(extraer_hashtags)

### Número de hashtags de cada tweet

In [None]:
df['num_hashtags'] = df['hashtags'].apply(lambda x: len(x))

### Texto limpio de enlaces, emojis, hashtags, menciones... y eliminado de "Stopwords"

In [None]:
def limpieza_texto_tweets(text):
    """Normalise a raw tweet into lowercase ASCII words separated by single spaces.

    Steps: lowercase; turn commas into spaces and drop full stops; strip the
    accents of ç, ñ and the Spanish vowels; replace remaining non-ASCII runs
    with a space; remove ``<...>`` tags, links, hashtags, mentions, literal
    ``\\x..`` escapes, punctuation and underscores; drop the stand-alone
    retweet marker ``rt``; finally collapse whitespace and trim.

    Args:
        text: Raw tweet content (``str``).

    Returns:
        The cleaned text; an empty string when nothing is left.
    """
    accent_table = str.maketrans('çñáéíóú', 'cnaeiou')

    text = text.lower()
    text = text.replace(',', ' ').replace('.', '')
    text = text.translate(accent_table)
    # Anything outside \x11-\x7F (emojis, other scripts, tabs, newlines) becomes a space.
    text = re.sub(r'[^\x11-\x7F]+', ' ', text)
    # Each tag is matched on its own: a greedy '<.*>' would also delete
    # the words sitting between two separate tags.
    text = re.sub(r'<[^>]*>', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'\\x\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.replace('_', '')
    # Only the stand-alone token is a retweet marker; without the word
    # boundaries "smart" or "start" would lose their "rt".
    text = re.sub(r'\brt\b', '', text)
    # Whitespace is collapsed last so that no removal above leaves double spaces.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [None]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the words whose lowercase form is in ``stop_words``.

    The text is split on whitespace and the surviving words are re-joined with
    single spaces, keeping their original order and casing.
    """
    kept = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
df['text'] = df['content'].apply(limpieza_texto_tweets)

In [None]:
df['text'] = df['text'].apply(remove_stopwords)

### Sentiment Analysis de cada tweet

In [None]:
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

In [None]:
def sentimentWithSpacy(x):
    """Label a text as 1 (positive) or 0 using the spacytextblob polarity.

    The polarity read from ``doc._.blob`` is rounded to two decimals; only a
    strictly positive rounded value yields 1, so zero polarity is labelled 0
    together with negative polarity.
    """
    polarity = round(nlp(x)._.blob.polarity, 2)
    return 1 if polarity > 0 else 0

In [None]:
df['sentiment'] = df['text'].apply(sentimentWithSpacy)

In [None]:
df.head(10)

In [None]:
df.to_csv('prueba1.csv', index=False)

In [None]:
df2 = df[['username', 'mentions', 'hashtags']]

In [None]:
df_expanded = df2.explode('mentions').explode('hashtags').reset_index(drop=True)

In [None]:
df_subset = df_expanded.head(30000)

## Grafos con Python

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Toy DataFrame that illustrates the layout the graph code expects
data = {
    'user': ['user1', 'user2', 'user3'],
    'mentions': [['user2', 'user3'], ['user1'], ['user2']],
    'hashtags': [['#python', '#data'], ['#networking'], ['#data']],
}

dataframe = pd.DataFrame(data)

# The exploded toy frame gets its own name so that it no longer overwrites the
# real `df_expanded` built from the tweets in an earlier cell.
sample_expanded = dataframe.explode('mentions').explode('hashtags').reset_index(drop=True)

# Directed multigraph: one edge per (user, mention, hashtag) row of df_subset
G = nx.MultiDiGraph()

for _, row in df_subset.iterrows():
    user = row['username']
    mention = row['mentions']
    hashtag = row['hashtags']

    G.add_node(user)

    # explode() leaves NaN where a tweet had no mentions; such rows only
    # contribute the author node, never a NaN node or an edge towards it.
    if pd.isna(mention):
        continue

    # The hashtag attribute is attached only when the row actually has one.
    edge_attrs = {} if pd.isna(hashtag) else {'hashtag': hashtag}
    G.add_edge(user, mention, **edge_attrs)

# nx.hits returns (hubs, authorities); only the hub scores are reported here
hub_score, _ = nx.hits(G)

# Print the hub score of every node
print("Hub Score:")
for node, score in hub_score.items():
    print(f"Nodo {node}: {score}")

nx.draw(G, with_labels=True)

In [None]:
nx.write_graphml(G, "graph.graphml")

In [None]:
import community

# Community detection runs on the undirected version of the graph
partition = community.best_partition(G.to_undirected())

# Draw the graph coloured by community
pos = nx.spring_layout(G)

plt.figure(figsize=(12, 8))
# Colours are looked up node by node so they always follow G's own node order.
node_colors = [partition[node] for node in G.nodes()]
# `with_labels` was previously split into "with_la bels", a syntax error.
nx.draw(G, pos, with_labels=True, node_color=node_colors, cmap=plt.cm.rainbow)
plt.show()

Se va a optar por utilizar la base de datos orientada a grafos Neo4J y la herramienta de visualización Gephi para todo el tema relacionado con éstos

## Grafos con Python y Neo4J

In [None]:
from py2neo import Graph, Node, Relationship

In [None]:
graph = Graph("bolt://localhost:7687", auth=("neo4j", "TwitterChatGPT"), name="twitterchatgpt")

In [None]:
from py2neo import Graph

# Connectivity check: run a Cypher query that matches every node in the database
result = graph.run("MATCH (n) RETURN n;")

# evaluate() is called on the returned cursor.
# NOTE(review): despite the variable name, this presumably holds the first
# matched node (or None on an empty database), not a database name — confirm.
database_name = result.evaluate()

# Show whatever the query returned
print(database_name)

### Grafo1 (Usuario -- Publica --> Tweets, Usuario -- Menciona --> Usuario)

In [None]:
from py2neo import Graph, Node
import pandas as pd

# Assumes `graph` is the connected py2neo Graph and `df` the processed tweets.
# Every value is sent as a Cypher parameter instead of being pasted into the
# query text, so quotes or other special characters in the data cannot break
# or alter the statement.

# One User node per distinct author or mentioned account
user_set = set(df['username']).union(
    mention for mentions in df['mentions'].dropna() for mention in mentions
)
for user in user_set:
    graph.run("MERGE (u:User {name: $name})", {"name": user})

# Tweet nodes plus the PUBLISHES / MENTIONS relationships
for _, row in df.iterrows():
    # The author publishes the tweet
    graph.run(
        """
        MERGE (u:User {name: $username})
        MERGE (t:Tweet {text: $text})
        MERGE (u)-[:PUBLISHES]->(t)
        """,
        {"username": row['username'], "text": row['text']},
    )

    # The author mentions each user named in the tweet
    for mentioned_user in row['mentions']:
        graph.run(
            """
            MERGE (u:User {name: $username})
            MERGE (mu:User {name: $mentioned})
            MERGE (u)-[:MENTIONS]->(mu)
            """,
            {"username": row['username'], "mentioned": mentioned_user},
        )


### Grafo 2 (Usuario -- Publica --> Tweets -- Menciona --> Usuarios)

In [None]:
dataframe_completo = df

In [None]:
df = dataframe_completo.head(10000)

In [None]:
#data = df
df = df.fillna('')

In [None]:
from py2neo import Graph, Node

def _to_native(value):
    """Turn a NumPy scalar into the matching plain Python value; pass anything else through."""
    return value.item() if hasattr(value, 'item') else value


# User nodes: one MERGE per distinct author. The node is matched on its name
# only and tweet_count is set afterwards, so a User that already exists
# without that property is updated instead of duplicated.
for _, row in df.drop_duplicates(subset='username').iterrows():
    graph.run(
        "MERGE (u:User {name: $name}) SET u.tweet_count = $tweet_count",
        {"name": row['username'], "tweet_count": _to_native(row['num_tweets'])},
    )

# Tweet nodes. All values travel as Cypher parameters rather than being
# formatted into the query text.
for _, row in df.iterrows():
    tweet_params = {
        "username": row['username'],
        "text": row['text'],
        "date": str(row['date']),
        "like_count": _to_native(row['like_count']),
        "retweet_count": _to_native(row['retweet_count']),
        "number_emojis": _to_native(row['number_emojis']),
        "number_punctuation": _to_native(row['number_punctuation']),
        "num_mentions": _to_native(row['num_mentions']),
        # Stored as one comma-separated string, as before
        "hashtags": ','.join(row['hashtags']),
        "num_hashtags": _to_native(row['num_hashtags']),
        # Stored as a string, as before
        "sentiment": str(row['sentiment']),
    }

    # PUBLISHES relationship between the author and the tweet
    graph.run(
        """
        MATCH (u:User {name: $username})
        MERGE (t:Tweet {text: $text, date: $date, like_count: $like_count, retweet_count: $retweet_count,
                        number_emojis: $number_emojis, number_punctuation: $number_punctuation, num_mentions: $num_mentions,
                        hashtags: $hashtags, num_hashtags: $num_hashtags, sentiment: $sentiment})
        MERGE (u)-[:PUBLISHES]->(t)
        """,
        tweet_params,
    )

    # MENTIONS relationship from the tweet to every mentioned user that
    # already exists as a User node
    for mentioned_user in row['mentions']:
        graph.run(
            """
            MATCH (t:Tweet {text: $text})
            MATCH (mu:User {name: $mentioned})
            MERGE (t)-[:MENTIONS]->(mu)
            """,
            {"text": row['text'], "mentioned": mentioned_user},
        )


In [None]:
# For every User -> Tweet -> User path (PUBLISHES followed by MENTIONS), make
# sure a direct MENTIONS_DIRECTLY relationship exists between the two users
# and copy the tweet's text and hashtags onto that relationship.
query = """
MATCH (u1:User)-[:PUBLISHES]->(t:Tweet)-[:MENTIONS]->(u2:User)
MERGE (u1)-[m:MENTIONS_DIRECTLY]->(u2)
SET m.text = t.text, m.hashtags = t.hashtags
"""

graph.run(query)

### Añadir a Neo4J el sentimiento general de los usuarios

Leer el dataframe con todo (embeddings y sentimientos) y hacer las transformaciones necesarias para que sea valido

In [None]:
df = pd.read_csv('datasetConVaderV2.csv')

In [None]:
import ast

df["hashtags"] = df['hashtags'].apply(ast.literal_eval)
df["mentions"] = df['mentions'].apply(ast.literal_eval)
df = df.fillna('')


In [None]:
df.head(5)

In [None]:
df.shape

Función general

In [None]:
overall_sentiment = df.groupby('username')['sentiment'].mean().round().astype(int)

# Merge the overall sentiment back to the original DataFrame
df = pd.merge(df, overall_sentiment, how='left', on='username', suffixes=('', '_overall'))


In [None]:
def custom_round(x):
    """Map a score to -1, 0 or 1 using inclusive cut-offs at -0.33 and 0.33.

    Values at or below -0.33 give -1, values at or above 0.33 give 1 and
    everything in between gives 0.
    """
    if x <= -0.33:
        return -1
    return 1 if x >= 0.33 else 0

# Ejemplo de uso:
resultado = custom_round(-0.33)
resultado

Función para calcular el overall de los sentimientos con Vader si queremos que haya sentimiento "neutral"

In [None]:
overall_sentiment = df.groupby('username')['sentimentVaderWithNeutral'].mean().round().astype(int)

# Merge the overall sentiment back to the original DataFrame
df = pd.merge(df, overall_sentiment, how='left', on='username', suffixes=('', '_overallV2'))

Función para calcular el overall de los sentimientos con Vader personalizado con sentimiento neutral

Con esta función se fuerza a que el neutro sea realmente neutro

In [None]:
# Per-user mean of the Vader label, mapped to -1 / 0 / 1 with custom_round
overall_sentiment = df.groupby('username')['sentimentVaderWithNeutral'].mean().apply(custom_round).astype(int)
# Remove the column left behind by an earlier run (or by the plain-rounding
# cell), otherwise the merge below would collide with the existing
# 'sentimentVaderWithNeutral_overallV2' column.
df = df.drop(columns=['sentimentVaderWithNeutral_overallV2'], errors='ignore')
df = pd.merge(df, overall_sentiment, how='left', on='username', suffixes=('', '_overallV2'))

In [None]:
data = df.head(100000) #50000 Probar con 100000 
#En la base de datos de Neo4J hay 100000 usuarios

In [None]:
data = data.tail(30000)

In [None]:
# The set of authors is built once; it used to be rebuilt for every row,
# which made this step quadratic in the number of tweets.
# NOTE(review): this keeps mentioned users that DO appear in data['username'];
# later cells describe them as users who have not published — confirm which
# of the two is intended.
publishing_users = set(data['username'])
mentioned_users = data['mentions'].apply(lambda mentions: set(mentions).intersection(publishing_users))
# Keep the rows with at least one match and take a single user from each set
mentioned_users = mentioned_users[mentioned_users.apply(len) > 0].apply(lambda users: next(iter(users)))

Función general

In [None]:

# Collect one result row per distinct user in `mentioned_users`. The Series
# can hold the same user many times; iterating the unique values avoids
# repeated scans and duplicated rows in result_df.
# NOTE(review): the original comment described these as mentioned users who
# have not published tweets, but the cell that builds `mentioned_users`
# intersects the mentions with data['username'] — confirm the intent.
new_rows = []

for user in pd.unique(mentioned_users):
    # Tweets whose mention list contains this user
    user_mentions = data[data['mentions'].apply(lambda x: user in x)]

    # Rounded mean sentiment of the tweets that mention the user
    overall_sentiment = user_mentions['sentiment'].mean().round()

    new_rows.append({'username': user, 'overall_sentiment': overall_sentiment})

# Results as a DataFrame with 'username' and 'overall_sentiment' columns
result_df = pd.DataFrame(new_rows)

Función especializada para el caso de Vader con sentimiento "neutral"

In [None]:
#0.05
# Collect one result row per distinct user in `mentioned_users` (the Series
# can repeat users; iterating the unique values avoids duplicated rows).
# NOTE(review): the original comment described these as mentioned users who
# have not published tweets, but the cell that builds `mentioned_users`
# intersects the mentions with data['username'] — confirm the intent.
new_rows = []

for user in pd.unique(mentioned_users):
    # Tweets whose mention list contains this user
    user_mentions = data[data['mentions'].apply(lambda x: user in x)]

    # The mean is computed once and mapped to -1 / 0 / 1 with custom_round,
    # the same thresholding applied to publishing users. Unlike the previous
    # strict comparisons, a mean of exactly ±0.33 now falls on the ±1 side.
    overall_sentiment = custom_round(user_mentions['sentimentVaderWithNeutral'].mean())

    new_rows.append({'username': user, 'overall_sentiment': overall_sentiment})

# Results as a DataFrame with 'username' and 'overall_sentiment' columns
result_df = pd.DataFrame(new_rows)

In [None]:
result_df

In [None]:
result_df['overall_sentiment'].value_counts()

In [None]:
data['sentimentVaderWithNeutral_overallV2'].value_counts()

In [None]:
data = df

Añadir sentimiento a los usuarios que publican

In [None]:
from py2neo import Graph, Node

# Assumes `graph` is the connected py2neo Graph and `data` carries the
# per-user 'sentiment_overall' column produced by the merge above.

# Every row of a user holds the same overall value, so one update per distinct
# username is enough. Values are passed as Cypher parameters instead of being
# formatted into the query text.
for _, row in data.drop_duplicates(subset='username').iterrows():
    graph.run(
        "MATCH (u:User {name: $name}) SET u.overall_sentiment = $sentiment",
        {"name": row['username'], "sentiment": int(row['sentiment_overall'])},
    )


Añadir sentimiento Vader con "neutral" a los usuarios que publican

In [None]:
from py2neo import Graph, Node

# Assumes `graph` is the connected py2neo Graph and `data` carries a per-user
# 'sentimentVaderWithNeutral_overall' column.
# NOTE(review): the merges in this notebook use the '_overallV2' suffix, so
# this column presumably has to come from the CSV itself — confirm the name.

# One update per distinct username; values are passed as Cypher parameters
# instead of being formatted into the query text.
for _, row in data.drop_duplicates(subset='username').iterrows():
    graph.run(
        "MATCH (u:User {name: $name}) SET u.overall_sentiment_vader_with_neutral = $sentiment",
        {"name": row['username'], "sentiment": int(row['sentimentVaderWithNeutral_overall'])},
    )


Añadir sentimiento a los usuarios mencionados que no publican

In [None]:
from py2neo import Graph, Node

# Assumes `graph` is the connected py2neo Graph and `result_df` holds the
# 'username' / 'overall_sentiment' rows computed for the mentioned users.

# One update per distinct username, with the values passed as Cypher
# parameters. The score is stored as an integer, matching the integer
# overall_sentiment written for publishing users (the rounded mean is a float).
for _, row in result_df.drop_duplicates(subset='username').iterrows():
    graph.run(
        "MATCH (u:User {name: $name}) SET u.overall_sentiment = $sentiment",
        {"name": row['username'], "sentiment": int(row['overall_sentiment'])},
    )


In [None]:
df.shape

Añadir sentimiento vader con "neutral" a los usuarios mencionados que no publican

In [None]:
from py2neo import Graph, Node

# Assumes `graph` is the connected py2neo Graph and `result_df` holds the
# 'username' / 'overall_sentiment' rows computed with the Vader-with-neutral
# variant for the mentioned users.

# One update per distinct username, with the values passed as Cypher
# parameters instead of being formatted into the query text.
for _, row in result_df.drop_duplicates(subset='username').iterrows():
    graph.run(
        "MATCH (u:User {name: $name}) SET u.overall_sentiment_vader_with_neutral = $sentiment",
        {"name": row['username'], "sentiment": int(row['overall_sentiment'])},
    )


## Spacy en_core_web_lg

In [None]:
data = df

In [None]:
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('spacytextblob')

def sentimentWithSpacyLarge(x):
    """Label a text as 1 (positive) or 0 using the pipeline currently bound to ``nlp``.

    The polarity read from ``doc._.blob`` is rounded to two decimals; only a
    strictly positive rounded value yields 1, so zero polarity is labelled 0
    together with negative polarity.
    """
    polarity = round(nlp(x)._.blob.polarity, 2)
    return 1 if polarity > 0 else 0

In [None]:
data['sentimentLg'] = data['text'].apply(sentimentWithSpacyLarge)

In [None]:
data

In [None]:
data['different_sentiments'] = data['sentiment'] != data['sentimentLg']

different_sentiments_count = data['different_sentiments'].sum()

print(f"Number of rows with different sentiments: {different_sentiments_count}")
print(df)

Como parecen devolver los mismos valores, no actualizaremos la base de datos de Neo4J con los resultados del modelo "en_core_web_lg"

## VADER

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def sentiment_scores_with_neutral(sentence):
    """Classify ``sentence`` with VADER as 1 (positive), -1 (negative) or 0 (neutral).

    The compound score decides the label: ``>= 0.05`` is positive,
    ``<= -0.05`` is negative and anything in between is neutral.
    """
    # The analyzer is created on first use and kept on the function object,
    # instead of constructing a new SentimentIntensityAnalyzer for every tweet.
    analyzer = getattr(sentiment_scores_with_neutral, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_scores_with_neutral._analyzer = analyzer

    compound = analyzer.polarity_scores(sentence)['compound']

    if compound >= 0.05:
        return 1
    if compound <= -0.05:
        return -1
    return 0

In [None]:
def sentiment_scores_without_neutral(sentence):
    """Classify ``sentence`` with VADER as 1 (compound score >= 0) or 0 (otherwise)."""
    # The analyzer is created on first use and kept on the function object,
    # instead of constructing a new SentimentIntensityAnalyzer for every tweet.
    analyzer = getattr(sentiment_scores_without_neutral, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        sentiment_scores_without_neutral._analyzer = analyzer

    compound = analyzer.polarity_scores(sentence)['compound']
    return 1 if compound >= 0 else 0

In [None]:
df["sentimentVaderWithNeutral"] = df['text'].apply(sentiment_scores_with_neutral)
df["sentimentVaderWithoutNeutral"] = df['text'].apply(sentiment_scores_without_neutral)

In [None]:
df.to_csv('datasetConVader.csv', index=False)

## Creación de embeddings

Lo primero es leer el dataset creado anteriormente si no está cargado en memoria

In [None]:
df = pd.read_csv('prueba1.csv')

import ast

df["hashtags"] = df['hashtags'].apply(ast.literal_eval)
df["mentions"] = df['mentions'].apply(ast.literal_eval)
df = df.fillna('')

df = df.head(100000)

Descarga del modelo

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# Encode all texts in a single batched call instead of invoking the model once
# per row; list() splits the result into one vector per tweet.
df['embeddings'] = list(model.encode(df['text'].tolist()))

In [None]:
df.to_csv('datasetConEmbeddings.csv', index=False)

## Exploración de los Embeddings

### DBSCAN

Debido al coste computacional, al igual que la parte de Neo4J, solo se ejecutará con los 100000 primeros registros...

In [None]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from matplotlib import pyplot as plt
import numpy as np

In [None]:
import ast

df = pd.read_csv('datasetConVader.csv')
df["hashtags"] = df['hashtags'].apply(ast.literal_eval)
df["mentions"] = df['mentions'].apply(ast.literal_eval)
df = df.fillna('')
df['embeddings'] = list(map(lambda x: [float(num) for num in x.strip('[]').split()], df['embeddings']))

In [None]:
# Work on an independent copy of the first 100000 rows: a later cell adds the
# 'DBSCAN_labels' column to `data`, which must not be a view of `df`.
data = df.head(100000).copy()
del(df)

In [None]:
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data['embeddings'].to_list())

In [None]:
normalized_data

In [None]:
# k-distance plot used to pick eps for DBSCAN
neighbors = NearestNeighbors(n_neighbors=25)
neighbors_fit = neighbors.fit(normalized_data)
distances, indices = neighbors_fit.kneighbors(normalized_data)
distances = np.sort(distances, axis=0)
# Plot the distance to the farthest of the 25 requested neighbours (last
# column). Column 1, used before, is only the closest other point, which made
# the 25-neighbour query pointless and under-estimates a suitable eps.
distances = distances[:, -1]
plt.plot(distances)

In [None]:
eps = 20  
min_samples = 25 #Valores a utilizar: 50 
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(normalized_data)

In [None]:
data['DBSCAN_labels'] = labels

In [None]:
np.unique(labels, return_counts=True)

In [None]:
# Number of tweets printed for each cluster
samples_per_label = 10

# Iterate over each unique value in the DBSCAN column
for label in data['DBSCAN_labels'].unique():
    # Filter DataFrame to include only samples with the current DBSCAN label
    filtered_df = data[data['DBSCAN_labels'] == label]
    
    # Display the first `samples_per_label` original tweets of the current label
    print(f"DBSCAN Label: {label}")
    print(filtered_df['content'].head(samples_per_label).values)
    print()

Eliminamos aquellos usuarios que parecen bots

In [None]:
# Keep the tweets outside the clusters identified as bots. The explicit copy
# makes the result independent of `data`, so the label column added to it
# later does not trigger SettingWithCopy problems.
filtered_df_by_labels = data[~data['DBSCAN_labels'].isin([1, 2, 4, 7])].copy()

Volvemos a aplicar DBSCAN, habiendo quitado las comunidades de bots, pero esta vez con más características aparte de solo el contenido de tweet

In [None]:
filtered_df_by_labels_v2 = filtered_df_by_labels.drop(['DBSCAN_labels', 'content', 'date', 'id', 'username', 'mentions', 'hashtags', 'text'], axis = 1)

In [None]:
filtered_df_by_labels_v2

Separamos los embeddings en distintas columnas en vez de estar almacenados en una lista (esto es para poder aplicar los modelos)

In [None]:
# One column per embedding dimension. Building the frame from the list of
# vectors gives the same columns as .apply(pd.Series) without creating a
# Series object per row; the original index is kept for the later concat.
new_df = pd.DataFrame(filtered_df_by_labels['embeddings'].tolist(), index=filtered_df_by_labels.index)
new_df.columns = ['embeddings{}'.format(x) for x in new_df.columns]

In [None]:
dataframe_to_analyze = pd.concat([filtered_df_by_labels_v2, new_df], axis=1)
del(new_df)
dataframe_to_analyze = dataframe_to_analyze.drop(['embeddings'], axis=1)

Normalizar el dataframe

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 3000))

# normalize the DataFrame
dataframe_to_analyze_norm = pd.DataFrame(scaler.fit_transform(dataframe_to_analyze), columns=dataframe_to_analyze.columns)

Si queremos utilizar PCA para hacer más sencilla la computación

In [None]:
from sklearn.decomposition import PCA

embedding_features = dataframe_to_analyze_norm.iloc[:, 10:]

pca = PCA(n_components=70) 
embedding_features_reduced = pca.fit_transform(embedding_features)

explained_variance_ratio = pca.explained_variance_ratio_

cumulative_variance = 0
for i, ratio in enumerate(explained_variance_ratio):
    cumulative_variance += ratio
    print(f"Variance explained by PC{i+1}: {ratio:.2%} (Cumulative: {cumulative_variance:.2%})")

In [None]:
import gc

del(dataframe_to_analyze)
del(filtered_df_by_labels_v2)

gc.collect()

In [None]:
pca_embedding_df = pd.DataFrame(embedding_features_reduced, columns=[f'PCA_{i}' for i in range(1, 71)])

dataframe_to_analyze_norm_reduced = dataframe_to_analyze_norm.drop(dataframe_to_analyze_norm.columns[10:], axis=1)
dataframe_reduced = pd.concat([dataframe_to_analyze_norm_reduced, pca_embedding_df], axis=1)

In [None]:
dataToAnalyze = dataframe_reduced.values

In [None]:
del dataframe_reduced

gc.collect()

Si queremos utilizar todos los embeddings

In [None]:
dataToAnalyze = dataframe_to_analyze_norm.values

Cambiamos de dataframe a lista de números (esto es porque con DataFrame falta memoria)

Aplicar DBSCAN

In [None]:
# k-distance plot used to pick eps for DBSCAN
neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(dataToAnalyze)
distances, indices = neighbors_fit.kneighbors(dataToAnalyze)
distances = np.sort(distances, axis=0)
# Plot the distance to the farthest of the 20 requested neighbours (last
# column). Column 1, used before, is only the closest other point, which made
# the 20-neighbour query pointless and under-estimates a suitable eps.
distances = distances[:, -1]
plt.plot(distances)

In [None]:
eps = 6000  
min_samples = 50 #Valores a utilizar: 50 
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(dataToAnalyze)

In [None]:
np.unique(labels, return_counts=True)

En caso de que ya existan las etiquetas (porque se quieran ver las diferencias del DBSCAN entre usar PCA o no)

In [None]:
#Ejecutar la línea de debajo solo en caso de que se haya guardado una columna llamada DBSCAN_labels_after
#filtered_df_by_labels = filtered_df_by_labels.drop(['DBSCAN_labels_after'], axis = 1)
filtered_df_by_labels.loc[:, 'DBSCAN_labels_after'] = labels

Después de eliminar los bots, miramos los grupos formados

In [None]:
samples_per_label = 20

# Iterate over each unique value in the DBSCAN column
for label in filtered_df_by_labels['DBSCAN_labels_after'].unique():
    # Filter DataFrame to include only samples with the current DBSCAN label
    filtered_df = filtered_df_by_labels[filtered_df_by_labels['DBSCAN_labels_after'] == label]
    
    # Display 20 text samples from the current DBSCAN label
    print(f"DBSCAN Label: {label}")
    print(filtered_df['content'].head(samples_per_label).values)
    print()

Sigue habiendo bots, por tanto, los volvemos a eliminar

Debemos fijarnos en los grupos que son de bots para ejecutar la línea siguiente

In [None]:
# Second bot-removal pass. The explicit copy keeps the result independent of
# filtered_df_by_labels (deleted right after), so the label column added to it
# later does not trigger SettingWithCopy problems.
filtered_df_by_labels_v3 = filtered_df_by_labels[~filtered_df_by_labels['DBSCAN_labels_after'].isin([1, 2, 3, 4, 5, 6])].copy()

In [None]:
del filtered_df_by_labels

gc.collect()

Volvemos a aplicar todos los pasos anteriores para aplicar DBSCAN, a ver si se han eliminado definitivamente los bots

In [None]:
# Drop identifiers, text and BOTH label columns. 'DBSCAN_labels_after' was
# previously left in, so the cluster ids of the first pass were scaled and fed
# to the second DBSCAN as if they were a feature.
filtered_df_by_labels_v4 = filtered_df_by_labels_v3.drop(['DBSCAN_labels', 'DBSCAN_labels_after', 'content', 'date', 'id', 'username', 'mentions', 'hashtags', 'text'], axis = 1)

# One column per embedding dimension
new_df = filtered_df_by_labels_v3['embeddings'].apply(pd.Series)
new_df.columns = ['embeddings{}'.format(x) for x in new_df.columns]

dataframe_to_analyze = pd.concat([filtered_df_by_labels_v4, new_df], axis=1)
del(new_df)
dataframe_to_analyze = dataframe_to_analyze.drop(['embeddings'], axis=1)

# Re-fit the scaler defined in an earlier cell on the reduced data
dataframe_to_analyze_norm = pd.DataFrame(scaler.fit_transform(dataframe_to_analyze), columns=dataframe_to_analyze.columns)

dataToAnalyze = dataframe_to_analyze_norm.values

Si se quiere utilizar PCA, ejecutar la siguiente celda

In [None]:
from sklearn.decomposition import PCA

# Everything after the first 10 columns is treated as embedding dimensions
embedding_features = dataframe_to_analyze_norm.iloc[:, 10:]

pca = PCA(n_components=70)
embedding_features_reduced = pca.fit_transform(embedding_features)

explained_variance_ratio = pca.explained_variance_ratio_

# Report the variance explained by each component and the running total
cumulative_variance = 0
for i, ratio in enumerate(explained_variance_ratio):
    cumulative_variance += ratio
    print(f"Variance explained by PC{i+1}: {ratio:.2%} (Cumulative: {cumulative_variance:.2%})")

# Rebuild the clustering input from the 10 leading columns plus the PCA
# components, as done in the first iteration. Without this step the cells
# below kept using the full embeddings and running this cell had no effect.
pca_embedding_df = pd.DataFrame(embedding_features_reduced, columns=[f'PCA_{i}' for i in range(1, 71)])
dataToAnalyze = pd.concat([dataframe_to_analyze_norm.iloc[:, :10], pca_embedding_df], axis=1).values

In [None]:
# k-distance plot used to pick eps for DBSCAN
neighbors = NearestNeighbors(n_neighbors=20)
neighbors_fit = neighbors.fit(dataToAnalyze)
distances, indices = neighbors_fit.kneighbors(dataToAnalyze)
distances = np.sort(distances, axis=0)
# Plot the distance to the farthest of the 20 requested neighbours (last
# column). Column 1, used before, is only the closest other point, which made
# the 20-neighbour query pointless and under-estimates a suitable eps.
distances = distances[:, -1]
plt.plot(distances)

In [None]:
eps = 6000  
min_samples = 50 #Valores a utilizar: 50 
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(dataToAnalyze)

#Ejecutar la línea de debajo solo en caso de que se haya guardado una columna llamada DBSCAN_labels_after
#filtered_df_by_labels_v3 = filtered_df_by_labels_v3.drop(['DBSCAN_labels_after_iter2'], axis = 1)
filtered_df_by_labels_v3.loc[:, 'DBSCAN_labels_after_iter2'] = labels

In [None]:
samples_per_label = 20

# Iterate over each unique value in the DBSCAN column
for label in filtered_df_by_labels_v3['DBSCAN_labels_after_iter2'].unique():
    # Filter DataFrame to include only samples with the current DBSCAN label
    filtered_df = filtered_df_by_labels_v3[filtered_df_by_labels_v3['DBSCAN_labels_after_iter2'] == label]
    
    # Display 20 text samples from the current DBSCAN label
    print(f"DBSCAN Label: {label}")
    print(filtered_df['content'].head(samples_per_label).values)
    print()

Prueba con OPTICS

In [None]:
from sklearn.cluster import OPTICS

optics_model = OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.05)

In [None]:
optics_model.fit(embedding_features_reduced)

Devuelve un único cluster

### Visualización con t-SNE

Aplicamos TSNE para una visualización de los contenidos publicados tras haber eliminado a los bots

In [None]:
labels = filtered_df_by_labels_v3['DBSCAN_labels_after_iter2']

Gráfico en dos dimensiones...

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=33)
X_tsne = tsne.fit_transform(dataToAnalyze)

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=33)
X_tsne = tsne.fit_transform(normalized_data)

In [None]:
# np.float was removed from NumPy; the builtin float is the same dtype.
colors = labels.astype(float)
plt.rcParams["figure.figsize"] = (12, 9)
# The figure is created before plotting; creating it afterwards left the
# scatter at the default size and opened a second, empty figure.
plt.figure(figsize=(18,15))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, cmap='viridis', s = 10, alpha = 0.6)
plt.colorbar()
plt.show()

In [None]:
# np.float was removed from NumPy; the builtin float is the same dtype.
# NOTE(review): `labels` must have one entry per row of the data used to
# compute X_tsne — verify this when X_tsne comes from `normalized_data`.
colors = labels.astype(float)
plt.rcParams["figure.figsize"] = (12, 9)
# The figure is created before plotting; creating it afterwards left the
# scatter at the default size and opened a second, empty figure.
plt.figure(figsize=(18,15))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, cmap='viridis', s = 10, alpha = 0.6)
plt.colorbar()
plt.show()

Gráfico en tres dimensiones

In [None]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3, random_state=33)
X_tsne = tsne.fit_transform(dataToAnalyze)

Gráfico con matplotlib

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# Assumes X_tsne holds the 3-D t-SNE coordinates and `labels` one cluster id per point

# Cluster ids as floats for the colour map (np.float was removed from NumPy;
# the builtin float is the same dtype)
colors = labels.astype(float)

# New figure with a 3-D axis
fig = plt.figure(figsize=(12, 9))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot
scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], X_tsne[:, 2], c=colors, cmap='viridis', s=10, alpha=0.6)

# Colorbar
plt.colorbar(scatter)

# Show the plot
plt.show()


Gráfico con plotly

In [None]:
import plotly.graph_objs as go
import plotly.io as pio

# Assuming X_tsne contains your 3D data points and labels contains the color information

# Create the scatter plot trace
trace = go.Scatter3d(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    z=X_tsne[:, 2],
    mode='markers',
    marker=dict(
        size=5,  # Adjust the size as needed
        color=labels,
        colorscale='Viridis',
        opacity=0.6
    )
)

# Create the layout
layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0),
)

# Create the figure
fig = go.Figure(data=[trace], layout=layout)

# Show the interactive plot
pio.show(fig)


## Exploración de los usuarios

El objetivo es agrupar a los usuarios en comunidades basándonos en sus características

### Preparación de los datos

Se debe crear primero un dataframe de usuarios a partir del dataframe procesado

In [None]:
import pandas as pd
import ast

df_temp = pd.read_csv('datasetConVaderV2.csv')

df_temp["hashtags"] = df_temp['hashtags'].apply(ast.literal_eval)
df_temp["mentions"] = df_temp['mentions'].apply(ast.literal_eval)
df_temp = df_temp.fillna('')


In [None]:
df_temp.head(5)

Creamos un dataframe de los usuarios con sus estadísticas numéricas

In [None]:
dfUsers = df_temp.groupby('username').agg(
                                          num_tweets=('num_tweets', 'first'),
                                          avg_likes=('like_count', 'mean'),
                                          avg_retweets=('retweet_count', 'mean'),
                                          avg_hashtags=('num_hashtags', 'mean'),
                                          avg_mentions=('num_mentions', 'mean'),
                                          avg_emojis=('number_emojis', 'mean'),
                                          avg_punctuation=('number_punctuation', 'mean'),
                                          avg_sentiment_vader=('sentimentVaderWithoutNeutral', 'mean'),
                                          avg_sentiment_vader_neutral=('sentimentVaderWithNeutral', 'mean'),
                                          avg_sentiment=('sentiment', 'mean')).reset_index()

Vemos cuantas veces ha sido mencionado cada usuario

In [None]:
df_aux = df_temp[['username', 'mentions']]

flattened_users = df_temp['mentions'].explode()

flattened_users
user_mentions_count = flattened_users.value_counts().reset_index()
user_mentions_count.columns = ['user', 'mention_count']
user_mentions_count

Añadimos esta información al dataframe con estadísticas de los usuarios

In [None]:
# Attach how many times each user was mentioned; users never mentioned get 0
dfUsers = dfUsers.merge(user_mentions_count, left_on='username', right_on='user', how='left')
dfUsers.drop(columns=['user'], inplace=True)
# Plain assignment instead of fillna(inplace=True) on a column selection,
# which pandas deprecates because it may act on a temporary copy.
dfUsers['mention_count'] = dfUsers['mention_count'].fillna(0)

In [None]:
dfUsers.head(3)

### Crear comunidades de usuarios

Aplicar OPTICS con estos datos

(no se han logrado resultados con DBSCAN debido a que hay demasiados usuarios, es decir, por coste computacional)

In [None]:
dfOPTICS = dfUsers.drop_duplicates().drop(['username'], axis = 1).head(100000)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

dfOPTICSNorm = pd.DataFrame(scaler.fit_transform(dfOPTICS), columns=dfOPTICS.columns)
dataToAnalyze = dfOPTICSNorm.values

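Modelo OPTICS con una primera configuración de parámetros (min_samples=20, xi=0.05, min_cluster_size=0.05; no son los valores por defecto de scikit-learn)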

In [None]:
from sklearn.cluster import OPTICS

# min_samples controls how many points must surround a point for it to be considered a core point
# xi controls the clusters that are formed: a small xi makes small clusters easier to create (higher sensitivity to density variations)
# min_cluster_size is the fraction of the whole dataset that is required to form a cluster
optics_model = OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.05)

In [None]:
# Fit the OPTICS model on the feature matrix (timings for two data sizes are noted below).
optics_model.fit(dataToAnalyze)

<p> 50000 datos ----> 1 min 34 secs </p>
<p> 100000 datos ----> 6 min 54 secs </p>

Modelo OPTICS con parámetros propios

In [None]:
# Same settings as before except min_cluster_size, lowered from 0.05 to 0.025,
# then fitted on the same data. This replaces the previous optics_model.
optics_model = OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.025)
optics_model.fit(dataToAnalyze)

In [None]:
import numpy as np

# Two-column table: cluster label in the first column, number of users in the second.
cluster_ids, cluster_sizes = np.unique(optics_model.labels_, return_counts=True)
np.column_stack((cluster_ids, cluster_sizes))

### Visualizar las comunidades de usuarios

In [None]:
from sklearn.manifold import TSNE

# Project the feature matrix onto 2 dimensions with t-SNE (random_state fixed at 33).
tsne = TSNE(n_components=2, random_state=33)
X_tsne = tsne.fit_transform(dataToAnalyze)

In [None]:
import matplotlib.pyplot as plt

# Configure and create the figure BEFORE plotting. Calling plt.figure() after
# plt.scatter() opens a second, empty figure and leaves the scatter on the
# default-size one.
plt.rcParams["figure.figsize"] = (18, 15)
plt.figure(figsize=(18, 15))

# Colour each projected point by its OPTICS cluster label.
colors = optics_model.labels_.astype(np.float64)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, cmap='viridis', s=10, alpha=0.6)
plt.colorbar()
plt.show()

Gráfico interactivo en 3D

In [None]:
# Project the feature matrix onto 3 dimensions with t-SNE (random_state fixed at 33).
# This overwrites the tsne and X_tsne variables of the 2D projection.
tsne = TSNE(n_components=3, random_state=33)
X_tsne = tsne.fit_transform(dataToAnalyze)

In [None]:
import plotly.graph_objs as go
import plotly.io as pio

# Interactive 3D scatter of the t-SNE projection; X_tsne must hold three columns.

# Marker appearance: each point is coloured by its OPTICS cluster label.
marker_style = dict(
    size=5,
    color=optics_model.labels_.astype(np.float64),
    colorscale='Viridis',
    opacity=0.6,
)

# One marker per user, positioned by the three t-SNE components.
trace = go.Scatter3d(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    z=X_tsne[:, 2],
    mode='markers',
    marker=marker_style,
)

# Fixed 1000x800 canvas without margins.
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0), width=1000, height=800)

# Assemble the figure and render it.
fig = go.Figure(data=[trace], layout=layout)
pio.show(fig)

### Análisis de las comunidades creadas

In [None]:
# Take the de-duplicated first 100000 user rows and attach the OPTICS label
# of each row as the column 'labelsOPTICS'.
dfUsersToAnalyze = (
    dfUsers
    .drop_duplicates()
    .head(100000)
    .assign(labelsOPTICS=optics_model.labels_)
)

In [None]:
# Mean of every metric within each OPTICS label ('username' is dropped first).
metrics_with_labels = dfUsersToAnalyze.drop(columns=['username'])
averages = metrics_with_labels.groupby('labelsOPTICS').mean()

# Show the resulting table under a heading.
print("Averages of each metric for each label:", averages, sep="\n")

In [None]:
# One colour per cluster label, sampled from the 'tab10' colormap.
cmap = plt.colormaps['tab10']
n_labels = len(averages.index)
colors = [cmap(position / n_labels) for position in range(n_labels)]

# One figure per metric, with one bar per cluster label.
for col in averages.columns:
    plt.figure(figsize=(8, 6))
    for color, (label, value) in zip(colors, averages[col].items()):
        if value != 0.0:
            plt.bar(label, value, color=color, label=label)
        else:
            # An average of exactly zero is drawn as a horizontal dash marker
            # instead of a bar.
            plt.plot(label, value, marker='_', color=color, markersize=40, linewidth=400)
    plt.title(f'Average {col} for Each Label')
    plt.xlabel('Label')
    plt.ylabel('Average Value')
    plt.xticks(rotation=45)
    plt.grid(axis='y')
    plt.legend()
    plt.show()


### Análisis de ciertos usuarios

Se van a analizar algunos de los usuarios que hay en la base de datos de Neo4J, según la métrica HITS

Así, se van a mirar las métricas de los usuarios con mayor authority score y los usuarios con mayor hub score

In [None]:
# Load the users with the highest authority score and the highest hub score.
# NOTE(review): presumably these CSVs were exported from the Neo4j HITS analysis — confirm.
dfAuthority = pd.read_csv('mostAuthorityUsersWithVader.csv')
dfHub = pd.read_csv('mostHubUsersWithVader.csv')

In [None]:
# Keep the analysed users whose name appears in the authority list.
isAuthorityUser = dfUsersToAnalyze['username'].isin(dfAuthority['name'])
dfAuthorityAnalysis = dfUsersToAnalyze[isAuthorityUser]

# Mean of every metric for those users (label and name columns excluded).
means = dfAuthorityAnalysis.drop(columns=['labelsOPTICS', 'username']).mean()
means

In [None]:
# Keep the analysed users whose name appears in the hub list.
isHubUser = dfUsersToAnalyze['username'].isin(dfHub['name'])
dfHubAnalysis = dfUsersToAnalyze[isHubUser]

# Mean of every metric for those users (label and name columns excluded).
means = dfHubAnalysis.drop(columns=['labelsOPTICS', 'username']).mean()
means

<hr>

## Análisis de los tweets

In [None]:
# Load the list of most relevant users; its 'name' column is used below to filter tweets.
df_temporal = pd.read_csv('mostRelevantUsersWithVader.csv')

In [None]:
# Display the user names that were loaded.
df_temporal['name']

In [None]:
import pandas as pd

# Keep only the tweets whose author is in the list of relevant users.
dfSelectedUsers = df[df['username'].isin(df_temporal['name'])]

(En caso de querer utilizar todo el dataframe, no solo el específico para algunos usuarios, utilizar este otro bloque)

In [None]:
import pandas as pd

# Alternative to the previous cell: use the whole dataset read from disk
# instead of the subset for the selected users.
dfSelectedUsers = pd.read_csv('datasetConVaderV2.csv')

### Nubes de palabras

Nube de palabras del contenido de todos los tweets de estos usuarios (De los 100000 primeros datos, los que más conexiones tienen)

In [None]:
from wordcloud import WordCloud

# Concatenate the text of every selected tweet into one string ...
all_tweet_text = ' '.join(dfSelectedUsers['text'].astype(str))
# ... and build an 800x400 word cloud on a white background from it.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_tweet_text)

In [None]:

import matplotlib.pyplot as plt

# Render the word cloud as an image, without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Transformaciones necesarias de la columna hashtags (debido a la lectura de csv, el array se transforma en string, y se debe revertir esto)

In [None]:
from ast import literal_eval

# Convert the stringified lists read from CSV back into Python lists.
# Only string values are parsed, so the cell can be re-run, or used on a frame
# whose hashtags are already lists, without literal_eval raising ValueError.
# assign() rebinds dfSelectedUsers to a new frame instead of writing into what
# may be a filtered slice of df.
dfSelectedUsers = dfSelectedUsers.assign(
    hashtags=dfSelectedUsers['hashtags'].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )
)

In [None]:
# Flatten the per-tweet hashtag lists into one list ...
all_hashtags = []
for tweet_hashtags in dfSelectedUsers['hashtags']:
    all_hashtags.extend(tweet_hashtags)

# ... and join every hashtag into a single space-separated string.
hashtags_string = ' '.join(all_hashtags)

Nube de palabras de los hashtags de todos los tweets de estos usuarios (De los 100000 primeros datos, los que más conexiones tienen)

In [None]:
# Build an 800x400 word cloud on a white background from the joined hashtags.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(hashtags_string)

In [None]:
# Render the hashtag word cloud as an image, without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Ahora se va a hacer lo mismo, pero filtrando además por el sentimiento del tweet, para ver qué diferencia de contenido hay entre tweets positivos y negativos

In [None]:
# Split the tweets by the binary 'sentiment' column: 1 is kept as positive, 0 as negative.
dfSelectedUsersPositive = dfSelectedUsers.loc[dfSelectedUsers['sentiment'].eq(1)]
dfSelectedUsersNegative = dfSelectedUsers.loc[dfSelectedUsers['sentiment'].eq(0)]

Lo mismo, pero en el caso de utilizar Vader con sentimiento neutro

In [None]:
# Split by 'sentimentVaderWithNeutral_overall': 1 positive, 0 neutral, -1 negative.
dfSelectedUsersPositive = dfSelectedUsers.loc[dfSelectedUsers['sentimentVaderWithNeutral_overall'].eq(1)]
dfSelectedUsersNeutral = dfSelectedUsers.loc[dfSelectedUsers['sentimentVaderWithNeutral_overall'].eq(0)]
dfSelectedUsersNegative = dfSelectedUsers.loc[dfSelectedUsers['sentimentVaderWithNeutral_overall'].eq(-1)]

(En caso de querer utilizar todo el dataframe, no solo el específico para algunos usuarios, utilizar este otro bloque)

In [None]:
# Split by 'sentimentVaderWithNeutral': 1 positive, 0 neutral, -1 negative.
dfSelectedUsersPositive = dfSelectedUsers.loc[dfSelectedUsers['sentimentVaderWithNeutral'].eq(1)]
dfSelectedUsersNeutral = dfSelectedUsers.loc[dfSelectedUsers['sentimentVaderWithNeutral'].eq(0)]
dfSelectedUsersNegative = dfSelectedUsers.loc[dfSelectedUsers['sentimentVaderWithNeutral'].eq(-1)]

Representar esas nubes de palabras en caso de solo sentimientos positivos y negativos

In [None]:
def _hashtag_text(frame):
    """Return every hashtag stored in frame['hashtags'] joined by single spaces.

    explode() gives each element of a list value its own row, so the word cloud
    receives the hashtags themselves instead of the printed form of the lists
    (brackets and quotes). Values that are not lists are passed through as they
    are, and the missing values produced by empty lists are dropped.
    """
    return ' '.join(frame['hashtags'].explode().dropna().astype(str))


# Word clouds of the tweet text, one per sentiment.
wordcloudPositiveTweets = WordCloud(width=800, height=400, background_color='white').generate(' '.join(dfSelectedUsersPositive['text'].astype(str)))
wordcloudNegativeTweets = WordCloud(width=800, height=400, background_color='white').generate(' '.join(dfSelectedUsersNegative['text'].astype(str)))

# Word clouds of the hashtags, one per sentiment.
wordcloudPositiveHashtags = WordCloud(width=800, height=400, background_color='white').generate(_hashtag_text(dfSelectedUsersPositive))
wordcloudNegativeHashtags = WordCloud(width=800, height=400, background_color='white').generate(_hashtag_text(dfSelectedUsersNegative))

In [None]:
# 2x2 grid of word clouds, filled row by row in the order listed below.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

wordclouds = [
    (wordcloudPositiveTweets, 'Positive Tweets'),
    (wordcloudNegativeTweets, 'Negative Tweets'),
    (wordcloudPositiveHashtags, 'Positive Hashtags'),
    (wordcloudNegativeHashtags, 'Negative Hashtags'),
]

# Draw each word cloud, with its title and no axes, in the matching grid cell.
for position, (wordcloud, title) in enumerate(wordclouds):
    ax = axes.flat[position]
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(title)
    ax.axis('off')

plt.tight_layout()
plt.show()

Lo mismo, pero un gráfico tras otro (esto se utiliza para tener las imágenes en grande para el TFM)

In [None]:
import matplotlib.pyplot as plt

# Word clouds to draw and the title of each one, in display order.
clouds = [
    wordcloudPositiveTweets,
    wordcloudNegativeTweets,
    wordcloudPositiveHashtags,
    wordcloudNegativeHashtags,
]
titles = ['Positive Tweets', 'Negative Tweets', 'Positive Hashtags', 'Negative Hashtags']
wordclouds = list(zip(clouds, titles))

# One separate 10x5 figure per word cloud, each shown as soon as it is drawn.
for wordcloud, title in wordclouds:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()


Lo mismo, pero en caso de utilizar sentimientos positivos, negativos y neutros

In [None]:
def _hashtag_text(frame):
    """Return every hashtag stored in frame['hashtags'] joined by single spaces.

    explode() gives each element of a list value its own row, so the word cloud
    receives the hashtags themselves instead of the printed form of the lists
    (brackets and quotes). Values that are not lists are passed through as they
    are, and the missing values produced by empty lists are dropped.
    """
    return ' '.join(frame['hashtags'].explode().dropna().astype(str))


# Word clouds of the tweet text, one per sentiment.
wordcloudPositiveTweets = WordCloud(width=800, height=400, background_color='white').generate(' '.join(dfSelectedUsersPositive['text'].astype(str)))
wordcloudNegativeTweets = WordCloud(width=800, height=400, background_color='white').generate(' '.join(dfSelectedUsersNegative['text'].astype(str)))
wordcloudNeutralTweets = WordCloud(width=800, height=400, background_color='white').generate(' '.join(dfSelectedUsersNeutral['text'].astype(str)))


# Word clouds of the hashtags, one per sentiment.
wordcloudPositiveHashtags = WordCloud(width=800, height=400, background_color='white').generate(_hashtag_text(dfSelectedUsersPositive))
wordcloudNegativeHashtags = WordCloud(width=800, height=400, background_color='white').generate(_hashtag_text(dfSelectedUsersNegative))
wordcloudNeutralHashtags = WordCloud(width=800, height=400, background_color='white').generate(_hashtag_text(dfSelectedUsersNeutral))

In [None]:

# 2x3 grid of word clouds, filled row by row in the order listed below.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 10))

wordclouds = [
    (wordcloudPositiveTweets, 'Positive Tweets'),
    (wordcloudNeutralTweets, 'Neutral Tweets'),
    (wordcloudNegativeTweets, 'Negative Tweets'),
    (wordcloudPositiveHashtags, 'Positive Hashtags'),
    (wordcloudNeutralHashtags, 'Neutral Hashtags'),
    (wordcloudNegativeHashtags, 'Negative Hashtags'),
]

# Draw each word cloud, with its title and no axes, in the matching grid cell.
for position, (wordcloud, title) in enumerate(wordclouds):
    ax = axes.flat[position]
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(title)
    ax.axis('off')

plt.tight_layout()
plt.show()

Lo mismo, pero un gráfico tras otro (esto se utiliza para tener las imágenes en grande para el TFM)

In [None]:
import matplotlib.pyplot as plt

# Word clouds to draw and the title of each one, in display order.
clouds = [
    wordcloudPositiveTweets,
    wordcloudNeutralTweets,
    wordcloudNegativeTweets,
    wordcloudPositiveHashtags,
    wordcloudNeutralHashtags,
    wordcloudNegativeHashtags,
]
titles = [
    'Positive Tweets',
    'Neutral Tweets',
    'Negative Tweets',
    'Positive Hashtags',
    'Neutral Hashtags',
    'Negative Hashtags',
]
wordclouds = list(zip(clouds, titles))

# One separate 10x5 figure per word cloud, each shown as soon as it is drawn.
for wordcloud, title in wordclouds:
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()


### Sentimientos de los tweets a lo largo del tiempo

In [None]:
import pandas as pd

# Reload the dataset from disk; this replaces the df used earlier in the notebook.
df = pd.read_csv('datasetConVaderV2.csv')

In [None]:
# Parse the 'date' column into datetimes.
df['date'] = pd.to_datetime(df['date'])
# Keep a text version of the date in year-month-day form.
df['date_formatted'] = df['date'].dt.strftime('%Y-%m-%d')

In [None]:
# Count tweets per calendar day and sentiment value.
# Grouping by the raw 'date' column creates one group per distinct timestamp,
# so the time of day is removed first. to_datetime() is a no-op when the
# column has already been parsed.
tweet_day = pd.to_datetime(df['date']).dt.normalize()
daily_counts = df.groupby([tweet_day, 'sentiment']).size().unstack(fill_value=0)

# One line per sentiment value: 1 is plotted as positive, 0 as negative.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(daily_counts.index, daily_counts[1], label='Positive Sentiment', color='green')
ax.plot(daily_counts.index, daily_counts[0], label='Negative Sentiment', color='red')

# Axis labels, title and legend.
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.set_title('Count of Positive and Negative Tweets Over Time')
ax.legend()

# Tilt the date labels.
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

1-02-2023 ----> 100 millones de usuarios
https://www.reuters.com/technology/chatgpt-sets-record-fastest-growing-user-base-analyst-note-2023-02-01/

https://www.elmundo.es/tecnologia/2023/02/06/63e16e55fc6c83815e8b45bb.html
https://www.semana.com/tecnologia/articulo/chatgpt-no-aguanto-al-voltaje-y-experimenta-nueva-caida-a-nivel-mundial/202325/
https://www.reuters.com/technology/chatgpts-popularity-explodes-us-lawmakers-take-an-interest-2023-02-13/


15-02-2023 ----> Elon Musk negative claims of chatgpt
https://www.cnbc.com/2023/02/15/elon-musk-co-founder-of-chatgpt-creator-openai-warns-of-ai-society-risk.html

Segunda semana de Marzo ----> lanzamiento de GPT-4

Finales de Marzo

https://efe.com/ciencia-y-tecnologia/2023-03-30/denuncian-en-estados-unidos-el-chatgpt-de-openai-y-piden-que-sea-suspendido-tras-recientes-reservas-sobre-ia/