In [None]:
# One-time setup: uncomment to install snscrape from its GitHub repository.
# %pip install git+https://github.com/JustAnotherArchivist/snscrape.git

In [15]:
import pandas as pd
import re
from datetime import datetime, timedelta
import snscrape.modules.twitter as sntwitter

#### Limpieza

In [4]:
# Cleaning helpers
def reduce_repeated_letters(text):
    """Shorten every run of three or more identical characters to exactly two.

    Runs of one or two characters are left alone. Newlines are never
    collapsed because ``.`` does not match them without the DOTALL flag.
    """
    def _as_pair(match):
        # group(1) is the single repeated character captured by the pattern
        return match.group(1) * 2

    return re.sub(r'(.)\1{2,}', _as_pair, text)
def clean_text(text):
    """Normalise raw post text into lowercase words separated by single spaces.

    Steps, in order: lowercase; drop URLs; drop @mentions and the '#' symbol
    (the hashtag word itself is kept); drop punctuation; drop digits; shorten
    repeated characters via ``reduce_repeated_letters``; collapse whitespace.
    """
    # Patterns whose matches are deleted outright, applied in this order
    removals = (
        r"http\S+|www\S+|https\S+",  # URLs
        r'\@\w+|\#',                 # mentions and the hashtag symbol
        r'[^\w\s]',                  # punctuation
        r'\d+',                      # numbers
    )

    cleaned = text.lower()
    for pattern in removals:
        cleaned = re.sub(pattern, '', cleaned)

    cleaned = reduce_repeated_letters(cleaned)
    # Collapse any whitespace run to one space and trim both ends
    return re.sub(r'\s+', ' ', cleaned).strip()

In [5]:
# Generic filler words that carry no information when they are the whole text
irrelevant = [
    'pregunta',
    'ayuda',
    'duda',
    'consulta',
    'urgente',
    'sugerencia',
]

def is_irrelevant(texto):
    """Tell whether ``texto`` is too empty or too generic to keep.

    Returns True when the value is falsy, when its string form has fewer
    than 5 characters after trimming, or when (lowercased and trimmed) it is
    exactly one of the ``irrelevant`` words, optionally followed by a single
    period. Returns False otherwise.
    """
    if not texto:
        return True
    if len(str(texto).strip()) < 5:
        return True

    normalizado = str(texto).lower().strip()
    if normalizado in irrelevant:
        return True
    # Accept the same filler word written with one trailing period
    return normalizado.endswith('.') and normalizado[:-1] in irrelevant

# Build one cleaned comment from the relevant parts of title/body
def comment(row):
    """Combine a post's title and body into a single cleaned comment string.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that may contain
            'title' and/or 'body'; a missing key is treated as ''.

    Returns:
        ``clean_text`` applied to the relevant parts joined by one space
        (title first, then body). A part is dropped when ``is_irrelevant``
        flags it; if both are dropped the result is ``clean_text('')``.
    """
    title = row['title'] if 'title' in row else ''
    body = row['body'] if 'body' in row else ''

    # is_irrelevant() judges values through str(), so a non-string value
    # (e.g. a number) can be classed as relevant; coerce with str() here too
    # so .strip() and clean_text() never receive a non-string.
    kept = [str(value).strip() for value in (title, body) if not is_irrelevant(value)]

    return clean_text(' '.join(kept))


#### Reddit

Fuente de datos (Kaggle): https://www.kaggle.com/datasets/thedevastator/uncovering-user-perspectives-on-bitcoin-through?resource=download

In [7]:
# Load the Kaggle export; 'Bitcoin.csv' is read relative to the working directory
reddit_df = pd.read_csv('Bitcoin.csv')

In [8]:
# Build the cleaned comment for every row, tag the source, and align the
# column names with the shared schema (datetime / likes / amountComments).
# NOTE: reddit_df is overwritten below and loses 'title'/'body'; reload the
# CSV before re-running this cell, otherwise comment() sees neither column
# and every comment comes back empty.
reddit_df = (
    reddit_df
    .assign(comment=reddit_df.apply(comment, axis=1), source='Reddit')
    .rename(columns={
        'timestamp': 'datetime',
        'score': 'likes',
        'comms_num': 'amountComments',
    })
    # Unparseable timestamps become NaT instead of raising
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime'], errors='coerce'))
)

# Keep only the columns of the shared schema, in this order
reddit_df = reddit_df[['datetime', 'source', 'comment', 'likes', 'amountComments']]

In [9]:
# Preview the first rows of the processed frame
reddit_df.head()

Unnamed: 0,datetime,source,comment,likes,amountComments
0,2022-12-19 06:43:02,Reddit,can someone help explain to me how and why a l...,1,0
1,2022-12-19 06:19:25,Reddit,bitcoin is king once you understand bitcoin an...,2,4
2,2022-12-19 05:59:53,Reddit,the ftx disaster has set back crypto by years ...,0,9
3,2022-12-19 05:53:33,Reddit,will the loss of more bitcoins over time resul...,5,6
4,2022-12-19 05:45:47,Reddit,experts say sam bankmanfrieds best legal defen...,21,8


In [10]:
# Save the cleaned frame to 'reddit_df.csv' without the row index
reddit_df.to_csv('reddit_df.csv', index=False)


In [14]:
# Report the first and last calendar day covered by the Reddit posts
print("Periodo de análisis:\n")
inicio, fin = reddit_df['datetime'].min().date(), reddit_df['datetime'].max().date()
print(f"Desde {inicio} hasta {fin}")

# Posts per day: add a calendar-date column, then count rows for each date
reddit_df['date'] = reddit_df['datetime'].dt.date
resumen = (
    reddit_df
    .groupby('date')
    .size()
    .rename('post_count')
    .reset_index()
)
print("Resumen de publicaciones por día en Reddit:")
print(resumen)


Periodo de análisis:

Desde 2022-12-08 hasta 2022-12-19
Resumen de publicaciones por día en Reddit:
          date  post_count
0   2022-12-08          38
1   2022-12-09          72
2   2022-12-10          71
3   2022-12-11          96
4   2022-12-12         116
5   2022-12-13         119
6   2022-12-14         123
7   2022-12-15          73
8   2022-12-16          97
9   2022-12-17          94
10  2022-12-18         201
11  2022-12-19         797


#### X

In [27]:
# Search with snscrape
def scrape_bitcoin_range(start_date, end_date, max_tweets_per_day=50,
                         keyword='Bitcoin', output_path='x_df.csv'):
    """Scrape tweets matching ``keyword`` one day at a time and save them as CSV.

    Args:
        start_date: First day to scrape, as a 'YYYY-MM-DD' string (inclusive).
        end_date: Day at which to stop, as a 'YYYY-MM-DD' string (exclusive).
        max_tweets_per_day: Maximum tweets kept per day; ``None`` keeps every
            tweet the scraper yields, values below zero are treated as zero.
        keyword: Search term placed at the start of each daily query.
        output_path: Path of the CSV file the result is written to.

    Returns:
        DataFrame with the columns datetime, source, comment, likes and
        amountComments. The columns are present even when nothing was
        scraped, so the frame and the CSV always have the same schema.

    A day whose scrape raises is reported on stdout and skipped; tweets
    already collected for that day are kept.
    """
    from itertools import islice

    columns = ['datetime', 'source', 'comment', 'likes', 'amountComments']
    current = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")
    limit = None if max_tweets_per_day is None else max(max_tweets_per_day, 0)
    all_tweets = []

    while current < end:
        next_day = current + timedelta(days=1)
        query = f'{keyword} since:{current.date()} until:{next_day.date()}'
        print(f"Scraping {query}")
        try:
            items = sntwitter.TwitterSearchScraper(query).get_items()
            # islice stops after `limit` items without pulling an extra one
            # from the scraper's generator.
            for tweet in islice(items, limit):
                all_tweets.append({
                    'datetime': tweet.date,
                    'source': 'X',
                    'comment': clean_text(tweet.content),
                    'likes': tweet.likeCount,
                    # NOTE(review): retweets are stored under 'amountComments';
                    # the reply count may be the closer match to Reddit's
                    # comment count — confirm before comparing sources.
                    'amountComments': tweet.retweetCount
                })
        except Exception as e:
            print(f"Error scraping {current.date()}: {e}")
        current = next_day

    # Passing the columns explicitly keeps the schema when all_tweets is empty
    x_df = pd.DataFrame(all_tweets, columns=columns)
    x_df.to_csv(output_path, index=False)
    return x_df


In [28]:
# Scrape 2022-12-08 through 2022-12-19; the end date passed here is exclusive
scrape_bitcoin_range('2022-12-08', '2022-12-20')

Scraping Bitcoin since:2022-12-08 until:2022-12-09


Error retrieving https://twitter.com/i/api/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline?variables=%7B%22rawQuery%22%3A%22Bitcoin%20since%3A2022-12-08%20until%3A2022-12-09%22%2C%22count%22%3A20%2C%22product%22%3A%22Latest%22%2C%22withDownvotePerspective%22%3Afalse%2C%22withReactionsMetadata%22%3Afalse%2C%22withReactionsPerspective%22%3Afalse%7D&features=%7B%22rweb_lists_timeline_redesign_enabled%22%3Afalse%2C%22blue_business_profile_image_shape_enabled%22%3Afalse%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22tweetypie_unmention_optimization_enabled%22%3Atrue%2C%22vibe_api_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3At

Error scraping 2022-12-08: 4 requests to https://twitter.com/i/api/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline?variables=%7B%22rawQuery%22%3A%22Bitcoin%20since%3A2022-12-08%20until%3A2022-12-09%22%2C%22count%22%3A20%2C%22product%22%3A%22Latest%22%2C%22withDownvotePerspective%22%3Afalse%2C%22withReactionsMetadata%22%3Afalse%2C%22withReactionsPerspective%22%3Afalse%7D&features=%7B%22rweb_lists_timeline_redesign_enabled%22%3Afalse%2C%22blue_business_profile_image_shape_enabled%22%3Afalse%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22tweetypie_unmention_optimization_enabled%22%3Atrue%2C%22vibe_api_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_tra

KeyboardInterrupt: 