In [None]:
import snscrape as sntwitter

In [5]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np
import re
import yfinance as yf
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# --- CONFIGURATION ---
CRYPTO_TICKER = 'BTC-USD'  # Yahoo Finance symbol of the target coin
SEARCH_TERM = 'bitcoin OR #BTC OR $BTC'  # Twitter search expression to scrape
START_DATE = '2024-01-01'  # Window start, shared by the tweet query and the price download
END_DATE = '2024-06-01'  # Window end, shared by the tweet query and the price download
MAX_TWEETS = 10000  # Max tweets to collect

# --- PHASE 1: Data Collection & Cleaning ---
# Imported here so this cell is self-contained; snscrape is already a dependency.
from snscrape.base import ScraperException

print(f"1. Starting tweet collection for {CRYPTO_TICKER}...")
tweets_data = []
query = f"{SEARCH_TERM} since:{START_DATE} until:{END_DATE}"

# Scrape Tweets
try:
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if i >= MAX_TWEETS:
            break
        # Newer snscrape releases expose the text as `rawContent` and deprecate
        # `content`; fall back to `content` for older releases.
        text = getattr(tweet, 'rawContent', None)
        if text is None:
            text = tweet.content
        # `tweet.date` is a standard-library datetime, which has no pandas-only
        # `tz_convert` method. Normalise through pandas to UTC and keep only the
        # calendar date so it can be joined with the daily price data.
        tweets_data.append([
            pd.to_datetime(tweet.date, utc=True).date(),
            text,
        ])
except ScraperException as exc:
    if not tweets_data:
        # Nothing was collected: there is no data to continue with, so surface
        # the original error after a short, readable hint.
        print("   -> Tweet scraping failed before any tweet was retrieved "
              "(Twitter/X may be blocking unauthenticated search).")
        raise
    # Keep what was collected so far instead of discarding a partial scrape.
    print(f"   -> WARNING: scraping stopped early ({type(exc).__name__}); "
          f"continuing with {len(tweets_data)} tweets.")

df_tweets = pd.DataFrame(tweets_data, columns=['Date', 'Tweet'])
print(f"   -> Scraped {len(df_tweets)} raw tweets.")

# Cleaning Function
# Patterns are compiled once (raw strings, so `\s`/`\S`/`\b` are regex escapes
# rather than invalid Python string escapes).
_MENTION_RE = re.compile(r'@[A-Za-z0-9_]+')
# `\b` keeps the retweet marker from matching inside words such as "START now".
_RETWEET_RE = re.compile(r'\bRT\s+')
_URL_RE = re.compile(r'https?://\S+')


def clean_tweet(text):
    """Return a lower-cased copy of *text* with Twitter noise removed.

    Removes, in order: ``@mentions``, the stand-alone ``RT`` retweet marker
    (with its trailing whitespace), ``http(s)://`` URLs and ``#`` characters
    (the hashtag word itself is kept). Newlines are replaced by single spaces.
    Surrounding and repeated spaces left behind by the removals are kept.

    Args:
        text: Raw tweet text. Non-string values (e.g. ``None`` or NaN) are
            treated as an empty tweet.

    Returns:
        The cleaned, lower-cased string.
    """
    if not isinstance(text, str):
        return ''
    text = _MENTION_RE.sub('', text)   # Mentions
    text = _RETWEET_RE.sub('', text)   # Retweet marker
    text = _URL_RE.sub('', text)       # URLs
    text = text.replace('#', '')       # Hashtag symbol only
    text = text.replace('\n', ' ')     # Newlines
    return text.lower()

df_tweets['Clean_Tweet'] = df_tweets['Tweet'].apply(clean_tweet)
print("   -> Tweets cleaned successfully.")

# --- PHASE 2: Sentiment Analysis (VADER) & Aggregation ---
print("\n2. Running VADER sentiment analysis...")
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    # The VADER lexicon is a separate NLTK resource; fetch it on first use.
    import nltk
    nltk.download('vader_lexicon', quiet=True)
    sia = SentimentIntensityAnalyzer()

# Compound score per tweet, taken from VADER's `polarity_scores` output.
df_tweets['Compound'] = df_tweets['Clean_Tweet'].apply(
    lambda tweet_text: sia.polarity_scores(tweet_text)['compound']
)
# Label each tweet: >= 0.05 is Positive, <= -0.05 is Negative, otherwise Neutral.
df_tweets['Sentiment'] = np.select(
    [df_tweets['Compound'] >= 0.05, df_tweets['Compound'] <= -0.05],
    ['Positive', 'Negative'],
    default='Neutral',
)

# Aggregate to Daily Average Score
df_sentiment_avg = df_tweets.groupby('Date')['Compound'].mean().reset_index()
df_sentiment_avg.columns = ['Date', 'Avg_Sentiment']

# Aggregate Positive/Negative Counts for Visualization
# `unstack` only creates columns for labels that actually occur, so reindex to
# guarantee all three columns exist (filled with 0) for the later plot and model.
df_sentiment_counts = (
    df_tweets.groupby(['Date', 'Sentiment']).size()
    .unstack(fill_value=0)
    .reindex(columns=['Negative', 'Neutral', 'Positive'], fill_value=0)
    .reset_index()
)

print("   -> Daily average sentiment calculated.")

# --- PHASE 3: Financial Data and Merging (Yahoo Finance) ---
print("\n3. Fetching crypto price data (yfinance)...")
df_price = yf.download(CRYPTO_TICKER, start=START_DATE, end=END_DATE)
if df_price.empty:
    raise RuntimeError(
        f"No price data returned for {CRYPTO_TICKER} between {START_DATE} and {END_DATE}."
    )
# Some yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker; flatten to plain field names so 'Date'/'Close' lookups and the
# merges below work on either layout.
if isinstance(df_price.columns, pd.MultiIndex):
    df_price.columns = df_price.columns.get_level_values(0)
df_price = df_price.reset_index()
df_price['Date'] = df_price['Date'].dt.date # Convert datetime index to match tweet date

# Create Prediction Target (Signal: 1 for Buy/Up, 0 for Sell/Down)
# The target is built on the full price series BEFORE merging: the inner merge
# drops days without tweets, so shifting afterwards would compare against the
# next day that happened to have tweets rather than the next trading day.
df_price = df_price.sort_values('Date').reset_index(drop=True)
df_price['Next_Day_Close'] = df_price['Close'].shift(-1)
df_price['Signal'] = np.where(df_price['Next_Day_Close'] > df_price['Close'], 1, 0)
df_price = df_price.dropna(subset=['Next_Day_Close']) # Drop last row (NaN for Next_Day_Close)

# Merge price, average sentiment, and daily counts
df_final = pd.merge(df_price, df_sentiment_avg, on='Date', how='inner')
df_final = pd.merge(df_final, df_sentiment_counts, on='Date', how='left')
# Only the tweet-count columns may legitimately default to 0; price fields must
# not be silently zero-filled.
count_columns = [col for col in df_sentiment_counts.columns if col != 'Date']
df_final[count_columns] = df_final[count_columns].fillna(0)
df_final = df_final.dropna().reset_index(drop=True)

print(f"   -> Final merged dataset created: {len(df_final)} rows.")

# --- PHASE 4: Visualization & Conclusion ---
print("\n4. Generating visualizations and prediction conclusion.")

# One figure, two side-by-side panels, addressed through explicit Axes objects.
fig, (ax_price, ax_counts) = plt.subplots(1, 2, figsize=(14, 6))

# Chart 1: Price vs. Sentiment Over Time
# Automatic legends are disabled on both twin axes and replaced by one combined
# legend below; otherwise each axis draws its own legend and the two overlap.
sns.lineplot(x='Date', y='Close', data=df_final, ax=ax_price, legend=False,
             label=f'{CRYPTO_TICKER} Close Price')
ax_sentiment = ax_price.twinx()
sns.lineplot(x='Date', y='Avg_Sentiment', data=df_final, ax=ax_sentiment,
             color='orange', legend=False, label='Avg Sentiment')
ax_sentiment.set_ylabel("Average Sentiment Score")
ax_price.set_title(f'{CRYPTO_TICKER} Price vs. Daily Sentiment')
price_handles, price_labels = ax_price.get_legend_handles_labels()
sent_handles, sent_labels = ax_sentiment.get_legend_handles_labels()
ax_price.legend(price_handles + sent_handles, price_labels + sent_labels, loc='upper left')

# Chart 2: Positive vs. Negative Tweets
# reindex guarantees both columns exist even if a label never occurred.
daily_counts = df_final.set_index('Date').reindex(columns=['Positive', 'Negative'], fill_value=0)
daily_counts.plot(kind='bar', ax=ax_counts)
ax_counts.set_title('Daily Positive vs. Negative Tweet Counts')
ax_counts.set_ylabel("Tweet Count")
# A bar chart labels every bar; keep roughly 15 evenly spaced date labels so the
# axis stays readable over a multi-month window.
n_days = len(daily_counts)
label_step = max(1, n_days // 15)
ax_counts.set_xticks(list(range(0, n_days, label_step)))
ax_counts.set_xticklabels(
    [str(day) for day in daily_counts.index[::label_step]],
    rotation=45,
    ha='right',
)
fig.tight_layout()
plt.show()


# --- PHASE 5: Regression Buy/Sell Signal (Sklearn) ---
# Imported here so this cell is self-contained; scikit-learn is already a dependency.
from sklearn.preprocessing import StandardScaler

print("\n5. Training Buy/Sell Prediction Model (Logistic Regression)...")

# Define Features (X) and Target (y)
features = ['Close', 'Volume', 'Avg_Sentiment', 'Positive', 'Negative']
X = df_final[features]
y = df_final['Signal']

# Split Data (Time series split: train on earlier data, test on later data)
# Rows are kept in chronological order (no shuffling); `.iloc` makes the split
# explicitly positional regardless of the frame's index labels.
train_size = int(0.8 * len(df_final))
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

if len(X_test) == 0 or y_train.nunique() < 2:
    raise ValueError(
        f"Not enough data to train/evaluate: {len(X_train)} training rows "
        f"({y_train.nunique()} distinct classes), {len(X_test)} test rows."
    )

# Standardise features: price, volume and sentiment live on very different
# scales. The scaler is fitted on the training rows only so no information
# from the test period leaks into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression Model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Predict and Evaluate
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
# Fix the label order so the matrix is always 2x2, even if the test window
# contains only one class.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

print(f"\n--- Model Results for Predicting Next Day's Direction ---")
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Confusion Matrix:\n{conf_matrix}")
print("\nConclusion: Sentiment is often a weak direct predictor, but its correlation with volume and price action can still yield interesting signals.")

1. Starting tweet collection for BTC-USD...


Error retrieving https://twitter.com/i/api/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline?variables=%7B%22rawQuery%22%3A%22bitcoin%20OR%20%23BTC%20OR%20%24BTC%20since%3A2024-01-01%20until%3A2024-06-01%22%2C%22count%22%3A20%2C%22product%22%3A%22Latest%22%2C%22withDownvotePerspective%22%3Afalse%2C%22withReactionsMetadata%22%3Afalse%2C%22withReactionsPerspective%22%3Afalse%7D&features=%7B%22rweb_lists_timeline_redesign_enabled%22%3Afalse%2C%22blue_business_profile_image_shape_enabled%22%3Afalse%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22tweetypie_unmention_optimization_enabled%22%3Atrue%2C%22vibe_api_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is

ScraperException: 4 requests to https://twitter.com/i/api/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline?variables=%7B%22rawQuery%22%3A%22bitcoin%20OR%20%23BTC%20OR%20%24BTC%20since%3A2024-01-01%20until%3A2024-06-01%22%2C%22count%22%3A20%2C%22product%22%3A%22Latest%22%2C%22withDownvotePerspective%22%3Afalse%2C%22withReactionsMetadata%22%3Afalse%2C%22withReactionsPerspective%22%3Afalse%7D&features=%7B%22rweb_lists_timeline_redesign_enabled%22%3Afalse%2C%22blue_business_profile_image_shape_enabled%22%3Afalse%2C%22responsive_web_graphql_exclude_directive_enabled%22%3Atrue%2C%22verified_phone_label_enabled%22%3Afalse%2C%22creator_subscriptions_tweet_preview_api_enabled%22%3Afalse%2C%22responsive_web_graphql_timeline_navigation_enabled%22%3Atrue%2C%22responsive_web_graphql_skip_user_profile_image_extensions_enabled%22%3Afalse%2C%22tweetypie_unmention_optimization_enabled%22%3Atrue%2C%22vibe_api_enabled%22%3Atrue%2C%22responsive_web_edit_tweet_api_enabled%22%3Atrue%2C%22graphql_is_translatable_rweb_tweet_is_translatable_enabled%22%3Atrue%2C%22view_counts_everywhere_api_enabled%22%3Atrue%2C%22longform_notetweets_consumption_enabled%22%3Atrue%2C%22tweet_awards_web_tipping_enabled%22%3Afalse%2C%22freedom_of_speech_not_reach_fetch_enabled%22%3Afalse%2C%22standardized_nudges_misinfo%22%3Atrue%2C%22tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled%22%3Afalse%2C%22interactive_text_enabled%22%3Atrue%2C%22responsive_web_text_conversations_enabled%22%3Afalse%2C%22longform_notetweets_rich_text_read_enabled%22%3Afalse%2C%22longform_notetweets_inline_media_enabled%22%3Afalse%2C%22responsive_web_enhance_cards_enabled%22%3Afalse%2C%22responsive_web_twitter_blue_verified_badge_is_enabled%22%3Atrue%7D failed, giving up.