In [None]:
!pip install praw tweepy textblob vaderSentiment requests pandas scikit-learn



In [None]:
!pip install asyncpraw

Collecting asyncpraw
  Downloading asyncpraw-7.8.1-py3-none-any.whl.metadata (9.0 kB)
Collecting aiosqlite<=0.17.0 (from asyncpraw)
  Downloading aiosqlite-0.17.0-py3-none-any.whl.metadata (4.1 kB)
Collecting asyncprawcore<3,>=2.4 (from asyncpraw)
  Downloading asyncprawcore-2.4.0-py3-none-any.whl.metadata (5.5 kB)
Downloading asyncpraw-7.8.1-py3-none-any.whl (196 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading aiosqlite-0.17.0-py3-none-any.whl (15 kB)
Downloading asyncprawcore-2.4.0-py3-none-any.whl (19 kB)
Installing collected packages: aiosqlite, asyncprawcore, asyncpraw
Successfully installed aiosqlite-0.17.0 asyncpraw-7.8.1 asyncprawcore-2.4.0


In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import asyncpraw, tweepy, requests, datetime as dt

In [None]:
import pandas as pd
import requests
import time # Import the time module

def get_historical_prices_multi(coins, days=90, timeout=30, max_retries=3, pause=1.0):
    """
    Download USD price history for several coins from the CoinGecko
    `market_chart` endpoint and collapse it to one row per coin per day.

    Parameters
    ----------
    coins : list of str
        CoinGecko coin ids such as ['bitcoin', 'ethereum'].
    days : int, default 90
        Sent to the API as the `days` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up on that attempt.
    max_retries : int, default 3
        Extra attempts made for a coin when the API answers HTTP 429.
    pause : float, default 1.0
        Seconds slept after every coin; also the base of the exponential
        back-off used between 429 retries.

    Returns
    -------
    DataFrame with columns: date, price, pct_change, coin.
    `price` is the mean of all samples on that date and `pct_change` is the
    day-over-day change in percent (NaN on each coin's first row).
    Coins whose download fails are reported and skipped; if nothing could be
    downloaded an empty frame with the same columns is returned.
    """
    columns = ["date", "price", "pct_change", "coin"]
    all_data = []

    for coin_id in coins:
        url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart"
        params = {"vs_currency": "usd", "days": days}
        try:
            # Retry only on 429 (rate limit); any other status is handled below.
            for attempt in range(max(0, max_retries) + 1):
                response = requests.get(url, params=params, timeout=timeout)
                if response.status_code != 429 or attempt == max_retries:
                    break
                # Prefer the server's hint when it is a plain number of seconds.
                retry_after = str(response.headers.get("Retry-After", ""))
                wait = float(retry_after) if retry_after.isdigit() else pause * 2 ** (attempt + 1)
                print(f"Rate limited for {coin_id}; retrying in {wait:g}s...")
                time.sleep(wait)

            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            data = response.json()

            if "prices" in data:
                df = pd.DataFrame(data["prices"], columns=["timestamp", "price"])
                df["date"] = pd.to_datetime(df["timestamp"], unit="ms").dt.date
                # Several samples can fall on the same day; keep their mean.
                df = df.groupby("date")["price"].mean().reset_index()
                df["pct_change"] = df["price"].pct_change() * 100
                df["coin"] = coin_id
                all_data.append(df)
            else:
                print(f"Warning: 'prices' not found in data for {coin_id}. Skipping.")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {coin_id}: {e}")
        except ValueError:
            print(f"Error decoding JSON for {coin_id}. Skipping.")
        finally:
            time.sleep(pause)  # Be polite to the API between coins

    # pd.concat raises on an empty list, so handle "nothing downloaded" explicitly.
    if not all_data:
        return pd.DataFrame(columns=columns)

    final_df = pd.concat(all_data, ignore_index=True)
    return final_df

In [None]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import asyncio
import asyncpraw
import nest_asyncio

# Let async code run from notebook cells (Colab already has an event loop running).
nest_asyncio.apply()

# Crypto slang and the valence each term adds on top of VADER's stock lexicon.
crypto_lexicon = dict(moon=3.0, hodl=2.5, pump=2.0, dump=-2.0, rekt=-3.0)

# Shared VADER analyzer, extended with the slang above.
analyzer = SentimentIntensityAnalyzer()
analyzer.lexicon.update(crypto_lexicon)

def analyze_sentiment(text):
    """
    Score a piece of text by averaging TextBlob polarity and VADER compound.

    Parameters
    ----------
    text : str
        Text to score. Non-string values (None, NaN from a pandas column)
        and blank strings are treated as neutral.

    Returns
    -------
    float
        Mean of TextBlob's polarity and the VADER `compound` score computed
        on the lower-cased text; 0.0 for missing or blank input.
    """
    # Missing values would otherwise crash on `.lower()`; treat them as neutral.
    if not isinstance(text, str) or not text.strip():
        return 0.0

    # NOTE(review): VADER is documented to use ALL-CAPS as an emphasis cue;
    # lower-casing here discards that signal. Kept to preserve existing scores.
    lowered = text.lower()
    tb_polarity = TextBlob(lowered).sentiment.polarity
    vader_compound = analyzer.polarity_scores(lowered)["compound"]
    return (tb_polarity + vader_compound) / 2


In [None]:
# Initialize Reddit.
# Credentials are read from the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET /
# REDDIT_USER_AGENT environment variables so real secrets never have to be
# typed into (and saved with) the notebook. The placeholders are only a
# fallback and will not authenticate.
import os

reddit = asyncpraw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "YOUR_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "YOUR_CLIENT_SECRET"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "YOUR_USER_AGENT"),
)

async def fetch_reddit_posts(keyword, limit=500):
    """
    Search r/cryptocurrency for `keyword`, newest first.

    Parameters
    ----------
    keyword : str
        Search query passed to the subreddit search.
    limit : int, default 500
        Maximum number of submissions requested from the search.

    Returns
    -------
    list of dict
        One dict per submission with keys "title", "text" (the selftext)
        and "date" (calendar date derived from `created_utc` seconds).
    """
    crypto_sub = await reddit.subreddit("cryptocurrency")
    matches = crypto_sub.search(keyword, sort="new", limit=limit)
    return [
        {
            "title": post.title,
            "text": post.selftext,
            "date": pd.to_datetime(post.created_utc, unit='s').date(),
        }
        async for post in matches
    ]



In [None]:
import requests

# Get a list of coins ranked by market cap
url = "https://api.coingecko.com/api/v3/coins/markets"
params = {
    "vs_currency": "usd",     # show prices in USD
    "order": "market_cap_desc",
    "per_page": 10,           # number of coins to fetch
    "page": 1,
    "sparkline": "false"
}

# Fail fast on a hung connection or an HTTP error (e.g. 429 rate limiting);
# otherwise an error payload would only surface as a confusing TypeError below.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()

# Extract the CoinGecko ids (these are what the market_chart endpoint expects)
major_cryptos = [coin["id"] for coin in data]
print(major_cryptos)

['bitcoin', 'ethereum', 'tether', 'ripple', 'binancecoin', 'solana', 'usd-coin', 'staked-ether', 'dogecoin', 'tron']


In [None]:
async def get_daily_sentiment(coin, limit=500):
    """
    Fetch Reddit posts about `coin` and average their sentiment per day.

    Parameters
    ----------
    coin : str
        Search keyword; also written to the `coin` column of the result.
    limit : int, default 500
        Forwarded to `fetch_reddit_posts`.

    Returns
    -------
    DataFrame with one row per post date and columns
    date, avg_sentiment, coin. When no posts are found, an empty frame with
    columns date, coin, avg_sentiment is returned.
    """
    print(f"  Fetching Reddit posts for {coin} with limit {limit}...")
    posts_df = pd.DataFrame(await fetch_reddit_posts(coin, limit))

    if posts_df.empty:
        print(f"  No posts found for {coin}.")
        return pd.DataFrame(columns=["date","coin","avg_sentiment"])

    print(f"  {len(posts_df)} posts fetched for {coin}. Analyzing sentiment...")
    # Score title and body together as a single piece of text.
    combined_text = posts_df["title"] + " " + posts_df["text"]
    scores = combined_text.apply(analyze_sentiment)

    daily_sentiment = (
        scores.groupby(posts_df["date"])
        .mean()
        .rename("avg_sentiment")
        .reset_index()
        .assign(coin=coin)
    )
    print(f"  Sentiment analysis complete for {coin}.")
    return daily_sentiment

In [None]:
async def build_final_dataframe(coins, days=365, reddit_limit=500):
    """
    Combine daily prices with daily Reddit sentiment for each coin.

    Parameters
    ----------
    coins : list of str
        Coin ids; used both for the price download and as Reddit keywords.
    days : int, default 365
        Price-history window forwarded to `get_historical_prices_multi`.
    reddit_limit : int, default 500
        Per-coin post limit forwarded to `get_daily_sentiment`.

    Returns
    -------
    DataFrame
        The price frame left-joined with sentiment on (coin, date); days
        without sentiment keep NaN in `avg_sentiment`.
    """
    # Prices first: one row per coin per day.
    prices = get_historical_prices_multi(coins, days)

    # Sentiment is gathered one coin at a time.
    per_coin_sentiment = []
    for coin_id in coins:
        print(f"Fetching sentiment for {coin_id}...")
        per_coin_sentiment.append(await get_daily_sentiment(coin_id, reddit_limit))
    sentiment_df = pd.concat(per_coin_sentiment, ignore_index=True)

    print("\nSentiment DataFrame before merge:")
    display(sentiment_df.head())

    # A left join keeps every price row even when no posts exist for that day.
    return prices.merge(sentiment_df, how="left", on=["coin","date"])

In [None]:
# Run the whole pipeline for the top coins: one year of prices joined with
# up to 500 Reddit posts of sentiment per coin. This uses top-level `await`,
# so it must be run from the notebook rather than a plain script.
final_df = await build_final_dataframe(major_cryptos, days=365, reddit_limit=500)
# Quick look at the first rows of the merged frame.
print(final_df.head())

Error fetching data for binancecoin: 429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/binancecoin/market_chart?vs_currency=usd&days=365
Error fetching data for solana: 429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/solana/market_chart?vs_currency=usd&days=365
Error fetching data for usd-coin: 429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/usd-coin/market_chart?vs_currency=usd&days=365
Error fetching data for staked-ether: 429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/staked-ether/market_chart?vs_currency=usd&days=365
Error fetching data for dogecoin: 429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/dogecoin/market_chart?vs_currency=usd&days=365
Error fetching data for tron: 429 Client Error: Too Many Requests for url: https://api.coingecko.com/api/v3/coins/tron/market_chart?vs_currency=usd&days=365
Fetching

Unnamed: 0,date,avg_sentiment,coin
0,2025-10-03,0.159766,bitcoin
1,2025-10-04,0.117244,bitcoin
2,2025-10-05,0.225026,bitcoin
3,2025-10-06,0.086817,bitcoin
4,2025-10-07,0.141051,bitcoin


         date         price  pct_change     coin  avg_sentiment
0  2024-10-27  67018.165017         NaN  bitcoin            NaN
1  2024-10-28  67938.554129    1.373343  bitcoin            NaN
2  2024-10-29  69845.304531    2.806581  bitcoin            NaN
3  2024-10-30  72781.113082    4.203301  bitcoin            NaN
4  2024-10-31  72342.621711   -0.602480  bitcoin            NaN


In [None]:
# Inspect the merged price + sentiment frame (avg_sentiment is NaN on days with no posts).
final_df

Unnamed: 0,date,price,pct_change,coin,avg_sentiment
0,2024-10-27,67018.165017,,bitcoin,
1,2024-10-28,67938.554129,1.373343,bitcoin,
2,2024-10-29,69845.304531,2.806581,bitcoin,
3,2024-10-30,72781.113082,4.203301,bitcoin,
4,2024-10-31,72342.621711,-0.602480,bitcoin,
...,...,...,...,...,...
1455,2025-10-22,2.423593,-2.671522,ripple,
1456,2025-10-23,2.363125,-2.494980,ripple,0.572473
1457,2025-10-24,2.394205,1.315200,ripple,
1458,2025-10-25,2.504825,4.620345,ripple,


In [None]:
# Keep only the days that actually have a sentiment score, renumbering rows from 0.
has_sentiment = final_df["avg_sentiment"].notna()
df_clean = final_df[has_sentiment].reset_index(drop=True)
print(df_clean.head())

         date          price  pct_change     coin  avg_sentiment
0  2025-10-03  120611.719116    1.779255  bitcoin       0.159766
1  2025-10-04  122250.151868    1.358436  bitcoin       0.117244
2  2025-10-05  122380.937085    0.106982  bitcoin       0.225026
3  2025-10-06  123506.185200    0.919464  bitcoin       0.086817
4  2025-10-07  124773.508231    1.026121  bitcoin       0.141051


In [None]:
# Number of (coin, day) rows that have a sentiment score.
len(df_clean)

410

In [None]:
# Label every row with the direction of the NEXT day's move for the SAME coin.
#
# The look-ahead is taken on the full price frame, grouped by coin, *before*
# filtering on sentiment. A plain shift(-1) on the filtered frame would
#   * read the next coin's first row at every coin boundary, and
#   * skip over days that were dropped for having no sentiment.
# Assumes final_df is in date order within each coin (the price frame is
# built per coin from a date-sorted groupby).
labelled = final_df.assign(
    next_pct_change=final_df.groupby("coin")["pct_change"].shift(-1)
)

# Keep rows that have both a sentiment score and a known next-day move;
# each coin's final day has no target and is dropped here.
labelled = labelled.dropna(subset=["avg_sentiment", "next_pct_change"])

# Rebuilding df_clean from final_df makes this cell safe to re-run.
df_clean = (
    labelled.assign(target=(labelled["next_pct_change"] > 0).astype(int))
    .drop(columns="next_pct_change")
    .reset_index(drop=True)
)

print(df_clean.head())

         date          price  pct_change     coin  avg_sentiment  target
0  2025-10-03  120611.719116    1.779255  bitcoin       0.159766       1
1  2025-10-04  122250.151868    1.358436  bitcoin       0.117244       1
2  2025-10-05  122380.937085    0.106982  bitcoin       0.225026       1
3  2025-10-06  123506.185200    0.919464  bitcoin       0.086817       1
4  2025-10-07  124773.508231    1.026121  bitcoin       0.141051       0


In [None]:
# Model inputs: today's price, average Reddit sentiment and daily % change.
feature_columns = ["price", "avg_sentiment", "pct_change"]
X = df_clean[feature_columns]

# Label: 1 if the following row's pct_change is positive, else 0.
y = df_clean["target"]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split data chronologically.
# df_clean is ordered coin-by-coin, so an unshuffled split on it would hold
# out only the last coin's rows, which are not the most recent dates.
# Order by date first so the final 10% really is the latest period.
chronological = df_clean.loc[X.index, "date"].sort_values(kind="stable").index
X_train, X_test, y_train, y_test = train_test_split(
    X.loc[chronological], y.loc[chronological], test_size=0.1, shuffle=False
)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Gradient-boosted classifier predicting next-day direction (1 = up, 0 = down).
# `use_label_encoder` was dropped: this XGBoost version reports it as unused.
model = XGBClassifier(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.06,
    random_state=10,          # fixed seed for reproducible runs
    eval_metric='logloss'
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Hold-out performance on the test split.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6829268292682927
              precision    recall  f1-score   support

           0       0.71      0.60      0.65        20
           1       0.67      0.76      0.71        21

    accuracy                           0.68        41
   macro avg       0.69      0.68      0.68        41
weighted avg       0.69      0.68      0.68        41



In [None]:
# Predict probabilities instead of just labels
y_proba = model.predict_proba(X_test)

# y_proba[:, 1] gives probability of class 1 (price will rise)
confidence_up = y_proba[:, 1] * 100  # convert to percentage
confidence_down = y_proba[:, 0] * 100

# Example: show up to the first 5 predictions.
# Slicing (rather than range(5)) stays safe when the test set has fewer rows.
for pred, up, down in zip(y_pred[:5], confidence_up[:5], confidence_down[:5]):
    label = "Up" if pred == 1 else "Down"
    print(f"Predicted: {label}, Confidence Up: {up:.2f}%, Confidence Down: {down:.2f}%")


Predicted: Up, Confidence Up: 95.23%, Confidence Down: 4.77%
Predicted: Down, Confidence Up: 39.54%, Confidence Down: 60.46%
Predicted: Up, Confidence Up: 98.30%, Confidence Down: 1.70%
Predicted: Down, Confidence Up: 31.86%, Confidence Down: 68.14%
Predicted: Up, Confidence Up: 69.14%, Confidence Down: 30.86%


In [None]:
import pandas as pd

# One row per test sample: when, which coin, the predicted direction and the
# model's confidence in each direction (percent).
# X_test.index holds index *labels* of df_clean, so look them up with .loc;
# .iloc would only agree while df_clean has a default 0..n-1 index.
results = pd.DataFrame({
    "date": df_clean["date"].loc[X_test.index],
    "coin": df_clean["coin"].loc[X_test.index],
    "predicted_direction": ["Up" if p==1 else "Down" for p in y_pred],
    "confidence_up": confidence_up,
    "confidence_down": confidence_down
})

print(results.head())


           date    coin predicted_direction  confidence_up  confidence_down
368  2025-07-07  ripple                  Up      95.232140         4.767859
369  2025-07-08  ripple                Down      39.538757        60.461246
370  2025-07-11  ripple                  Up      98.295036         1.704967
371  2025-07-13  ripple                Down      31.857979        68.142021
372  2025-07-14  ripple                  Up      69.139046        30.860954


In [None]:
import joblib

# Persist the fitted classifier `model` to disk so it can be reloaded later
# with joblib.load("crypto_sentiment_model.pkl").
# NOTE(review): this is a pickle-based file — only load it from a trusted source.
joblib.dump(model, "crypto_sentiment_model.pkl")


['crypto_sentiment_model.pkl']