<a href="https://colab.research.google.com/github/LorenzoKaufmann/crypto_forecast/blob/main/Sentiment_Analysis_VADER_weekly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Import

In [12]:
!pip install dash

Collecting dash
  Downloading dash-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting retrying (from dash)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Downloading dash-3.2.0-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.4.2-py3-none-any.whl (10 kB)
Installing collected packages: retrying, dash
Successfully installed dash-3.2.0 retrying-1.4.2


In [1]:
from datetime import datetime, timedelta
from textblob import TextBlob
import pandas as pd
import spacy
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import os
import matplotlib.ticker as mticker
import matplotlib.dates as mdates

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download NLTK resources
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("vader_lexicon")
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [21]:
from google.colab import drive
drive.mount('/drive')
folder_path = '/drive/My Drive/Colab Notebooks/Crypto Project/Data/reddit_posts'
folder_path_daily_update = '/drive/My Drive/Colab Notebooks/Crypto Project/Data/reddit_posts/Daily_Update'
test_case = False

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


###read JSON File

In [22]:
def read_json_data(folder_path, get_function='all'):
  days_list = [f[17:-5] for f in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, f))]
  print(f'List of Days: {days_list}')
  all_posts = pd.DataFrame(columns=['index', 'title', 'score', 'text', 'author', 'url',
                                    'num_comments', 'subreddit', 'fetch_method'])
  for day in days_list:
    filename = f'reddit_posts_{get_function}_{day}.json'
    posts_df = pd.read_json(os.path.join(folder_path, filename),  orient='index')
    posts_df['created_utc'] = pd.to_datetime(posts_df['created_utc'], unit='s')
    posts_df = posts_df.reset_index().set_index('created_utc')
    if all_posts.empty:
      all_posts = posts_df
    else:
      all_posts = pd.concat([all_posts, posts_df])
  all_posts = all_posts.drop_duplicates(subset='index', keep='first')
  return all_posts


def text_analysis_VADER(row, colname, sia):
    # print(text)
    text = row[colname]
    sentiment = sia.polarity_scores(text)["compound"]  # Analyze sentiment
    return sentiment



In [23]:
all_reddit_posts = read_json_data(folder_path_daily_update, get_function='all')

sia = SentimentIntensityAnalyzer()

all_reddit_posts['sent_analysis_text_VADER'] = all_reddit_posts.apply(text_analysis_VADER, args=('text', sia), axis=1)
all_reddit_posts['week_year'] = all_reddit_posts.index.strftime('%Y-%W')

weekly_df = (
    all_reddit_posts
    .groupby('week_year', as_index=False)
    .agg(vader_mean=('sent_analysis_text_VADER', 'mean'))
)
print(weekly_df.head())

List of Days: ['2024_11_22', '2024_11_23', '2024_11_25', '2024_11_26', '2024_11_27', '2024_11_29', '2024_12_03', '2024_12_04', '2024_12_11', '2024_12_16', '2025_07_20', '2025_08_07', '2025_08_16']
  week_year  vader_mean
0   2014-00    0.000000
1   2017-36    0.991500
2   2017-38   -0.976600
3   2017-40    0.977400
4   2017-43   -0.217525


####load crypto data

In [24]:
cryptocurrency_release_dates = {
    "Bitcoin (BTC)": "2012-01-01",
    "Ethereum (ETH)": "2015-07-30",
    "Tether (USDT)": "2014-10-06",
    "Binance Coin (BNB)": "2017-07-25",
    "USD Coin (USDC)": "2018-09-26",
    "XRP (Ripple)": "2012-01-01",
    "Solana (SOL)": "2020-03-01",
    "Cardano (ADA)": "2017-09-29",
    "Dogecoin (DOGE)": "2013-12-06",
    "Toncoin (TON)": "2021-08-01",
    "Polkadot (DOT)": "2020-05-26",
    "Litecoin (LTC)": "2012-01-01",
    "Chainlink (LINK)": "2017-09-19",
    "Uniswap (UNI)": "2020-09-16",
    "Bitcoin Cash (BCH)": "2017-08-01",
    "Stellar (XLM)": "2014-07-31",
    "VeChain (VET)": "2018-08-01",
    "TRON (TRX)": "2017-09-13",
    "Filecoin (FIL)": "2020-10-15",
    "Aave (AAVE)": "2020-10-01"
}

In [25]:
folder_path_crypto = '/drive/My Drive/Colab Notebooks/Crypto Project/Data/cryptocompare'

coin_dict = {}
for coin, release_date in cryptocurrency_release_dates.items():
  coin_name = coin.split(' (')[0]
  coin_symbol = coin.split(' (')[1][:-1]
  release_date = pd.to_datetime(release_date)
  try:
    currency_df = pd.read_csv(f'{folder_path_crypto}/{coin_symbol}_file.csv', parse_dates=['time'], index_col=0, date_format='%Y-%m-%d')
    currency_df = currency_df[['time', 'high', 'low', 'open', 'volumefrom', 'volumeto', 'close']]

  except FileNotFoundError:
    print(f"No file found for {coin_symbol}")
    continue

  coin_dict[coin_name] = currency_df

No file found for Ripple


In [26]:
def plot_crypto_sentiment(df, crypto_currency):
    """
    Plot crypto close price vs. sentiment analysis over time.

    Parameters:
    -----------
    df : pandas.DataFrame
        Must contain:
        - 'time' (datetime)
        - 'close' (numeric)
        - 'moving_avg_vader_mean' (numeric, 0–1 scale)
    crypto_currency : str
        Name of the cryptocurrency for title labeling.
    """

    plt.style.use("seaborn-v0_8-darkgrid")

    fig, ax1 = plt.subplots(figsize=(15, 7))

    price_color = "#008080"   # deep teal
    sent_color  = "#FF7F0E"   # vivid orange

    # Left axis — Close Price
    line1, = ax1.plot(
        df['time'],
        df['close'],
        label='Close Price',
        color=price_color,
        linewidth=2.5,
        alpha=0.9
    )
    ax1.fill_between(
        df['time'],
        df['close'],
        alpha=0.1,
        color=price_color
    )
    ax1.set_ylabel('Close Price (USD)', color=price_color, fontsize=13)
    ax1.tick_params(axis='y', labelcolor=price_color)
    ax1.yaxis.set_major_formatter(mticker.StrMethodFormatter('$ {x:,.0f}'))

    # Right axis — VADER Sentiment
    ax2 = ax1.twinx()
    line2, = ax2.plot(
        df['time'],
        df['moving_avg_vader_mean'],
        label='Sentiment Analysis',
        color=sent_color,
        linewidth=2.5,
        linestyle='--',
        alpha=0.9,
        zorder=5
    )
    ax2.set_ylabel('Sentiment Analysis', color=sent_color, fontsize=13)
    ax2.tick_params(axis='y', labelcolor=sent_color)
    ax2.yaxis.set_major_formatter(mticker.PercentFormatter(1.0))

    # Ensure sentiment is visible
    ax2.set_ylim(
        df['moving_avg_vader_mean'].min() - 0.05,
        df['moving_avg_vader_mean'].max() + 0.05
    )

    ax1.xaxis.set_major_locator(mdates.YearLocator())
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
    ax1.xaxis.set_minor_locator(mdates.MonthLocator())
    ax1.xaxis.set_minor_formatter(mdates.DateFormatter('%b'))

    # Tick styles
    ax1.tick_params(axis='x', which='major', length=10, width=1.2, pad=20)  # move years lower
    ax1.tick_params(axis='x', which='minor', length=5, width=0.8, labelsize=8, pad=2)

    # Styling labels
    plt.setp(ax1.xaxis.get_majorticklabels(), rotation=0, ha='center', fontsize=11, color='black')
    plt.setp(ax1.xaxis.get_minorticklabels(), rotation=90, ha='center', fontsize=8, color='gray')

    # X-axis limits
    ax1.set_xlim(df['time'].min(), df['time'].max())

    # Title
    plt.title(
        f"{crypto_currency} Close Price & Sentiment Analysis",
        fontsize=16,
        fontweight='bold',
        pad=15
    )

    plt.tight_layout()
    plt.show()


In [42]:
coin_dict.keys()

dict_keys(['Bitcoin', 'Ethereum', 'Tether', 'Binance Coin', 'USD Coin', 'Solana', 'Cardano', 'Dogecoin', 'Toncoin', 'Polkadot', 'Litecoin', 'Chainlink', 'Uniswap', 'Bitcoin Cash', 'Stellar', 'VeChain', 'TRON', 'Filecoin', 'Aave'])

In [16]:
# for crypto_currency in coin_dict.keys():
master_merge_df = pd.DataFrame(columns=['week_year', 'close', 'time',
                                        'moving_avg_vader_mean', 'crypto_coin'])
for crypto_currency in ["Bitcoin", "Ethereum", "Solana", "Cardano"]:
  print(crypto_currency)
  coin_df = coin_dict[crypto_currency]
  coin_df['week_year'] = coin_df["time"].dt.strftime('%Y-%W')

  weekly_bitcin_df = (
      coin_df
      .groupby('week_year', as_index=False)
      .agg(close=('close', 'last'), time=('time', 'first'))
  )

  merged_df = weekly_bitcin_df.merge(weekly_df, on='week_year')
  merged_df['moving_avg_vader_mean'] = merged_df['vader_mean'].rolling(window=4).mean()
  merged_df = merged_df.loc[(merged_df['time'] >= datetime(2018, 11, 1)) &
                            (merged_df['time'] <= datetime(2025, 6, 1))]
  # Calculate the Pearson correlation coefficient
  correlation, p_value = stats.pearsonr(merged_df['close'], merged_df['vader_mean'])
  merged_df['crypto_coin'] = crypto_currency
  print(f"The Pearson correlation coefficient between close price \
  and vader_mean is: {correlation}\nP-Value: {p_value}")
  master_merge_df = pd.concat([master_merge_df, merged_df])
  # plot_crypto_sentiment(merged_df, crypto_currency)


Bitcoin
The Pearson correlation coefficient between close price   and vader_mean is: 0.23458287115002172
P-Value: 9.682773060493338e-05
Ethereum



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



The Pearson correlation coefficient between close price   and vader_mean is: 0.25344476134634336
P-Value: 2.420586339691795e-05
Solana
The Pearson correlation coefficient between close price   and vader_mean is: 0.08783021840409583
P-Value: 0.19745394496451893
Cardano
The Pearson correlation coefficient between close price   and vader_mean is: 0.15607917581814723
P-Value: 0.010073372446385991


##Buid Dashboard

In [27]:
last_date = master_merge_df['time'].max()
print(last_date)

2025-05-26 00:00:00


In [18]:
import dash
from dash import dcc, html
import plotly.graph_objs as go

# -----------------------------
# Dashboard App
# -----------------------------
app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Crypto Price & Sentiment Dashboard", style={'textAlign': 'center'}),

    # Dropdown for selecting crypto
    html.Label("Select Cryptocurrency:"),
    dcc.Dropdown(
        id='crypto-dropdown',
        options=[
            {'label': 'Bitcoin', 'value': 'Bitcoin'},
            {'label': 'Ethereum', 'value': 'Ethereum'},
            {'label': 'Solana', 'value': 'Solana'},
            {'label': 'Cardano', 'value': 'Cardano'},
        ],
        value='Bitcoin',
        clearable=False,
        style={'width': '40%'}
    ),

    # Graphs
    dcc.Graph(id='crypto-sentiment-graph'),
    dcc.Graph(id='crypto-barplot')
])

# -----------------------------
# Callback for line + sentiment chart
# -----------------------------
@app.callback(
    dash.Output('crypto-sentiment-graph', 'figure'),
    [dash.Input('crypto-dropdown', 'value')]
)
def update_graph(crypto_currency):
    df = master_merge_df.loc[master_merge_df['crypto_coin'] == crypto_currency]

    fig = go.Figure()

    # Close Price
    fig.add_trace(go.Scatter(
        x=df['time'], y=df['close'],
        name="Close Price",
        line=dict(color="#008080", width=2.5),
        fill='tozeroy',
        yaxis="y1"
    ))

    # Sentiment
    fig.add_trace(go.Scatter(
        x=df['time'], y=df['moving_avg_vader_mean'],
        name="Sentiment (MA)",
        line=dict(color="#FF7F0E", width=2.5, dash="dash"),
        yaxis="y2"
    ))

    # Layout with two y-axes
    fig.update_layout(
        title=f"{crypto_currency} Close Price & Sentiment Over Time",
        xaxis=dict(
            title="Date",
            showgrid=True,
            dtick="M1",
            tickformat="%b\n%Y"
        ),
        yaxis=dict(
            title="Close Price (USD)",
            titlefont=dict(color="#008080"),
            tickfont=dict(color="#008080"),
            side="left"
        ),
        yaxis2=dict(
            title="Sentiment",
            titlefont=dict(color="#FF7F0E"),
            tickfont=dict(color="#FF7F0E"),
            overlaying="y",
            side="right",
            tickformat=".0%"
        ),
        legend=dict(x=0.01, y=0.99, bgcolor="rgba(255,255,255,0.7)"),
        template="plotly_white",
        height=600
    )

    return fig

# -----------------------------
# Callback for bar plot
# -----------------------------
@app.callback(
    dash.Output('crypto-barplot', 'figure'),
    [dash.Input('crypto-dropdown', 'value')]
)
def update_barplot(selected_crypto):
    # Get the last available date
    last_date = master_merge_df['time'].max()
    one_month_ago = last_date - pd.DateOffset(months=1)

    # Filter last month
    last_month_df = master_merge_df[master_merge_df['time'] >= one_month_ago]

    # Compute stats
    stats_df = last_month_df.groupby('crypto_coin').agg(
        latest_close=('close', 'last'),
        std_close=('close', 'std')
    ).reset_index()

    # Bar plot
    fig = go.Figure(data=[
        go.Bar(
            x=stats_df['crypto_coin'],
            y=stats_df['latest_close'],
            name="Latest Close Price",
            marker_color="#008080",
            yaxis="y1"
        ),
        go.Bar(
            x=stats_df['crypto_coin'],
            y=stats_df['std_close'],
            name="Std Dev (Last Month)",
            marker_color="#FF7F0E",
            yaxis="y2"
        )
    ])

    # Dual y-axes: one for price, one for std dev
    fig.update_layout(
        title="Latest Close Prices & Volatility (Std Dev Last Month)",
        xaxis=dict(title="Cryptocurrency"),
        yaxis=dict(
            title="Latest Close Price (USD)",
            titlefont=dict(color="#008080"),
            tickfont=dict(color="#008080"),
        ),
        yaxis2=dict(
            title="Standard Deviation",
            titlefont=dict(color="#FF7F0E"),
            tickfont=dict(color="#FF7F0E"),
            overlaying="y",
            side="right"
        ),
        barmode="group",
        template="plotly_white",
        height=500,
        legend=dict(x=0.01, y=0.99, bgcolor="rgba(255,255,255,0.7)")
    )

    return fig


# -----------------------------
# Run app
# -----------------------------
if __name__ == '__main__':
    app.run(debug=True)


<IPython.core.display.Javascript object>