In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns


# OVERVIEW


Title:
Predicting Bitcoin Price Movement Using Twitter Sentiment and Machine Learning

Objective:
The goal of this project is to determine whether aggregated Twitter sentiment about Bitcoin can be used to forecast its price movement over the next 24 hours. Cryptocurrencies like Bitcoin are highly volatile and influenced heavily by investor sentiment, which makes traditional financial models less effective.
We analyse tweets about Bitcoin posted on Twitter and predict the direction of the Bitcoin price from their sentiment: positive sentiment is expected to precede a price increase, and negative sentiment a decrease.

Data Sources:

1. Historical Bitcoin Price Data: Provides the ground truth for price movement (Open, Close, High, Low, Volume). Source: <https://www.kaggle.com/datasets/mohammednawazkagle04/bitcoins-history-data>

2.Twitter Sentiment Data: Tweets mentioning Bitcoin, analyzed to compute sentiment scores (positive, negative, neutral).
Source: <https://www.kaggle.com/datasets/mohammednawazkagle04/btc-tweets-sentiment>


# LOAD DATASETS 

In [2]:
import os

# Print the file names found in each attached Kaggle input directory so the
# CSV paths used by later cells can be checked by eye.
for heading, input_dir in (
    (" Bitcoin price dataset contains:", "/kaggle/input/bitcoins-history-data"),
    ("\n Tweet sentiment dataset contains:", "/kaggle/input/btc-tweets-sentiment"),
):
    print(heading)
    print(os.listdir(input_dir))


 Bitcoin price dataset contains:


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/bitcoins-history-data'

In [None]:
import pandas as pd

# Read both raw CSVs: the BTC/USD price file and the sentiment-labelled tweets.
btc_price, tweets = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/bitcoins-history-data/btcusd_1-min_data.csv',
        '/kaggle/input/btc-tweets-sentiment/BTC_Tweets_Updated.csv',
    )
)

# Report (rows, columns) for each frame.
for caption, frame in (("BTC Price Data:", btc_price), ("Tweets Data:", tweets)):
    print(caption, frame.shape)


In [None]:
# Show the raw column names of both frames; later cells rename columns
# positionally, so the order printed here matters.
for caption, frame in (
    ("BTC Price Columns:\n", btc_price),
    ("\nTweets Columns:\n", tweets),
):
    print(caption, frame.columns)



In [None]:
# Parse the tweet timestamp strings with an explicit format; values that do
# not match the format become NaT because of errors='coerce'.
tweets['Date'] = pd.to_datetime(
    tweets['Date'],
    format='%a %b %d %H:%M:%S %z %Y',
    errors='coerce',
)


# DATE AND TIME CONVERSION:

In [None]:
# --- BTC dataset ---
# Column names are assigned by position, so this assumes the frame currently
# has exactly these six columns in this order.
btc_price.columns = 'Timestamp Open High Low Close Volume'.split()

# Turn the epoch-seconds column into a datetime 'Date' column; pop() removes
# the old 'Timestamp' column in the same step.
btc_price['Date'] = pd.to_datetime(btc_price.pop('Timestamp'), unit='s')

# --- Tweets dataset ---
# Positional rename as well (eleven columns expected).
tweets.columns = [
    'id', 'Date', 'Tweet', 'Screen_name', 'Source', 'Link', 'Sentiment',
    'sent_score', 'New_Sentiment_Score', 'New_Sentiment_State', 'BERT_Labels',
]

# Parse the tweet timestamps, then strip the timezone so the values are naive.
parsed_tweet_dates = pd.to_datetime(
    tweets['Date'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce'
)
tweets['Date'] = parsed_tweet_dates.dt.tz_localize(None)




In [None]:
# Preview the first rows of the BTC frame, then of the tweets frame.
for frame in (btc_price, tweets):
    print(frame.head())


In [None]:
# Convert the BTC epoch-seconds column to a datetime 'Date' column.
#
# This conversion is only valid while the raw 'Timestamp' column is still
# present. The date-conversion cell above already drops it, after which the
# frame's six columns are Open/High/Low/Close/Volume/Date. Re-assigning the
# names positionally at that point would relabel 'Open' as 'Timestamp' and
# parse prices as epoch seconds, so the work is skipped once it has been done.
if 'Timestamp' in btc_price.columns:
    # Positional rename: assumes the raw six-column layout.
    btc_price.columns = ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']

    # Convert epoch seconds to datetime
    btc_price['Date'] = pd.to_datetime(btc_price['Timestamp'], unit='s')

    # Drop old Timestamp column
    btc_price = btc_price.drop(columns=['Timestamp'])


# CREATE A DAILY SUMMARY OF THE TWEET DATA:

In [None]:
# Make sure tweet dates are timezone-naive datetimes.
tweets['Date'] = pd.to_datetime(
    tweets['Date'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce'
)
tweets['Date'] = tweets['Date'].dt.tz_localize(None)

# One row per calendar day, indexed by date:
#   New_Sentiment_Score - mean score of that day's tweets
#   Total_Tweets        - number of tweets that day
#   Positive_Tweets     - number of tweets whose BERT label equals 1
#                         (presumably the "positive" class; confirm the encoding)
tweet_daily = tweets.groupby(tweets['Date'].dt.date).agg(
    New_Sentiment_Score=('New_Sentiment_Score', 'mean'),
    Total_Tweets=('Tweet', 'count'),
    Positive_Tweets=('BERT_Labels', lambda labels: (labels == 1).sum()),
)


In [None]:
import pandas as pd

# Re-read both raw CSVs into fresh frames (`btc` for prices, `tweets` for the
# sentiment-labelled tweets), discarding any earlier in-memory changes to `tweets`.
btc, tweets = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bitcoins-history-data/btcusd_1-min_data.csv",
        "/kaggle/input/btc-tweets-sentiment/BTC_Tweets_Updated.csv",
    )
)

for caption, frame in ((" BTC Data Shape:", btc), (" Tweets Data Shape:", tweets)):
    print(caption, frame.shape)



# LOAD THE DATA AND USE THE BERT POSITIVE-TWEET COUNTS:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1. Load datasets
btc = pd.read_csv("/kaggle/input/bitcoins-history-data/btcusd_1-min_data.csv")
tweets = pd.read_csv("/kaggle/input/btc-tweets-sentiment/BTC_Tweets_Updated.csv")

# 2. Parse BTC timestamps as UNIX epoch *seconds*, the unit every other cell in
#    this notebook uses for this file. Reading second-resolution values as
#    milliseconds squeezes the whole history into a few weeks of January 1970,
#    so the daily merge below would never line up with any tweet date.
btc['Timestamp'] = pd.to_datetime(btc['Timestamp'], unit='s', errors='coerce')

# Drop rows whose timestamp could not be parsed
btc = btc.dropna(subset=['Timestamp'])

# Aggregate BTC rows to one OHLCV row per calendar day.
# 'first'/'last' take rows in file order, so this assumes the file is in
# chronological order.
btc_daily = (
    btc.groupby(btc['Timestamp'].dt.date)
    .agg({
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum'
    })
    .reset_index()
    .rename(columns={'Timestamp': 'Date'})
)

# 3. Parse tweet dates and strip the timezone
tweets['Date'] = pd.to_datetime(tweets['Date'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce')
tweets['Date'] = tweets['Date'].dt.tz_localize(None)

# Aggregate tweets per calendar day: mean score, tweet count, and count of
# tweets whose 'BERT Labels' value equals 1.
tweet_daily = (
    tweets.groupby(tweets['Date'].dt.date)
    .agg({
        'New_Sentiment_Score': 'mean',
        'Tweet': 'count',
        'BERT Labels': lambda x: (x == 1).sum()
    })
    .rename(columns={'Tweet': 'Total_Tweets', 'BERT Labels': 'Positive_Tweets'})
    .reset_index()
    .rename(columns={'Date': 'Tweet_Date'})
)

# 4. Convert both keys to plain date objects for merging
btc_daily['Date'] = pd.to_datetime(btc_daily['Date']).dt.date
tweet_daily['Tweet_Date'] = pd.to_datetime(tweet_daily['Tweet_Date']).dt.date

# 5. Merge BTC and tweet sentiment (left join keeps all BTC dates)
merged = pd.merge(
    btc_daily, tweet_daily,
    left_on='Date', right_on='Tweet_Date',
    how='left'
)
merged = merged.drop(columns=['Tweet_Date'])

# Days without tweets get 0 for all three tweet-derived columns
tweet_cols = ['New_Sentiment_Score', 'Total_Tweets', 'Positive_Tweets']
merged[tweet_cols] = merged[tweet_cols].fillna(0)

# 6. Create target variable: 1 when the next day's close is higher than today's
merged['Next_Close'] = merged['Close'].shift(-1)
merged['Price_Change'] = merged['Next_Close'] - merged['Close']
# .copy() so the Target assignment below writes to an independent frame
merged = merged.dropna(subset=['Close', 'Next_Close', 'Price_Change']).copy()
merged['Target'] = (merged['Price_Change'] > 0).astype(int)

print("Merged dataset ready:", merged.shape)
print(merged.head())

# 7. Train-test split (shuffle=False keeps the chronological order)
X = merged[['Open', 'High', 'Low', 'Close', 'Volume',
            'New_Sentiment_Score', 'Total_Tweets', 'Positive_Tweets']]
y = merged['Target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# 8. Scale features (statistics are fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)



In [None]:
# Re-read the raw BTC price CSV into `btc`.
btc = pd.read_csv(
    filepath_or_buffer="/kaggle/input/bitcoins-history-data/btcusd_1-min_data.csv"
)


In [None]:
# Show the files available in the tweet-sentiment input directory.
tweet_input_files = os.listdir("/kaggle/input/btc-tweets-sentiment")
print(tweet_input_files)


In [None]:
# Re-read the raw tweet-sentiment CSV into `tweets`.
tweets = pd.read_csv(
    filepath_or_buffer="/kaggle/input/btc-tweets-sentiment/BTC_Tweets_Updated.csv"
)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the raw price and tweet CSVs under the names used by the cells below.
bitcoin_df, tweets_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bitcoins-history-data/btcusd_1-min_data.csv",
        "/kaggle/input/btc-tweets-sentiment/BTC_Tweets_Updated.csv",
    )
)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import joblib


In [None]:
# Load the Bitcoin price data and the tweet sentiment data from their CSVs.
bitcoin_df, tweets_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/bitcoins-history-data/btcusd_1-min_data.csv",
        "/kaggle/input/btc-tweets-sentiment/BTC_Tweets_Updated.csv",
    )
)

# Report (rows, columns) for each frame.
for caption, frame in (
    ("Bitcoin data shape:", bitcoin_df),
    (" Tweets data shape:", tweets_df),
):
    print(caption, frame.shape)


CONVERSION OF DATE COLUMNS:

In [None]:
# Rename Timestamp -> Date for Bitcoin data
bitcoin_df.rename(columns={'Timestamp': 'Date'}, inplace=True)

# The BTC column holds numeric UNIX epoch seconds (the other cells in this
# notebook parse it with unit='s'). Without `unit`, pandas reads numbers as
# nanoseconds, so every row would land on 1970-01-01 and the daily grouping
# below would produce a single day.
bitcoin_df['Date'] = pd.to_datetime(bitcoin_df['Date'], unit='s', errors='coerce')

# Parse tweet dates with the same explicit format the earlier cells use, then
# drop the timezone so both frames hold naive datetimes. Values that do not
# match the format become NaT.
tweets_df['Date'] = pd.to_datetime(
    tweets_df['Date'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce'
).dt.tz_localize(None)

print(" Date columns converted successfully!")


In [None]:
# Mean sentiment score per calendar day, returned as a two-column frame
# ('Date', 'Avg_Sentiment').
tweets_daily = (
    tweets_df.groupby(tweets_df['Date'].dt.date)['New_Sentiment_Score']
    .mean()
    .reset_index()
    .rename(columns={'New_Sentiment_Score': 'Avg_Sentiment'})
)


In [None]:
# Bitcoin: one row per calendar day with the last Close seen that day and the
# summed Volume.
bitcoin_daily = (
    bitcoin_df.groupby(bitcoin_df['Date'].dt.date)
    .agg(Close=('Close', 'last'), Volume=('Volume', 'sum'))
    .reset_index()
)

# Tweets: mean sentiment score per calendar day, stored as 'Avg_Sentiment'.
tweets_daily = (
    tweets_df.groupby(tweets_df['Date'].dt.date)
    .agg({'New_Sentiment_Score': 'mean'})
    .reset_index()
    .rename(columns={'New_Sentiment_Score': 'Avg_Sentiment'})
)


In [None]:
# Keep only the days present in both daily frames, then drop any row that
# still contains a missing value.
merged_df = (
    bitcoin_daily
    .merge(tweets_daily, on='Date', how='inner')
    .dropna()
)


In [None]:
# Next row's close, and a binary Target that is 1 when it is higher than the
# current close. The final row has no next close and is removed by dropna().
following_close = merged_df['Close'].shift(-1)
merged_df = merged_df.assign(
    Next_Close=following_close,
    Target=(following_close > merged_df['Close']).astype(int),
).dropna()


# SIMPLIFY THE DATASET

THE SAME PIPELINE REWRITTEN IN ONE CELL, WHICH IS EASIER TO DEMONSTRATE AND DEBUG:

In [None]:
import pandas as pd


# Step 1: Load datasets

bitcoin_df = pd.read_csv("/kaggle/input/bitcoins-history-data/btcusd_1-min_data.csv")
tweets_df = pd.read_csv("/kaggle/input/btc-tweets-sentiment/BTC_Tweets_Updated.csv")

# Inspect columns
print("Bitcoin columns:", bitcoin_df.columns)
print("Tweets columns:", tweets_df.columns)


# Step 2: Convert dates

# Bitcoin: convert Timestamp (UNIX epoch seconds) to a plain calendar date
bitcoin_df['Date'] = pd.to_datetime(bitcoin_df['Timestamp'], unit='s').dt.date

# Tweets: parse with the explicit format used by the earlier cells instead of
# relying on format inference; unparseable values become NaT.
tweets_df['Date'] = pd.to_datetime(
    tweets_df['Date'], format='%a %b %d %H:%M:%S %z %Y', errors='coerce'
).dt.date


# Step 3: Aggregate Bitcoin to daily

bitcoin_daily = bitcoin_df.groupby('Date').agg({
    'Close': 'last',  # last row of the day in file order
    'Volume': 'sum'
}).reset_index()


# Step 4: Aggregate Tweets to daily sentiment

tweets_daily = (
    tweets_df.groupby('Date')
    .agg({'New_Sentiment_Score': 'mean'})  # average sentiment per day
    .reset_index()
    .rename(columns={'New_Sentiment_Score': 'Avg_Sentiment'})
)


# Step 5: Merge datasets

merged_df = pd.merge(bitcoin_daily, tweets_daily, on='Date', how='outer')

# Fill missing sentiment with 0 (no tweets that day).
# Assign the result back: calling fillna(inplace=True) on a column selection
# modifies a temporary object and leaves merged_df untouched when pandas
# Copy-on-Write is active (pandas 2.x warns about this pattern).
merged_df['Avg_Sentiment'] = merged_df['Avg_Sentiment'].fillna(0)


# Step 6: Create target variable (1 = next row's close is higher)

merged_df['Next_Close'] = merged_df['Close'].shift(-1)
merged_df['Target'] = (merged_df['Next_Close'] > merged_df['Close']).astype(int)

# Drop rows with any missing value; this removes the last row (no next close)
# and any date that came only from the tweets side of the outer merge.
merged_df = merged_df.dropna()


# Step 7: Check merged data

print("Merged DataFrame shape:", merged_df.shape)
print(merged_df.head())



#  BALANCE THE CLASSES AND SPLIT THE DATASET:

In [None]:
from sklearn.utils import resample

feature_cols = ['Close', 'Volume', 'Avg_Sentiment']

# Hold out the test rows BEFORE balancing. Upsampling first and splitting
# afterwards puts copies of the same row in both train and test, so the test
# score rewards memorisation rather than generalisation.
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Work out which class is the majority from the training rows, rather than
# assuming it is Target == 1.
majority_label = train_df['Target'].value_counts().idxmax()
df_majority = train_df[train_df['Target'] == majority_label]
df_minority = train_df[train_df['Target'] != majority_label]

# Upsample the minority class (training rows only) to the majority count.
if df_minority.empty:
    # Only one class present: nothing to resample.
    df_minority_upsampled = df_minority
else:
    df_minority_upsampled = resample(
        df_minority,
        replace=True,                # sample with replacement
        n_samples=len(df_majority),  # match majority count
        random_state=42
    )

# Balanced training set
balanced_df = pd.concat([df_majority, df_minority_upsampled])

# X / y now hold the balanced training data; the test set keeps its original,
# unbalanced class distribution.
X = balanced_df[feature_cols]
y = balanced_df['Target']

X_train, y_train = X, y
X_test, y_test = test_df[feature_cols], test_df['Target']


In [None]:
from sklearn.model_selection import train_test_split

# Features and label taken straight from merged_df (no class balancing here).
X, y = merged_df[['Close','Volume','Avg_Sentiment']], merged_df['Target']

# Random 80/20 split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


# EVALUATION CELL

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample
import numpy as np

feature_cols = ['Close', 'Volume', 'Avg_Sentiment']

# 1. Split first. Balancing before the split duplicates minority rows into
#    both train and test and inflates every reported metric.
train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# 2. Balance the TRAINING rows only; the majority class is detected from the
#    data instead of being assumed to be Target == 1.
majority_label = train_df['Target'].value_counts().idxmax()
df_majority = train_df[train_df['Target'] == majority_label]
df_minority = train_df[train_df['Target'] != majority_label]

if df_minority.empty:
    # Only one class present: nothing to resample.
    df_minority_upsampled = df_minority
else:
    df_minority_upsampled = resample(
        df_minority,
        replace=True,
        n_samples=len(df_majority),
        random_state=42
    )

balanced_df = pd.concat([df_majority, df_minority_upsampled])

# 3. Features & target (X / y hold the balanced training data)
X = balanced_df[feature_cols]
y = balanced_df['Target']

X_train, y_train = X, y
X_test, y_test = test_df[feature_cols], test_df['Target']

# 4. Feature scaling (statistics fitted on training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Train model with balanced class weights
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr_model.fit(X_train_scaled, y_train)

# 6. Predictions on the untouched test rows
y_pred = lr_model.predict(X_test_scaled)

# 7. Check that the model predicts more than one class
print("Unique predictions:", np.unique(y_pred, return_counts=True))

# 8. Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))


DESCRIPTION: BERT Sentiment Labels Explanation:


The `BERT Labels` column in our tweets dataset was generated using a pre-trained BERT (Bidirectional Encoder Representations from Transformers) model for sentiment classification.

- Each tweet was passed through the BERT model to determine whether it expressed a positive, neutral, or negative sentiment toward Bitcoin.
- The output labels were then stored in the column `BERT Labels`.
- Using these labels, we computed the `New_Sentiment_Score` (numerical representation of tweet sentiment) and the `New_Sentiment_State` (categorical representation).
- Finally, the daily average sentiment (`Avg_Sentiment`) was calculated by aggregating tweet sentiment scores per day and was merged with the Bitcoin price data.

This allows us to correlate social sentiment with market movement, aiming to predict whether Bitcoin’s price will rise or fall within the next 24 hours.


#  RESULTS VISUALIZATION:


In [None]:
# Overlay the daily close and the daily average sentiment on one axis.
# Sentiment is multiplied by 10000 so it is visible next to the price.
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(merged_df['Date'], merged_df['Close'], label='Bitcoin Price')
ax.plot(merged_df['Date'], merged_df['Avg_Sentiment']*10000, label='Avg Sentiment (scaled)')
ax.legend()
ax.set_title("Bitcoin Price vs Twitter Sentiment")
plt.show()


RANDOM FOREST 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 200-tree random forest on the current (unscaled) training split and
# score its hard predictions on the test split.
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_pred)
print("Random Forest Accuracy:", rf_accuracy)


 CONFUSION MATRIX OR CLASSIFICATION REPORT

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Confusion matrix of the hard class predictions in `y_pred`.
# Assumes the evaluation cell above was the last one to set y_pred, lr_model
# and X_test_scaled, i.e. these are the logistic-regression results.
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm).plot()
plt.show()

# ROC-AUC measures how well the model ranks samples, so it needs continuous
# scores. Passing 0/1 predictions reduces the curve to a single threshold and
# under-reports the model; use the predicted probability of class 1 instead.
y_score = lr_model.predict_proba(X_test_scaled)[:, 1]
auc = roc_auc_score(y_test, y_score)
print("ROC-AUC:", auc)


# INITIAL CONCLUSION BEFORE TUNING:

1.The project demonstrates that Twitter sentiment has some correlation with Bitcoin price movements, but predicting exact direction is challenging.

2. The models achieved an accuracy of about 51%, which is only marginally above random guessing.

Positive sentiment tends to be slightly more indicative of upward price movement, as reflected in recall scores.

3.Feature engineering (rolling sentiment averages, price momentum) can further improve model performance.

4.Limitations include noisy social media data, low signal-to-noise ratio, and highly volatile cryptocurrency market.

Future improvements:

Fine-tuning BERT on cryptocurrency tweets

Using ensemble or deep learning models (LSTM/GRU) for sequential prediction

Incorporating additional indicators like trading volume, Google Trends, or Reddit sentiment

Takeaway:
This project showcases the integration of social media sentiment analysis with financial data 
for predictive modeling, highlighting the challenges and opportunities of machine learning in volatile markets.

# TUNING METHODS TO IMPROVE ACCURACY: 

In [None]:
# Percentage change of Close over the previous 1, 2 and 3 rows; the leading
# rows that have no history are set to 0.
for lag in (1, 2, 3):
    merged_df[f'Return_{lag}'] = merged_df['Close'].pct_change(lag).fillna(0)

# Average sentiment from 1 and 2 rows earlier; missing leading values set to 0.
for lag in (1, 2):
    merged_df[f'Sentiment_Lag{lag}'] = merged_df['Avg_Sentiment'].shift(lag).fillna(0)


In [None]:
# Rolling mean of Close over 3- and 5-row windows; rows before the window is
# full are set to 0.
for window in (3, 5):
    merged_df[f'MA{window}'] = merged_df['Close'].rolling(window).mean().fillna(0)

# Rolling standard deviation of Close over the same windows, same 0 fill.
for window in (3, 5):
    merged_df[f'Volatility{window}'] = merged_df['Close'].rolling(window).std().fillna(0)


In [None]:
# Row-over-row percentage change in Volume; the first row (no previous value)
# becomes 0.
volume_pct_change = merged_df['Volume'].pct_change()
merged_df['Vol_Change'] = volume_pct_change.fillna(0)


In [None]:

#  Features & Target

features = [
    'Close', 'Volume', 'Avg_Sentiment',
    'Return_1', 'Return_2', 'Return_3',
    'MA3', 'MA5', 'Volatility3', 'Volatility5',
    'Sentiment_Lag1', 'Sentiment_Lag2', 'Vol_Change',
]

# Build X as a new frame (merged_df itself is not modified): +/-inf values are
# turned into NaN first, then every NaN is replaced with 0.
X = (
    merged_df[features]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)
y = merged_df['Target']


Performs feature engineering and trains an XGBoost classifier:

In [None]:

#  Feature Engineering for XGBoost

import numpy as np
import pandas as pd

# Lagged returns: percentage change of Close over the previous 1-3 rows
for lag in (1, 2, 3):
    merged_df[f'Return_{lag}'] = merged_df['Close'].pct_change(lag).fillna(0)

# Lagged sentiment: average sentiment from 1 and 2 rows earlier
for lag in (1, 2):
    merged_df[f'Sentiment_Lag{lag}'] = merged_df['Avg_Sentiment'].shift(lag).fillna(0)

# Moving averages and volatility (rolling std) of Close over 3- and 5-row windows
for window in (3, 5):
    merged_df[f'MA{window}'] = merged_df['Close'].rolling(window).mean().fillna(0)
    merged_df[f'Volatility{window}'] = merged_df['Close'].rolling(window).std().fillna(0)

# Volume change
merged_df['Vol_Change'] = merged_df['Volume'].pct_change().fillna(0)


#  Features & Target

features = ['Close','Volume','Avg_Sentiment',
            'Return_1','Return_2','Return_3',
            'MA3','MA5','Volatility3','Volatility5',
            'Sentiment_Lag1','Sentiment_Lag2','Vol_Change']

# Replace inf/-inf with NaN, then fill every NaN with 0
X = merged_df[features].replace([np.inf, -np.inf], np.nan).fillna(0)
y = merged_df['Target']


#  Train-Test Split (chronological: first 80% of rows train, last 20% test)

train_size = int(len(X) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]


#  Scaling (statistics fitted on the training rows only)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


#  XGBoost Classifier

from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
import matplotlib.pyplot as plt

xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train_scaled, y_train)


#  Predictions & Evaluation

y_pred = xgb_model.predict(X_test_scaled)
# Probability of class 1, used for ROC-AUC below
y_prob = xgb_model.predict_proba(X_test_scaled)[:, 1]

print("\n Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm).plot()
plt.show()

# ROC-AUC must be computed from continuous scores; feeding it the 0/1
# predictions collapses the curve to one threshold and misstates the metric.
auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", auc)


# Feature Importance

importances = pd.DataFrame({
    'Feature': features,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop Features:\n", importances)


In [None]:

#  Prepare Rolling Features for XGBoost

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
import matplotlib.pyplot as plt

# Requires merged_df from the earlier cells, with at least the columns
# ['Date', 'Close', 'Volume', 'Avg_Sentiment', 'Next_Close', 'Target'].

# Lagged returns and lagged sentiment for the previous 1-5 rows
for lag in range(1, 6):
    merged_df[f'Return_{lag}'] = merged_df['Close'].pct_change(lag).fillna(0)
for lag in range(1, 6):
    merged_df[f'Sentiment_Lag{lag}'] = merged_df['Avg_Sentiment'].shift(lag).fillna(0)

# Moving averages and volatility (rolling std) of Close over 3- and 5-row windows
for window in (3, 5):
    merged_df[f'MA{window}'] = merged_df['Close'].rolling(window).mean().fillna(0)
    merged_df[f'Volatility{window}'] = merged_df['Close'].rolling(window).std().fillna(0)

#  Volume change
merged_df['Vol_Change'] = merged_df['Volume'].pct_change().fillna(0)


#  Features & Target
#  The column order here is the order xgb_model is trained on; anything that
#  later feeds this model must present its columns in the same order.

features = ['Close','Volume','Avg_Sentiment',
            'Return_1','Return_2','Return_3','Return_4','Return_5',
            'MA3','MA5','Volatility3','Volatility5',
            'Sentiment_Lag1','Sentiment_Lag2','Sentiment_Lag3','Sentiment_Lag4','Sentiment_Lag5',
            'Vol_Change']

# Replace inf/-inf with NaN, then fill every NaN with 0
X = merged_df[features].replace([np.inf, -np.inf], np.nan).fillna(0)
y = merged_df['Target']


#  Train-Test Split (chronological: first 80% of rows train, last 20% test)

train_size = int(len(X) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]


#  Feature Scaling (statistics fitted on the training rows only)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


#  Train XGBoost Classifier

xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb_model.fit(X_train_scaled, y_train)


#  Predictions & Evaluation

y_pred = xgb_model.predict(X_test_scaled)
# Probability of class 1, used for ROC-AUC below
y_prob = xgb_model.predict_proba(X_test_scaled)[:, 1]

print("\n Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm).plot()
plt.show()

# ROC-AUC must be computed from continuous scores; feeding it the 0/1
# predictions collapses the curve to one threshold and misstates the metric.
auc = roc_auc_score(y_test, y_prob)
print("ROC-AUC:", auc)


#  Feature Importance

importances = pd.DataFrame({
    'Feature': features,
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop Features:\n", importances)


**This code creates lagged price and sentiment features, scales them, and trains multiple ML models (Logistic Regression, SVM, Naive Bayes) to predict next-day Bitcoin price movement using BERT sentiment, then evaluates each model with classification reports, confusion matrices, and ROC-AUC scores.**

In [None]:

#  Prepare Rolling Features for BERT + ML Models

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
import matplotlib.pyplot as plt

# Requires merged_df from the earlier cells, with at least the columns
# ['Date', 'Close', 'Volume', 'Avg_Sentiment', 'Next_Close', 'Target'].

return_cols = [f'Return_{i}' for i in range(1, 6)]
sentiment_lag_cols = [f'Sentiment_Lag{i}' for i in range(1, 6)]

# Lagged returns: percentage change of Close over the previous 1-5 rows
for i, col in enumerate(return_cols, start=1):
    merged_df[col] = merged_df['Close'].pct_change(i).fillna(0)

# Lagged sentiment: average sentiment from 1-5 rows earlier
for i, col in enumerate(sentiment_lag_cols, start=1):
    merged_df[col] = merged_df['Avg_Sentiment'].shift(i).fillna(0)

# Moving averages of Close
for window in (3, 5):
    merged_df[f'MA{window}'] = merged_df['Close'].rolling(window).mean().fillna(0)

# Volatility (rolling standard deviation of Close)
for window in (3, 5):
    merged_df[f'Volatility{window}'] = merged_df['Close'].rolling(window).std().fillna(0)

# Volume change
merged_df['Vol_Change'] = merged_df['Volume'].pct_change().fillna(0)


#  Features & Target (Avg_Sentiment comes from the BERT-derived tweet scores)

features = [
    'Close', 'Volume', 'Avg_Sentiment',
    *return_cols,
    *sentiment_lag_cols,
    'MA3', 'MA5', 'Volatility3', 'Volatility5', 'Vol_Change',
]

# inf/-inf become NaN, then every NaN becomes 0
X = merged_df[features].replace([np.inf, -np.inf], np.nan).fillna(0)
y = merged_df['Target']


#  Train-Test Split (chronological: first 80% of rows train, last 20% test)

train_size = int(len(X) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]


#  Feature Scaling (fitted on training rows only)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


#  Train & Evaluate Models

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'SVM (RBF Kernel)': SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42),
    'Naive Bayes': GaussianNB()
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Use class-1 probabilities for ROC-AUC when the model provides them,
    # otherwise fall back to the hard predictions.
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test_scaled)[:,1]
    else:
        y_prob = y_pred

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, zero_division=0))

    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(cm).plot()
    plt.title(f"Confusion Matrix: {name}")
    plt.show()

    auc = roc_auc_score(y_test, y_prob)
    print("ROC-AUC:", auc)


In [None]:
import joblib
from sklearn.preprocessing import StandardScaler

# xgb_model was fitted on a plain (scaled) array whose columns were in this
# order (see the 5-lag XGBoost training cell). The model has no column names to
# check, so the features and scaler saved with it must use the same order.
# The `X` and `scaler` left behind by the Logistic Regression / SVM / Naive
# Bayes cell use a DIFFERENT order (all sentiment lags before the moving
# averages), which would silently feed the model permuted columns.
xgb_features = ['Close', 'Volume', 'Avg_Sentiment',
                'Return_1', 'Return_2', 'Return_3', 'Return_4', 'Return_5',
                'MA3', 'MA5', 'Volatility3', 'Volatility5',
                'Sentiment_Lag1', 'Sentiment_Lag2', 'Sentiment_Lag3',
                'Sentiment_Lag4', 'Sentiment_Lag5',
                'Vol_Change']

# Fail loudly if the model in memory was trained on a different feature count
# (e.g. the 13-feature variant), rather than saving a mismatched bundle.
expected_n_features = getattr(xgb_model, "n_features_in_", len(xgb_features))
if expected_n_features != len(xgb_features):
    raise ValueError(
        f"xgb_model expects {expected_n_features} features, "
        f"but {len(xgb_features)} are about to be saved"
    )

# Reorder the feature matrix to the model's order and refit the scaler on the
# same chronological training slice (first 80% of rows) the model was fitted on.
X = X[xgb_features]
train_size = int(len(X) * 0.8)
scaler = StandardScaler().fit(X.iloc[:train_size])

# Save preprocessed features
X.to_csv("/kaggle/working/X_features.csv", index=False)

# Save scaler and trained model
joblib.dump(scaler, "/kaggle/working/scaler.pkl")
joblib.dump(xgb_model, "/kaggle/working/xgb_model.pkl")


# PREDICTING THE OUTPUT:

In [None]:
import pandas as pd
import numpy as np
import joblib

# Reload the saved feature matrix, scaler and XGBoost model from /kaggle/working.
# joblib.load unpickles the files, so only load artifacts this notebook wrote.
X = pd.read_csv("/kaggle/working/X_features.csv")
scaler, xgb_model = (
    joblib.load(artifact_path)
    for artifact_path in ("/kaggle/working/scaler.pkl", "/kaggle/working/xgb_model.pkl")
)

# Last row of the feature matrix, kept as a one-row DataFrame
latest_features = X.iloc[[-1]]

# Scale it, then take the predicted class and the probability of class 1
latest_features_scaled = scaler.transform(latest_features)
next_day_pred = xgb_model.predict(latest_features_scaled)[0]
prob_increase = xgb_model.predict_proba(latest_features_scaled)[0][1]

# Report the predicted direction with the probability of the predicted class
message = (
    f" Prediction: Bitcoin price will increase tomorrow (Confidence: {prob_increase*100:.2f}%)"
    if next_day_pred == 1
    else f" Prediction: Bitcoin price will decrease tomorrow (Confidence: {100 - prob_increase*100:.2f}%)"
)
print(message)


In [None]:
import pandas as pd
import numpy as np
import joblib

# Load the saved features, scaler and model. Loading X here (instead of
# reusing whatever `X` an earlier cell left in memory) keeps this cell
# self-contained and guarantees it scores the same matrix that was saved.
# joblib.load unpickles the files, so only load artifacts this notebook wrote.
X = pd.read_csv("/kaggle/working/X_features.csv")
scaler = joblib.load("/kaggle/working/scaler.pkl")
xgb_model = joblib.load("/kaggle/working/xgb_model.pkl")

# Scale all features
X_scaled = scaler.transform(X)

# Predicted class and probability of class 1 ("Increase") for every row.
# NOTE: this includes the rows the model was trained on, so the output is not
# an out-of-sample evaluation.
predictions = xgb_model.predict(X_scaled)
probabilities = xgb_model.predict_proba(X_scaled)[:, 1]

# Add predictions to a copy of the feature matrix
output_df = X.copy()
output_df['Predicted_Movement'] = predictions
# Confidence is the probability of the class that was actually predicted,
# matching the single-row prediction cell: P(increase) for "Increase" rows and
# 1 - P(increase) for "Decrease" rows.
output_df['Prediction_Confidence'] = np.where(
    predictions == 1, probabilities, 1 - probabilities
)
output_df['Prediction_Label'] = output_df['Predicted_Movement'].map({1: 'Increase', 0: 'Decrease'})
# Raw probability of an increase, kept as an extra column for reference
output_df['Prob_Increase'] = probabilities

# Save to CSV
output_df.to_csv("/kaggle/working/bitcoin_predictions.csv", index=False)
print(" Predictions saved successfully as 'bitcoin_predictions.csv'")


 # FINAL CONCLUSION:


1.The project demonstrates that Twitter sentiment has some correlation with Bitcoin price movements, but predicting exact direction is challenging.

2. After the tuning methods were added, the models achieved an accuracy of about 54%, showing slight predictive power above random guessing.

Positive sentiment tends to be slightly more indicative of upward price movement, as reflected in recall scores.

3. Feature engineering (rolling sentiment averages, price momentum) was added, which further improved our model performance.

4.Limitations include noisy social media data, low signal-to-noise ratio, and highly volatile cryptocurrency market.

Future improvements:

Fine-tuning BERT on cryptocurrency tweets

Using ensemble or deep learning models (LSTM/GRU) for sequential prediction

Incorporating additional indicators like trading volume, Google Trends, or Reddit sentiment

Takeaway: This project showcases the integration of social media sentiment analysis with financial data for predictive modeling, highlighting the challenges and opportunities of machine learning in volatile markets.

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): every figure below is typed in by hand; nothing in this cell
# reads the fitted models or their reports, so confirm the numbers against the
# evaluation output before relying on the chart.
models = ['Random Forest', 'XGBoost', 'Logistic Regression', 'SVM', 'Naive Bayes']
accuracies = [71, 54, 51, 49, 48]    # overall accuracy, percent
pos_accuracy = [73, 56, 52, 50, 49]  # positive-class figure, percent
neg_accuracy = [69, 52, 50, 48, 47]  # negative-class figure, percent

plt.figure(figsize=(12, 6))

# Bar width and the x position of each model's group
width = 0.35
x = np.arange(len(models))


def add_value_labels(x, values):
    """Write each value as 'NN%' just above the bar located at the matching x."""
    for bar_x, bar_value in zip(x, values):
        plt.text(bar_x, bar_value + 0.5, f"{bar_value}%", ha='center', va='bottom')


# (x offset, values, legend label, colour) for the two bars in each group
bar_series = (
    (-width/2, pos_accuracy, 'Positive Class', 'forestgreen'),
    (width/2, neg_accuracy, 'Negative Class', 'crimson'),
)

for offset, values, series_label, colour in bar_series:
    plt.bar(x + offset, values, width, label=series_label, color=colour, alpha=0.8)

plt.ylabel('Accuracy (%)')
plt.title('Model Performance Comparison')
plt.xticks(x, models, rotation=45)
plt.legend()

for offset, values, _, _ in bar_series:
    add_value_labels(x + offset, values)

# Prevent the rotated tick labels from being cut off
plt.tight_layout()

plt.show()

# Text summary of the same figures
print("\nModel Performance Summary:")
for idx, model in enumerate(models):
    print(f"\n{model}:")
    print(f"- Overall Accuracy: {accuracies[idx]}%")
    print(f"- Positive Class Accuracy: {pos_accuracy[idx]}%")
    print(f"- Negative Class Accuracy: {neg_accuracy[idx]}%")