In [1]:
import torch
print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("GPU:", torch.cuda.get_device_name(0))

PyTorch Version: 2.5.1+cu121
CUDA Available: True
CUDA Version: 12.1
GPU: NVIDIA GeForce RTX 5090


NVIDIA GeForce RTX 5090 with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90.
If you want to use the NVIDIA GeForce RTX 5090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('Bitcoin_tweets.csv',usecols=["date", "text"],
    low_memory=False,
    lineterminator="\n")

In [4]:
df.shape

(4689354, 2)

In [5]:
df['date'] = pd.to_datetime(df['date'],errors='coerce')

In [6]:
df['minute'] = df['date'].dt.floor('T') 

  df['minute'] = df['date'].dt.floor('T')


In [7]:
aggregated_df = df.groupby('minute')['text'].apply(lambda x: ' | '.join(x)).reset_index()

In [8]:
df = aggregated_df

In [9]:
df.head()

Unnamed: 0,minute,text
0,2021-02-05 10:52:00,#reddcoin #rdd @reddcoin to the moon #altcoin ...
1,2021-02-05 10:53:00,#Bitcoin and #ETH both have bullish setups for...
2,2021-02-05 10:54:00,$PERL 0.06.\nI have insisted that since 0.02 i...
3,2021-02-05 10:57:00,#Amazing 😍\n#Monopoly #Crypto #cryptocurrency ...
4,2021-02-05 10:58:00,"#Bitcoin braces for $48,000 as inverse head-an..."


In [10]:
df.tail()

Unnamed: 0,minute,text
233882,2023-01-09 23:55:00,#Bitcoin￼ is either the most significant inven...
233883,2023-01-09 23:56:00,Top 10 coins by 1-week #GalaxyScore™ via @Luna...
233884,2023-01-09 23:57:00,1₿ = $17198.76 0.73%📈\n\nDetails:\nChange: 📈12...
233885,2023-01-09 23:58:00,$BTC will come back to life in 2023!\n\nDo you...
233886,2023-01-09 23:59:00,BECAUSE IT'S A LIFE\nCHANGING\nOPPORTUNITY FOR...


In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax

In [25]:
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"  # 3-class sentiment model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [26]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [27]:
def get_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length',max_length=512).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        scores = softmax(outputs.logits, dim=1).tolist()[0]
    
    # Map highest probability to sentiment labels
    sentiment_mapping = {0: -1, 1: 0, 2: 1}  # Negative = -1, Neutral = 0, Positive = 1
    predicted_class = scores.index(max(scores))  # Get index of highest probability
    sentiment_score = sentiment_mapping[predicted_class]  # Map to -1, 0, 1
    
    return sentiment_score, scores


In [28]:
df[['sentiment', 'scores']] = df['text'].apply(lambda x: pd.Series(get_sentiment(x)))

RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Assume you have a DataFrame `df` with columns 'date', 'text', and 'sentiment'
# where 'sentiment' is a numeric sentiment score (e.g., -1 to 1 or 0 to 10)

# Split the data into features and target
X = df['text']
y = df['sentiment']

# Option 1: Create a pipeline to chain TF-IDF and Lasso regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=1000)),
    ('lasso', Lasso())
])

# Optionally, use GridSearchCV to tune the regularization parameter alpha
param_grid = {
    'lasso__alpha': [0.001, 0.01, 0.1, 1, 10]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

print("Best alpha:", grid_search.best_params_['lasso__alpha'])
print("Best CV score (negative MSE):", grid_search.best_score_)

# Option 2: Manual train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the pipeline with a chosen alpha value
pipeline.set_params(lasso__alpha=grid_search.best_params_['lasso__alpha'])
pipeline.fit(X_train, y_train)

# Predict on test set and evaluate
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)

# To inspect the model's coefficients (feature importance):
# Get the fitted TfidfVectorizer and Lasso model
tfidf = pipeline.named_steps['tfidf']
lasso_model = pipeline.named_steps['lasso']
feature_names = tfidf.get_feature_names_out()
coefficients = lasso_model.coef_

# Create a DataFrame for easier interpretation
coef_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients
})

# Display features with non-zero coefficients, sorted by absolute value
important_features = coef_df[coef_df['coefficient'] != 0].copy()
important_features['abs_coef'] = important_features['coefficient'].abs()
important_features = important_features.sort_values(by='abs_coef', ascending=False)
print(important_features.head(10))


In [11]:
df_prices = pd.read_csv('data.csv')

In [13]:
df_prices.head(20)

Unnamed: 0,Timestamp,Date,Symbol,Open,High,Low,Close,Volume BTC,Volume USD
0,1676939580000,2023-02-21 00:33:00,BTC/USD,24859.34,24859.34,24859.34,24859.34,0.0,0.0
1,1676939520000,2023-02-21 00:32:00,BTC/USD,24821.96,24859.34,24821.96,24859.34,0.103099,2562.977818
2,1676939460000,2023-02-21 00:31:00,BTC/USD,24818.09,24821.96,24815.47,24821.96,0.09064,2249.866178
3,1676939400000,2023-02-21 00:30:00,BTC/USD,24812.25,24818.09,24812.25,24818.09,0.002203,54.68145
4,1676939340000,2023-02-21 00:29:00,BTC/USD,24809.27,24812.25,24809.27,24812.25,0.090675,2249.862431
5,1676939280000,2023-02-21 00:28:00,BTC/USD,24809.28,24809.28,24809.27,24809.27,0.003961,98.279938
6,1676939220000,2023-02-21 00:27:00,BTC/USD,24809.28,24809.28,24809.28,24809.28,0.0,0.0
7,1676939160000,2023-02-21 00:26:00,BTC/USD,24809.28,24809.28,24809.28,24809.28,0.0,0.0
8,1676939100000,2023-02-21 00:25:00,BTC/USD,24821.31,24821.31,24809.28,24809.28,0.001361,33.758732
9,1676939040000,2023-02-21 00:24:00,BTC/USD,24817.2,24821.31,24811.49,24821.31,0.212014,5262.474899


In [14]:
df_prices['Date'] = pd.to_datetime(df_prices['Date'])

In [15]:
df1_shifted = df.copy()

In [17]:
df1_shifted['minute'] = df1_shifted['minute'].shift(1)

In [23]:
df1_renamed = df1_shifted.rename(columns={'minute': 'Date'})

In [27]:
combined_df = pd.merge(df_prices, df1_renamed, on='Date', how='inner')

In [30]:
combined_df.shape

(229044, 10)

In [29]:
combined_df.to_csv('text_price_comb.csv')