### Importing relevant libraries

In [9]:
# NOTE(review): this is a shell command, not Python. The recorded output shows
# sudo stopping at the password prompt, so nothing was installed by this cell.
sudo apt install mallet


Password:
sudo: a password is required


In [5]:
pip install gensim pandas matplotlib nltk pyLDAvis

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.2 kB)
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading gensim-4.3.3-cp310-cp310-macosx_11_0_arm64.whl (24.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.0/24.0 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading smart_open-7.1.0-py3-none-any.whl (61 kB)
Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, smart-open, gensim, pyLDAvis
Successfully installed funcy-2.0 gensim-4.3.3 pyLDAvis-

In [6]:
wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
unzip mallet-2.0.8.zip

SyntaxError: invalid syntax (3488535892.py, line 1)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize

import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import STOPWORDS

In [None]:
import spacy

In [3]:
# Fetch the VADER lexicon; the SentimentIntensityAnalyzer used in the
# sentiment section needs it.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/goutham/nltk_data...


True

In [4]:
# Fetch the Punkt sentence tokenizer data used by sent_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/goutham/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Reading Data

In [5]:
# Load the dataset from a local parquet file. Later cells read its "text"
# column (the documents) and its "label" column (the target).
df = pd.read_parquet('btc_data.parquet')

In [8]:
# Work on the first 20,000 rows only. The explicit copy makes `df` an
# independent frame, so the columns added to it in later cells are not writes
# into a slice of the originally loaded data (SettingWithCopyWarning).
df = df[:20000].copy()

In [9]:
# Class balance of the target column.
df["label"].value_counts()

 1    12372
-1     7628
Name: label, dtype: int64

### Trying n-grams model using TF-IDF

In [10]:
# TF-IDF vectoriser whose features are word bigrams and trigrams
# (ngram_range=(2, 3)); single words are not used as features.
vectoriser_n_grams = TfidfVectorizer(ngram_range=(2,3))

In [11]:
# Target vector, then the TF-IDF n-gram feature matrix fitted on the raw text.
y = df["label"]
X = vectoriser_n_grams.fit_transform(df["text"])

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Fitting it to a Lasso Model

In [24]:
# L1-penalised linear regression on the -1/+1 labels; the penalty drives most
# n-gram weights to exactly zero.
lasso = Lasso(alpha=0.001)
lasso.fit(X=X_train, y=y_train)

In [25]:
# Continuous regression outputs for the held-out rows; they are turned into
# class labels in the next step.
y_pred = lasso.predict(X_test)

In [26]:
# Map each regression output to a class label: strictly positive -> 1,
# everything else -> -1.
y_pred_class = np.where(y_pred > 0, 1, -1).tolist()

In [27]:
# Share of held-out rows whose thresholded prediction matches the true label.
print(accuracy_score(y_test,y_pred_class))

0.6175


### Finding Coefficients for Lasso Model

In [28]:
# Pair every n-gram that kept a non-zero weight with that weight, then rank
# the pairs by absolute weight, largest first.
ngrams = vectoriser_n_grams.get_feature_names_out()
coef = lasso.coef_

selected_ngrams = [(ngrams[idx], coef[idx]) for idx in np.flatnonzero(coef)]
selected_ngrams.sort(key=lambda pair: abs(pair[1]), reverse=True)

print("Top N-grams selected by Lasso:")
for ng, score in selected_ngrams[:10]:
    print(f"{ng}: {score:.4f}")

Top N-grams selected by Lasso:
bitcoinmin elonmusk: 0.9246
15 billion: 0.5731
btc bitcoinmin elonmusk: 0.0000


### Trying different alphas using Cross-Validation to find ideal regularisation parameter

In [None]:
from sklearn.linear_model import LassoCV

# Candidate regularisation strengths to compare.
alphas = [0.0001, 0.001, 0.01, 0.1]

# 5-fold cross-validation over the candidates; fit() returns the fitted
# estimator, so construction and fitting are chained.
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=42).fit(X_train, y_train)

# Regularisation strength chosen by cross-validation.
best_alpha = lasso_cv.alpha_
print(f"Best alpha selected: {best_alpha}")

In [None]:
# The fitted estimator is not callable; predictions come from .predict().
y_new_pred = lasso_cv.predict(X_test)
# Map each regression output to a class label: strictly positive -> 1,
# everything else -> -1.
y_new_pred_class = [1 if pred > 0 else -1 for pred in y_new_pred]

In [None]:
# Share of held-out rows whose thresholded cross-validated prediction matches
# the true label.
print(accuracy_score(y_test,y_new_pred_class))

### Applying a sentiment model

In [None]:
sia = SentimentIntensityAnalyzer()

# Only the first 1000 rows are scored. Assignments below align on the index,
# so rows outside the sample hold NaN in the per-row sentiment columns.
text_sample = df["text"][:1000]  # Selecting first 1000 rows

# Per-sentence compound scores (one list per document) and one compound score
# for the document as a whole.
sentence_sentiments = text_sample.apply(
    lambda text: [sia.polarity_scores(sent)['compound'] for sent in sent_tokenize(str(text))]
)
document_sentiment = text_sample.apply(lambda text: sia.polarity_scores(str(text))['compound'])

df["sentence_sentiments"] = sentence_sentiments
df["document_sentiment"] = document_sentiment

# Mean document score over the scored rows, broadcast to every row.
df["average_sentiment"] = df["document_sentiment"].mean()

# Average the per-sentence scores from the sample's own lists. Applying this to
# df["sentence_sentiments"] would hit the NaN placeholders of unscored rows and
# raise TypeError, because sum() cannot iterate over a float.
df["split_sentiment_average"] = sentence_sentiments.apply(
    lambda scores: sum(scores) / len(scores) if scores else 0
)

# Show a preview instead of printing the whole frame.
df.head()

### Fitting a lasso model on the sentiments

In [None]:
# Sentiment scores exist only for the rows that were actually scored; all other
# rows hold NaN, which the estimator cannot be fitted on. Keep only the rows
# where every sentiment feature is present, and take the matching labels.
sentiment_features = ["document_sentiment", "split_sentiment_average"]
scored_rows = df[sentiment_features].notna().all(axis=1)

X = df.loc[scored_rows, sentiment_features]
y = df.loc[scored_rows, "label"]

In [None]:
# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cross-validated Lasso over a small grid of regularisation strengths.
lasso = LassoCV(alphas=[0.001, 0.01, 0.1, 1, 10], cv=5, random_state=42)
lasso.fit(X_train, y_train)

y_pred = lasso.predict(X_test)
# The labels are -1 / +1, so the decision boundary is 0 and the predicted
# classes must use the same two values. Thresholding at 0.5 and emitting 0/1
# would produce a class (0) that never occurs in y_test.
y_pred_class = [1 if pred > 0 else -1 for pred in y_pred]

accuracy = accuracy_score(y_test, y_pred_class)
print(accuracy)

### We can also try to analyse the topic that a tweet is about

In [None]:
# Stop-word set consulted by preprocess(); gensim's built-in English list is
# used so no additional download is required.
stop_words = STOPWORDS


def preprocess(text):
    """Tokenise `text` and drop stop words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens produced by gensim's simple_preprocess (accent marks stripped
        because deacc=True) that are not members of `stop_words`.
    """
    tokens = gensim.utils.simple_preprocess(text, deacc=True)  # tokenise and strip accent marks
    return [word for word in tokens if word not in stop_words]

In [None]:
# Token list per document, the token <-> id dictionary built from those lists,
# and the bag-of-words encoding of every document.
df["processed_text"] = df["text"].astype(str).map(preprocess)
dictionary = corpora.Dictionary(df["processed_text"])
corpus = list(map(dictionary.doc2bow, df["processed_text"]))

In [None]:
# Train gensim's own LDA implementation on the bag-of-words corpus. LdaModel
# takes the corpus as its first argument, so no external MALLET path is passed
# (the previous positional argument collided with corpus= and was undefined).
# The seed makes the discovered topics reproducible between runs.
lda_model = LdaModel(corpus=corpus, num_topics=5, id2word=dictionary, random_state=42)

# Get topics as (topic id, [(word, weight), ...]) pairs
topics = lda_model.show_topics(num_topics=5, formatted=False)

# Print the words of each detected topic
for topic_num, topic_words in topics:
    print(f"Topic {topic_num}: {[word[0] for word in topic_words]}")

## The code above used the bag-of-words (BOW) approach. Next, we represent the text using structural approaches such as word embeddings

In [None]:
word2vec_model = Word2Vec(
    sentences=df["processed_text"], 
    vector_size=100,  
    window=5,  
    min_count=2,  
    sg=1,  
    workers=4,  
    epochs=10  
)

word2vec_model.save("word2vec_model.bin")

vector = word2vec_model.wv["bitcoin"]
print("Bitcoin Embedding:", vector[:10