In [79]:
# General Imports
import pprint 
import re
import zipfile 
import gzip
import shutil
from tqdm.notebook import tqdm
from emoji import demojize
from typing import Union, List

#BERT
from transformers import pipeline

# Load the sentiment analysis pipeline
## Basic
# sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased") 
## Reddit
# sentiment_pipeline = pipeline("text-classification", model="minh21/XLNet-Reddit-Sentiment-Analysis") 
## Financial Sentiment
sentiment_pipeline = pipeline("text-classification", model="Sigma/financial-sentiment-analysis")

# Data Analysis and visualizations
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Import GenSim
import gensim
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases, Phraser

# Import Spacy
import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

# Import NLTK
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
nltk.download('punkt', quiet=True)
nltk.download('sentiwordnet')
nltk.download('wordnet')

2024-03-19 23:28:49.421954: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 23:28:49.834838: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-19 23:28:49.835039: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-19 23:28:49.917693: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-19 23:28:50.089731: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-19 23:28:50.092041: I tensorflow/core/platform/cpu_feature_guard.cc:1

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[nltk_data] Downloading package sentiwordnet to /home/jon/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [47]:
# verify working directory of the notebook 
import os 
print(os.getcwd())
import sys
path = os.path.abspath(os.path.join('../../')) # or the path to your source code
sys.path.insert(0, path)
print(path)

/home/jon/Documents/HEC/HEC/Session 4 Winter 2024/Machine Learning I/Project/WallstreetbetsGenNLP/tests/jonathan
/home/jon/Documents/HEC/HEC/Session 4 Winter 2024/Machine Learning I/Project/WallstreetbetsGenNLP


In [40]:
# Extract the zip files

# Specify the path to the zip file
zip_file_path = path + '/data_raw/reddit_wsb.csv.zip'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('data_raw')

# Read the CSV using Pandas
csv_file_path = 'data_raw/reddit_wsb.csv'
df = pd.read_csv(csv_file_path)

In [104]:
# Defining Functions 
# Estimate sentiment of a single word using SentiWordNet
def word_sentiment(word):
    synsets = list(swn.senti_synsets(word))
    if not synsets:
        return 0
    # Take the first sense, the most common
    return synsets[0].pos_score() - synsets[0].neg_score()

# Compute the sentiment score 
def bert_sentiment_analysis(text):
    # Perform sentiment analysis
    result = sentiment_pipeline(text)
    # The result includes the label ('POSITIVE' or 'NEGATIVE') and a confidence score
    return result

# # Compute the sentiment score (paralelized)
# def batch_sentiment_analysis(texts, batch_size=8):
#     results = []
#     for i in range(0, len(texts), batch_size):
#         batch = texts[i:i+batch_size]
#         results.extend(sentiment_pipeline(batch))
#     return results


In [57]:
# Preprocessing functions

def preprocess_text(texts: Union[str, List[str], pd.Series], clean_emojis: bool = False) -> Union[str, List[str]]:
    cleaned_texts = []

    # Processing texts using Spacy pipeline
    for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts), desc="Cleaning Texts"):

        # Handle emojis: translate to text if not removing, else remove
        if clean_emojis:
            doc = re.sub(r':[^:]+:', '', demojize(doc.text))  # Remove emojis
        else:
            doc = demojize(doc.text)  # Convert emojis to text

        # Tokenization and preprocessing
        tokens = [token.text.lower() for token in nlp(doc) if token.text.isalpha()]

        # Removing stopwords and short tokens
        tokens = [token for token in tokens if token not in stop_words and len(token) > 1]

        cleaned_texts.append(' '.join(tokens))  # Rejoin tokens into a string

    return cleaned_texts


In [42]:
# Fill all the NaN values in the body column with an empty string
df['body'] = df['body'].fillna('')

# Combine the title and bodyy into a single column text, separated by two newlines
df['text'] = df['title'] + '\n\n' + df['body']

# drop the body column 
df = df.drop(columns=['body'])

# Preview the loaded data 
display(df.head(10))


Unnamed: 0,title,score,id,url,comms_num,created,timestamp,text
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,2021-01-28 21:37:41,"It's not about the money, it's about sending a..."
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,2021-01-28 21:32:10,Math Professor Scott Steiner says the numbers ...
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,2021-01-28 21:30:35,Exit the system\n\nThe CEO of NASDAQ pushed to...
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,2021-01-28 21:28:57,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,2021-01-28 21:26:56,"Not to distract from GME, just thought our AMC..."
5,WE BREAKING THROUGH,405,l6uf7d,https://i.redd.it/2wef8tc062e61.png,84,1611862000.0,2021-01-28 21:26:30,WE BREAKING THROUGH\n\n
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,2021-01-28 21:26:27,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE\n\...
7,THIS IS THE MOMENT,405,l6ub9l,https://www.reddit.com/r/wallstreetbets/commen...,178,1611862000.0,2021-01-28 21:19:31,THIS IS THE MOMENT\n\nLife isn't fair. My moth...
8,Currently Holding AMC and NOK - Is it retarded...,200,l6ub4i,https://i.redd.it/6k2z7ouo42e61.png,161,1611862000.0,2021-01-28 21:19:16,Currently Holding AMC and NOK - Is it retarded...
9,I have nothing to say but BRUH I am speechless...,291,l6uas9,https://i.redd.it/bfzzw2yo42e61.jpg,27,1611862000.0,2021-01-28 21:18:37,I have nothing to say but BRUH I am speechless...


In [43]:
texts = pd.DataFrame(df['title'])

In [67]:
# Preprocess each title and track progress with tqdm
texts['processed_text'] = preprocess_text(texts['title'], clean_emojis=True)


Cleaning Texts:   0%|          | 0/53187 [00:00<?, ?it/s]

In [68]:
display(texts.head(10))

Unnamed: 0,title,preprocessed,processed_text
0,"It's not about the money, it's about sending a...",money sending message,money sending message
1,Math Professor Scott Steiner says the numbers ...,math professor scott steiner says numbers spel...,math professor scott steiner says numbers spel...
2,Exit the system,exit system,exit system
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,new sec filing gme someone less retarded pleas...,new sec filing gme someone less retarded pleas...
4,"Not to distract from GME, just thought our AMC...",distract gme thought amc brothers aware,distract gme thought amc brothers aware
5,WE BREAKING THROUGH,breaking,breaking
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,short stock expiration date,short stock expiration date
7,THIS IS THE MOMENT,moment,moment
8,Currently Holding AMC and NOK - Is it retarded...,currently holding amc nok retarded think move ...,currently holding amc nok retarded think move ...
9,I have nothing to say but BRUH I am speechless...,nothing say bruh speechless moon,nothing say bruh speechless moon


In [103]:
texts['sentiment'] = [bert_sentiment_analysis(text) for text in tqdm(texts['processed_text'], desc="BERT Sentiment Analysis")]


# Have to paralelize 
# Problems with torch

BERT Sentiment Analysis:   0%|          | 0/53187 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [105]:
import tensorflow as tf

# Check TensorFlow version
print("TensorFlow Version:", tf.__version__)

# List available GPUs in TensorFlow
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            print("GPU:", gpu)
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPUs available: ", len(gpus))
    except RuntimeError as e:
        print(e)
else:
    print("No GPUs found.")

# Check if TensorFlow was built with CUDA
print("Built with CUDA:", tf.test.is_built_with_cuda())

# Check if a GPU is available and if TensorFlow can access it
print("GPU available (TensorFlow):", tf.test.is_gpu_available())


TensorFlow Version: 2.15.0
No GPUs found.
Built with CUDA: True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available (TensorFlow): False


2024-03-20 00:07:20.949058: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-20 00:07:21.005671: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-03-20 00:07:21.014617: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/li