In [1]:
# General Imports
import pprint 
import re
import zipfile 
import gzip
import shutil
from tqdm.notebook import tqdm
from emoji import demojize
from typing import Union, List

#BERT
from transformers import pipeline

# Load the sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased")

# Data Analysis and visualizations
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Import GenSim
import gensim
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases, Phraser

# Import Spacy
import spacy
from spacy.lang.en import English
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])

# Import NLTK
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
nltk.download('punkt', quiet=True)
nltk.download('sentiwordnet')
nltk.download('wordnet')




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\jairp\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jairp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# # verify working directory of the notebook 
# import os 
# print(os.getcwd())
# import sys
# path = os.path.abspath(os.path.join('../../')) # or the path to your source code
# sys.path.insert(0, path)
# print(path)

In [3]:
# Extract the zip files

# Specify the path to the zip file
zip_file_path = 'data_raw/reddit_wsb.csv.zip'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('data_raw')

# Read the CSV using Pandas
csv_file_path = 'data_raw/reddit_wsb.csv'
df = pd.read_csv(csv_file_path)

In [4]:
def word_sentiment(word):
    """
    Estimate the sentiment of a single word using SentiWordNet.

    Args:
        word (str): The word to estimate the sentiment for.

    Returns:
        float: The sentiment score of the word.
    """
    # Get the list of SentiWordNet synsets for the word
    synsets = list(swn.senti_synsets(word))
    
    if not synsets:
        return 0
    
    # Take the first sense, the most common
    return synsets[0].pos_score() - synsets[0].neg_score()

# Compute the sentiment score 
def bert_sentiment_analysis(text):
    """
    Perform sentiment analysis using BERT model.

    Args:
        text (str): The input text for sentiment analysis.

    Returns:
        dict: A dictionary containing the sentiment label ('POSITIVE' or 'NEGATIVE') and a confidence score.

    """
    # Perform sentiment analysis
    result = sentiment_pipeline(text)
    
    # The result includes the label ('POSITIVE' or 'NEGATIVE') and a confidence score
    return result

# # Compute the sentiment score (paralelized)
# def batch_sentiment_analysis(texts, batch_size=8):
#     results = []
#     for i in range(0, len(texts), batch_size):
#         batch = texts[i:i+batch_size]
#         results.extend(sentiment_pipeline(batch))
#     return results


In [5]:
# Preprocessing functions

def preprocess_text(texts: Union[str, List[str], pd.Series], clean_emojis: bool = False) -> Union[str, List[str]]:
    """
    Preprocesses a list of texts by cleaning them and removing stopwords.

    Args:
        texts (Union[str, List[str], pd.Series]): The input texts to preprocess.
        clean_emojis (bool, optional): Whether to clean emojis or convert them to text. Defaults to False.

    Returns:
        Union[str, List[str]]: The preprocessed texts.
    """
    cleaned_texts = []

    # Processing texts using Spacy pipeline
    for doc in tqdm(nlp.pipe(texts, batch_size=20), total=len(texts), desc="Cleaning Texts"):

        # Handle emojis: translate to text if not removing, else remove
        if clean_emojis:
            doc = re.sub(r':[^:]+:', '', demojize(doc.text))  # Remove emojis
        else:
            doc = demojize(doc.text)  # Convert emojis to text

        # Tokenization and preprocessing
        tokens = [token.text.lower() for token in nlp(doc) if token.text.isalpha()]

        # Removing stopwords and short tokens
        tokens = [token for token in tokens if token not in stop_words and len(token) > 1]

        cleaned_texts.append(' '.join(tokens))  # Rejoin tokens into a string

    return cleaned_texts


In [6]:
# Fill all the NaN values in the body column with an empty string
df['body'] = df['body'].fillna('')

# Combine the title and bodyy into a single column text, separated by two newlines
df['text'] = df['title'] + '\n\n' + df['body']

# drop the body column 
df = df.drop(columns=['body'])

# Preview the loaded data 
display(df.head(10))


Unnamed: 0,title,score,id,url,comms_num,created,timestamp,text
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,2021-01-28 21:37:41,"It's not about the money, it's about sending a..."
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,2021-01-28 21:32:10,Math Professor Scott Steiner says the numbers ...
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,2021-01-28 21:30:35,Exit the system\n\nThe CEO of NASDAQ pushed to...
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,2021-01-28 21:28:57,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,2021-01-28 21:26:56,"Not to distract from GME, just thought our AMC..."
5,WE BREAKING THROUGH,405,l6uf7d,https://i.redd.it/2wef8tc062e61.png,84,1611862000.0,2021-01-28 21:26:30,WE BREAKING THROUGH\n\n
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,317,l6uf6d,https://www.reddit.com/r/wallstreetbets/commen...,53,1611862000.0,2021-01-28 21:26:27,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE\n\...
7,THIS IS THE MOMENT,405,l6ub9l,https://www.reddit.com/r/wallstreetbets/commen...,178,1611862000.0,2021-01-28 21:19:31,THIS IS THE MOMENT\n\nLife isn't fair. My moth...
8,Currently Holding AMC and NOK - Is it retarded...,200,l6ub4i,https://i.redd.it/6k2z7ouo42e61.png,161,1611862000.0,2021-01-28 21:19:16,Currently Holding AMC and NOK - Is it retarded...
9,I have nothing to say but BRUH I am speechless...,291,l6uas9,https://i.redd.it/bfzzw2yo42e61.jpg,27,1611862000.0,2021-01-28 21:18:37,I have nothing to say but BRUH I am speechless...


In [7]:
texts = pd.DataFrame(df['title'])

In [8]:
# Preprocess each title and track progress with tqdm
texts['processed_text'] = preprocess_text(texts['title'], clean_emojis=True)


Cleaning Texts:   0%|          | 0/53187 [00:00<?, ?it/s]

In [9]:
display(texts.head(10))

Unnamed: 0,title,processed_text
0,"It's not about the money, it's about sending a...",money sending message
1,Math Professor Scott Steiner says the numbers ...,math professor scott steiner says numbers spel...
2,Exit the system,exit system
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,new sec filing gme someone less retarded pleas...
4,"Not to distract from GME, just thought our AMC...",distract gme thought amc brothers aware
5,WE BREAKING THROUGH,breaking
6,SHORT STOCK DOESN'T HAVE AN EXPIRATION DATE,short stock expiration date
7,THIS IS THE MOMENT,moment
8,Currently Holding AMC and NOK - Is it retarded...,currently holding amc nok retarded think move ...
9,I have nothing to say but BRUH I am speechless...,nothing say bruh speechless moon


In [12]:
# Subset a number of texts 
df_test = texts.iloc[0:1000, :]

In [28]:
# Subset a number of texts 
df_test = texts.iloc[0:10000, :]

# # Apply sentiment analysis using BERT model
# df_test.loc[:, 'sentiment'] = sentiment_pipeline(df_test['processed_text'].tolist())

import os
import concurrent.futures

# Get the number of available cores
num_cores = os.cpu_count()
max_workers = num_cores - 1

# Split df_test into 4 batches
num_batches = 8
batch_size = len(df_test) // num_batches
batches = [df_test[i*batch_size:(i+1)*batch_size] for i in range(num_batches)]

# Function to process each batch independently
def process_batch(batch):
    batch['sentiment'] = sentiment_pipeline(batch['processed_text'].tolist())
    return batch

# Process each batch in parallel using threads
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = executor.map(process_batch, batches)

# Combine the results from all batches
df_test = pd.concat(results)

# Extract the sentiment label and confidence score
df_test['sentiment_label'] = df_test['sentiment'].apply(lambda x: x['label'])
df_test['sentiment_score'] = df_test['sentiment'].apply(lambda x: x['score'])

# Transform the sentiment label to a numerical value
df_test['sentiment_label'] = df_test['sentiment_label'].map({'LABEL_1': 1, 'LABEL_0': 0})

# Drop original sentiment column if it exists 
if 'sentiment' in df_test.columns:
    df_test = df_test.drop(columns=['sentiment'])

# Display
df_test


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['sentiment'] = sentiment_pipeline(batch['processed_text'].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['sentiment'] = sentiment_pipeline(batch['processed_text'].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch['sentiment'] = sentiment_pipeline(batch['processed_

Unnamed: 0,title,processed_text,sentiment_label,sentiment_score
0,"It's not about the money, it's about sending a...",money sending message,1,0.542390
1,Math Professor Scott Steiner says the numbers ...,math professor scott steiner says numbers spel...,1,0.530862
2,Exit the system,exit system,1,0.527198
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,new sec filing gme someone less retarded pleas...,1,0.528212
4,"Not to distract from GME, just thought our AMC...",distract gme thought amc brothers aware,1,0.526397
...,...,...,...,...
9995,Looks like we can buy on SoFi*,looks like buy sofi,1,0.530242
9996,Can't cancel my Revolut order,ca cancel revolut order,1,0.528675
9997,LOL they are trying so hard to break us!! But ...,lol trying hard break us cheap buying,1,0.525004
9998,Opening TD Ameritrade. Anyone got a referral c...,opening td ameritrade anyone got referral code,1,0.521101


In [105]:
import tensorflow as tf

# Check TensorFlow version
print("TensorFlow Version:", tf.__version__)

# List available GPUs in TensorFlow
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            print("GPU:", gpu)
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPUs available: ", len(gpus))
    except RuntimeError as e:
        print(e)
else:
    print("No GPUs found.")

# Check if TensorFlow was built with CUDA
print("Built with CUDA:", tf.test.is_built_with_cuda())

# Check if a GPU is available and if TensorFlow can access it
print("GPU available (TensorFlow):", tf.test.is_gpu_available())


TensorFlow Version: 2.15.0
No GPUs found.
Built with CUDA: True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU available (TensorFlow): False


2024-03-20 00:07:20.949058: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-20 00:07:21.005671: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-03-20 00:07:21.014617: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/li