In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec

In [3]:

# Ensure NLTK data is downloaded
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\georg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:

# Load the datasets
news_df = pd.read_csv('./data/us_equities_news_dataset.csv')
stock_df = pd.read_csv('./data/NVDA.csv')

In [5]:
# Training on the Entire Corpus: This will extract a broader set of word embeddings that capture general language patterns and semantics. This way it may result in better generalization.

# Sample keywords related to NVIDIA and associated companies

# https://www.nvidia.com/en-us/self-driving-cars/partners/nio/ (Nio partnership with NVIDIA)

# https://nvidianews.nvidia.com/news/uber-selects-nvidia-technology-to-power-its-self-driving-fleets (Uber partnership with NVIDIA)

# https://nvidianews.nvidia.com/news/nvidia-teams-with-amazon-web-services-to-bring-ai-to-millions-of-connected-devices (Amazon partnership with NVIDIA,March 2019 article)

# https://www.anandtech.com/show/15146/new-nvidia-gpu-variant-at-supercomputing-2019 (NVIDIA GPU made for TESLA in 2019)

nvidia_keywords = [
    'NVDA', 'NVIDIA', 'NIO', 'UBER', 'AMZN', 'AMAZON', 'TESLA', 'GPU', 'GRAPHICS',
    'CHIP', 'SEMICONDUCTOR', 'DRIVING', 'DEEP LEARNING']

# Compile a regex pattern from the keywords list
nvidia_pattern = '|'.join(nvidia_keywords)  # Combines the keywords into a regex pattern

# Filter articles where the content or ticker column contains any of the keywords
nvidia_related_articles = news_df[
    news_df['content'].str.contains(nvidia_pattern, case=False, na=False) |
    news_df['ticker'].str.contains(nvidia_pattern, case=False, na=False)
]

# Display the count of NVIDIA-related articles
print(f"\nTotal NVIDIA-related articles found: {nvidia_related_articles.shape[0]}")

# Convert the date columns to datetime format for matching
nvidia_related_articles['Date'] = pd.to_datetime(nvidia_related_articles['release_date'])
stock_df['Date'] = pd.to_datetime(stock_df['Date'])

# Merge filtered news data with stock prices based on publication date
merged_df = pd.merge(nvidia_related_articles, stock_df, on='Date', how='inner')

# Filter to keep only articles that have matching stock data
nvidia_df = merged_df[['content', 'Open', 'Close', 'Date']]

# Label the target variable based on the opening and closing prices
nvidia_df['target'] = np.where(nvidia_df['Open'] > nvidia_df['Close'], 0, 1)

# Display the first few rows to verify the merging and labeling
print("\nFiltered and Labeled Data (NVIDIA-Related):")
print(nvidia_df.head())



Total NVIDIA-related articles found: 81038

Filtered and Labeled Data (NVIDIA-Related):
                                             content     Open    Close  \
0  What s happening\nShares of Chinese electric c...  6.19475  6.13925   
1  Gainers  NIO  NYSE NIO   14   Village Farms In...  6.19475  6.13925   
2  Cemtrex  NASDAQ CETX   85  after FY results \n...  6.19475  6.13925   
3  aTyr Pharma  NASDAQ LIFE   63  on Kyorin Pharm...  5.80800  5.92650   
4  Gainers  NIO  NYSE NIO   14   Meritor  NYSE MT...  5.77250  5.88250   

        Date  target  
0 2020-01-15       0  
1 2020-01-15       0  
2 2020-01-15       0  
3 2020-01-06       1  
4 2019-12-31       1  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nvidia_related_articles['Date'] = pd.to_datetime(nvidia_related_articles['release_date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nvidia_df['target'] = np.where(nvidia_df['Open'] > nvidia_df['Close'], 0, 1)


In [6]:
# Check for duplicate documents based on the 'content' column
duplicate_docs = nvidia_df[nvidia_df['content'].duplicated(keep=False)]

# Display the duplicate documents (if any)
print(f"Number of duplicate documents found: {duplicate_docs.shape[0]}")
if not duplicate_docs.empty:
    print("Duplicate Documents:")
    print(duplicate_docs[['content']])

# Remove duplicate documents, keeping the first occurrence
nvidia_df = nvidia_df.drop_duplicates(subset='content', keep='first').reset_index(drop=True)

# Display the updated DataFrame
print(f"Number of documents after removing duplicates: {nvidia_df.shape[0]}")


Number of duplicate documents found: 173
Duplicate Documents:
                                                 content
309    Shares of Uber Technologies Inc  \r\n\r\n     ...
312    Shares of Uber Technologies Inc  \r\n\r\n     ...
625     Bloomberg     You can soon save for your next...
640     Bloomberg     You can soon save for your next...
5369   Micron  MU  closed the most recent trading day...
...                                                  ...
68596   Bloomberg     Being an emerging market econom...
69812   Bloomberg     A decorated U S Army officer wh...
69813   Bloomberg     A decorated U S Army officer wh...
70685   Bloomberg     The next U S  jobs report is mo...
70686   Bloomberg     The next U S  jobs report is mo...

[173 rows x 1 columns]
Number of documents after removing duplicates: 71420


In [7]:
# Basic descriptive statistics for the news dataset
num_articles = nvidia_df.shape[0]
average_words_per_article = nvidia_df['content'].apply(lambda x: len(str(x).split())).mean()
print(f'The number of articles before filtering is: {news_df.shape[0]}')
print(f"The number of articles after filtering is: {num_articles}")
print(f"The average amount of words per article is: {average_words_per_article}")



# Initialize the stemmer
stemmer = PorterStemmer()


# Tokenize content for word frequency analysis
nvidia_df.loc[:, 'text_length'] = nvidia_df['content'].apply(lambda x: len(str(x).split()))


# Tokenize content for word frequency analysis
nvidia_df.loc[:, 'processed_text'] = nvidia_df['content'].apply(lambda x: word_tokenize(str(x).lower()))


# Remove stopwords and punctuation for better NLP insights
stop_words = set(stopwords.words('english'))
# Remove stopwords, punctuation, and apply stemming
nvidia_df.loc[:, 'filtered_text'] = nvidia_df['processed_text'].apply(
    lambda words: [stemmer.stem(word) for word in words if word.isalpha() and word not in stop_words]
)


# Join the filtered words back into strings for TF-IDF
nvidia_df.loc[:, 'filtered_text_str'] = nvidia_df['filtered_text'].apply(lambda x: ' '.join(x))

The number of articles before filtering is: 221513
The number of articles after filtering is: 71420
The average amount of words per article is: 725.113329599552


In [8]:
# TF-IDF Representation of Documents using the processed and filtered text
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)  # Reduced max features
tfidf_matrix = vectorizer.fit_transform(nvidia_df['filtered_text_str'])

In [9]:
# Prepare the text data for Word2Vec
sentences = nvidia_df['filtered_text'].tolist()

In [11]:
def document_embedding_tfidf(model, document, tfidf, feature_names, vector_size=100):
    """
    Compute a TF-IDF weighted document embedding by averaging the Word2Vec embeddings of words in the document.

    Parameters:
    - model: Trained Word2Vec model
    - document: List of tokenized words
    - tfidf: TfidfVectorizer object used for the document
    - feature_names: The list of features (terms) from the TF-IDF vectorizer
    - vector_size: Size of the word embeddings in the Word2Vec model

    Returns:
    - doc_embedding: The TF-IDF weighted document embedding as a numpy array
    """
    word_vectors = []
    weights = []

    # Extract the TF-IDF scores for the document
    tfidf_scores = tfidf.transform([" ".join(document)])
    tfidf_scores = tfidf_scores.toarray().flatten()

    feature_names = feature_names.tolist()  # Convert NumPy array to list

    for word in document:
        if word in model.wv.index_to_key and word in feature_names:
            # Get the word embedding
            word_vector = model.wv[word]
            word_index = feature_names.index(word)  # Get the index of the word in the TF-IDF feature list

            # Get the TF-IDF weight for this word
            tfidf_weight = tfidf_scores[word_index]

            # Collect the word vector and weight
            word_vectors.append(word_vector)
            weights.append(tfidf_weight)

    if len(word_vectors) == 0:
        # Return a zero vector if no words from the document are in the Word2Vec model
        return np.zeros(vector_size)

    # Convert lists to arrays
    word_vectors = np.array(word_vectors)
    weights = np.array(weights)

    # Compute the weighted average of word vectors
    doc_embedding = np.average(word_vectors, axis=0, weights=weights)

    return doc_embedding


tfidf_feature_names = vectorizer.get_feature_names_out()
# Apply the function to all documents in filtered dataframe
nvidia_df['doc_embedding'] = nvidia_df['filtered_text'].apply(
    lambda doc: document_embedding_tfidf(word2vec_model, doc, vectorizer, tfidf_feature_names)
)


In [12]:
# Define the grid of parameters to test
'''
vector_sizes = [100, 200, 300]
window_sizes = [3, 5, 7]
min_counts = [1, 5, 10]
sg_values = [0, 1]  # 0 for CBOW, 1 for Skip-gram
epochs = [10, 20]
'''

'\nvector_sizes = [100, 200, 300]\nwindow_sizes = [3, 5, 7]\nmin_counts = [1, 5, 10]\nsg_values = [0, 1]  # 0 for CBOW, 1 for Skip-gram\nepochs = [10, 20]\n'

In [13]:
from gensim.models import Word2Vec
import itertools


# Define the grid of parameters to test
vector_sizes = [200]
window_sizes = [3]
min_counts = [10]
sg_values = [0, 1]  # 0 for CBOW, 1 for Skip-gram
epochs = [10, 20]

# Iterate over all combinations of parameters
best_score = 0
best_params = None
best_model = None

for vector_size, window, min_count, sg, epoch in itertools.product(vector_sizes, window_sizes, min_counts, sg_values, epochs):
    # Initialize Word2Vec with current hyperparameters
    word2vec_model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=4, sg=sg)
    
    # Train the model
    word2vec_model.train(sentences, total_examples=len(sentences), epochs=epoch)
    
    
    # Convert sentences to document embeddings
    doc_embeddings = [np.mean([word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv], axis=0) for sentence in sentences]
    
    #Checking empty embeddings
    X = np.array([embedding for embedding in doc_embeddings if embedding is not None and len(embedding) > 0])
    y = nvidia_df['target']
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)
    
    # Train a simple classifier (e.g., Logistic Regression) on the embeddings
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    
    # Get the accuracy on the test set
    accuracy = lr_model.score(X_test, y_test)
    
    print(f"Vector Size: {vector_size}, Window: {window}, Min Count: {min_count}, SG: {sg}, Epochs: {epoch}, Accuracy: {accuracy:.4f}")
    
    # Track the best model
    if accuracy > best_score:
        best_score = accuracy
        best_params = (vector_size, window, min_count, sg, epoch)
        best_model = word2vec_model

print(f"Best Model Parameters: Vector Size: {best_params[0]}, Window: {best_params[1]}, Min Count: {best_params[2]}, SG: {best_params[3]}, Epochs: {best_params[4]}")
print(f"Best Model Accuracy: {best_score:.4f}")


Vector Size: 200, Window: 3, Min Count: 10, SG: 0, Epochs: 10, Accuracy: 0.5316
Vector Size: 200, Window: 3, Min Count: 10, SG: 0, Epochs: 20, Accuracy: 0.5361
Vector Size: 200, Window: 3, Min Count: 10, SG: 1, Epochs: 10, Accuracy: 0.5296
Vector Size: 200, Window: 3, Min Count: 10, SG: 1, Epochs: 20, Accuracy: 0.5273
Best Model Parameters: Vector Size: 200, Window: 3, Min Count: 10, SG: 0, Epochs: 20
Best Model Accuracy: 0.5361


In [14]:
from gensim.models import Word2Vec
import itertools
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score


sentences = nvidia_df['filtered_text'].tolist()

# Define the grid of parameters to test
vector_sizes = [200]
window_sizes = [3]
min_counts = [10]
sg_values = [0, 1]  # 0 for CBOW, 1 for Skip-gram
epochs = [10, 20]

# Iterate over all combinations of parameters
best_score = 0
best_params = None
best_model = None

for vector_size, window, min_count, sg, epoch in itertools.product(vector_sizes, window_sizes, min_counts, sg_values, epochs):
    # Initialize Word2Vec with current hyperparameters
    word2vec_model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=4, sg=sg)
    
    # Train the model
    word2vec_model.train(sentences, total_examples=len(sentences), epochs=epoch)
    
    # Convert sentences to document embeddings by averaging the word vectors
    doc_embeddings = [np.mean([word2vec_model.wv[word] for word in sentence if word in word2vec_model.wv], axis=0) for sentence in sentences]
    
    # Checking empty embeddings
    X = np.array([embedding for embedding in doc_embeddings if embedding is not None and len(embedding) > 0])
    y = nvidia_df['target']
    
    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)
    
    # Initialize and train Naive Bayes on the document embeddings
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = nb_model.predict(X_test)
    
    # Evaluate the model using accuracy and F1-score
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"Vector Size: {vector_size}, Window: {window}, Min Count: {min_count}, SG: {sg}, Epochs: {epoch}, Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")
    
    # Track the best model based on accuracy or F1-score
    if accuracy > best_score:
        best_score = accuracy
        best_params = (vector_size, window, min_count, sg, epoch)
        best_model = word2vec_model

print(f"Best Model Parameters: Vector Size: {best_params[0]}, Window: {best_params[1]}, Min Count: {best_params[2]}, SG: {best_params[3]}, Epochs: {best_params[4]}")
print(f"Best Model Accuracy: {best_score:.4f}")


Vector Size: 200, Window: 3, Min Count: 10, SG: 0, Epochs: 10, Accuracy: 0.5144, F1 Score: 0.4946
Vector Size: 200, Window: 3, Min Count: 10, SG: 0, Epochs: 20, Accuracy: 0.5147, F1 Score: 0.4972
Vector Size: 200, Window: 3, Min Count: 10, SG: 1, Epochs: 10, Accuracy: 0.5123, F1 Score: 0.5111
Vector Size: 200, Window: 3, Min Count: 10, SG: 1, Epochs: 20, Accuracy: 0.5138, F1 Score: 0.5123
Best Model Parameters: Vector Size: 200, Window: 3, Min Count: 10, SG: 0, Epochs: 20
Best Model Accuracy: 0.5147
