In [None]:
# Source: https://scikit-learn.org/stable/modules/feature_extraction.html#vectorizing-a-large-text-corpus-with-the-hashing-trick
# Source: https://stackoverflow.com/questions/17536394/how-can-i-reduce-memory-usage-of-scikit-learn-vectorizers
# Rationale: a plain TfidfVectorizer creates a very sparse matrix; per the original note this
# resulted in a 284 GB matrix that had to be held in memory in order to be processed.
# The hashing trick fixes the number of columns up front instead.
# Initialize HashingVectorizer with a fixed number of features (2**10 = 1024 columns).
# norm=None leaves the hashed counts un-normalised; alternate_sign=False disables sign flipping.
hash_vectorizer = HashingVectorizer(n_features=2**10, alternate_sign=False, norm=None)

# Transform the text data into the hashed feature space.
# presumably 'filtered_text_str' holds one pre-processed string per document — verify upstream
hashed_matrix = hash_vectorizer.transform(filtered_df['filtered_text_str'])

# Apply TfidfTransformer to add IDF weighting on top of the hashed counts
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(hashed_matrix)

# Report the shape, the number of stored non-zero entries, and the sparsity
# (percentage of entries that are zero) of the transformed TF-IDF matrix
print(f"Shape of tfidf_matrix: {tfidf_matrix.shape}")
print(f"Number of non-zero elements: {tfidf_matrix.nnz}")
print(f"Sparsity: {(1 - (tfidf_matrix.nnz / (tfidf_matrix.shape[0] * tfidf_matrix.shape[1]))) * 100:.2f}%")


In [None]:
# Compute the cosine similarity between every pair of documents (rows of tfidf_matrix).
# Source: https://medium.com/@anurag-jain/tf-idf-vectorization-with-cosine-similarity-eca3386d4423
# NOTE(review): with a single argument the result has one row and one column per document,
# so memory grows quadratically with the number of documents — confirm it fits in RAM.
similarity_matrix = cosine_similarity(tfidf_matrix)


In [None]:
# Use TfidfVectorizer with min_df and ngram_range to reduce dimensionality.
# NOTE(review): this configuration is never fitted (the fit_transform call below is commented
# out) and `vectorizer` is rebound by the next cell, so this cell currently has no effect.
vectorizer = TfidfVectorizer(
    lowercase = False,  # Do not convert to lowercase, because the text is already lowercased
    min_df = 2,  # Ignore terms that appear in fewer than 2 documents
    ngram_range=(1, 2),  # Consider both unigrams and bigrams
    max_features=10000,  # Further limit the size of the vocabulary
)

# Fit and transform the text data (disabled)
#tfidf_matrix = vectorizer.fit_transform(filtered_df['filtered_text_str'])

In [None]:
# TF-IDF vectorizer actually used by the cells below.
# NOTE(review): max_features=10 leaves a 10-term vocabulary, while the feature reports
# further down ask for the top 20 terms per class — confirm this limit is intended.
vectorizer = TfidfVectorizer(
    max_df=0.5,  # Ignore terms that appear in more than 50% of the documents
    min_df=5,  # Ignore terms that appear in fewer than 5 documents
    max_features=10,  # Cap the vocabulary at 10 terms
    stop_words="english",  # Drop scikit-learn's built-in English stop words
)
# Fit and transform the text data (rebinds tfidf_matrix)
tfidf_matrix = vectorizer.fit_transform(filtered_df['filtered_text_str'])

In [None]:

# Train a simple logistic regression model for demonstration.
# It is fitted on the full TF-IDF matrix (no train/test split), so it is only used to
# inspect coefficients below, not to estimate accuracy.
# presumably 'target' is binary (0 = price down, 1 = price up) — verify upstream
clf = LogisticRegression(max_iter=1000)
clf.fit(tfidf_matrix, filtered_df['target'])

# Function to print top informative features for binary classification
def print_top_features(vectorizer, clf, top_n=20):
    """Print and return the most informative features of a binary linear classifier.

    Features are ranked by the fitted coefficients in ``clf.coef_[0]``: the most
    negative ones are reported for class 0, the most positive ones for class 1.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()``.
    clf
        Fitted linear model exposing ``coef_``; only the first row is used.
    top_n : int, default 20
        Maximum number of features to report per class. If fewer features
        exist, all of them are reported and the heading shows the real count.

    Returns
    -------
    tuple of numpy.ndarray
        ``(top_features_class_0, top_features_class_1)`` as feature indices.
        Class 0 indices run from the most negative coefficient upwards;
        class 1 indices are in ascending coefficient order (strongest last).
    """
    feature_names = vectorizer.get_feature_names_out()
    coef = clf.coef_[0]  # Use the first (and only) set of coefficients for binary classification

    # Sort once and slice both ends from the same ordering.
    order = np.argsort(coef)
    # Number of features actually reported; never more than exist, never negative.
    n_shown = min(max(int(top_n), 0), len(order))

    # Top features with the most negative coefficients (indicative of Class 0)
    top_features_class_0 = order[:n_shown]
    print(f"\nTop {n_shown} Words Indicative of Class 0 (Stock Price Down):")
    print(" ".join(feature_names[j] for j in top_features_class_0))

    # Top features with the most positive coefficients (indicative of Class 1).
    # Slicing from an explicit start index avoids order[-0:] returning everything.
    top_features_class_1 = order[len(order) - n_shown:]
    print(f"\nTop {n_shown} Words Indicative of Class 1 (Stock Price Up):")
    print(" ".join(feature_names[j] for j in top_features_class_1))

    return top_features_class_0, top_features_class_1

# Print the most informative features and keep the returned index arrays;
# the plotting cells below read top_features_class_0 and top_features_class_1.
top_features_class_0, top_features_class_1 = print_top_features(vectorizer, clf)

In [None]:

# Plotting the most indicative words for Class 0
# Look the feature names up once instead of once per bar.
class_0_feature_names = vectorizer.get_feature_names_out()
plt.figure(figsize=(12, 6))
plt.barh([class_0_feature_names[i] for i in top_features_class_0], clf.coef_[0][top_features_class_0], color='blue')
# The title reports the number of bars actually drawn, which can be below 20
# when the vocabulary is smaller than the requested number of features.
plt.title(f'Top {len(top_features_class_0)} Words Indicative of Class 0 (Stock Price Down)')
plt.xlabel('Coefficient Value')
plt.gca().invert_yaxis()  # first entry (most negative coefficient) at the top
plt.show()

In [None]:

# Plotting the most indicative words for Class 1
# Look the feature names up once instead of once per bar.
class_1_feature_names = vectorizer.get_feature_names_out()
plt.figure(figsize=(12, 6))
plt.barh([class_1_feature_names[i] for i in top_features_class_1], clf.coef_[0][top_features_class_1], color='green')
# The title reports the number of bars actually drawn, which can be below 20
# when the vocabulary is smaller than the requested number of features.
plt.title(f'Top {len(top_features_class_1)} Words Indicative of Class 1 (Stock Price Up)')
plt.xlabel('Coefficient Value')
plt.gca().invert_yaxis()  # first entry of top_features_class_1 at the top
plt.show()

In [None]:
# Summarise the size and density of tfidf_matrix.
# Rows are documents, columns are terms.
num_documents, num_features = tfidf_matrix.shape
# Stored non-zero entries versus the size of the full rectangular matrix
non_zero_elements = tfidf_matrix.nnz
total_elements = num_documents * num_features
# Percentage of entries that are zero
sparsity = (1 - (non_zero_elements / total_elements)) * 100

print(f"Shape of tfidf_matrix: {tfidf_matrix.shape}")
print(f"Number of documents: {num_documents}")
print(f"Number of features (terms): {num_features}")
print(f"Number of non-zero elements: {non_zero_elements}")
print(f"Total number of elements: {total_elements}")
print(f"Sparsity of the tfidf_matrix: {sparsity:.2f}%")


In [None]:
# Compute the cosine similarity between every pair of documents (rows of tfidf_matrix).
# Source: https://medium.com/@anurag-jain/tf-idf-vectorization-with-cosine-similarity-eca3386d4423
# This rebinds similarity_matrix using whatever tfidf_matrix currently holds.
# NOTE(review): the result has one row and one column per document, so memory grows
# quadratically with the number of documents — confirm it fits in RAM.
similarity_matrix = cosine_similarity(tfidf_matrix)

# Convert similarity matrix to a DataFrame for easier handling
# (default integer row/column labels = document positions)
similarity_df = pd.DataFrame(similarity_matrix)

In [None]:
# Function to compute cosine similarity in batches
def compute_similarity_in_batches(tfidf_matrix, batch_size=1000):
    """Compute the document-by-document cosine similarity matrix in row batches.

    Each batch compares ``batch_size`` consecutive rows of ``tfidf_matrix``
    against all rows, which bounds the size of each intermediate result.
    The returned frame is still the full dense n_docs x n_docs matrix.

    Parameters
    ----------
    tfidf_matrix
        Row-sliceable feature matrix with a ``shape`` attribute; one row per document.
    batch_size : int, default 1000
        Number of rows processed per call to ``cosine_similarity``.

    Returns
    -------
    pandas.DataFrame
        Similarity matrix whose row labels are the document positions
        ``0 .. n_docs-1`` (unique), matching the column labels. An empty
        DataFrame is returned when ``tfidf_matrix`` has no rows.

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    num_docs = tfidf_matrix.shape[0]
    similarity_results = []

    # Process the similarity computation in batches
    for start in range(0, num_docs, batch_size):
        end = min(start + batch_size, num_docs)
        print(f"Processing batch: {start} to {end}")

        # Compute similarities of this batch's rows against every document
        batch_similarity = cosine_similarity(tfidf_matrix[start:end], tfidf_matrix)

        # Label the rows with their global document positions. Without this,
        # every batch would be labelled 0..batch_size-1 and the concatenated
        # frame would contain duplicate row labels.
        similarity_results.append(pd.DataFrame(batch_similarity, index=range(start, end)))

    # pd.concat raises on an empty list, so handle the no-documents case explicitly
    if not similarity_results:
        return pd.DataFrame()

    # Combine all batches into one DataFrame
    return pd.concat(similarity_results, axis=0)

# Using the function to compute the similarity matrix in smaller chunks.
# This rebinds similarity_df only; the similarity_matrix array used by later cells
# is not updated here.
similarity_df = compute_similarity_in_batches(tfidf_matrix, batch_size=500)

# Display the shape to confirm the complete matrix
print(f"Computed similarity matrix shape: {similarity_df.shape}")


In [None]:
# Find indices of the most and least similar document pairs (excluding self-similarity).
# A pair (i, i) must never win either search, so the diagonal is masked with a value that
# cannot be selected: -inf for the argmax and +inf for the argmin. Masking with 0 for both
# lets the argmin return a diagonal entry whenever all similarities are >= 0, i.e. it
# reports a document as "least similar" to itself.
# The masking is done in place to avoid copying the n_docs x n_docs matrix.
# assumes similarity_matrix is a square float ndarray — TODO confirm
np.fill_diagonal(similarity_matrix, -np.inf)
most_similar_indices = np.unravel_index(np.argmax(similarity_matrix, axis=None), similarity_matrix.shape)

np.fill_diagonal(similarity_matrix, np.inf)
least_similar_indices = np.unravel_index(np.argmin(similarity_matrix, axis=None), similarity_matrix.shape)

# Leave the diagonal at 0, the state this cell has always left the matrix in.
np.fill_diagonal(similarity_matrix, 0)

In [None]:

# Inspect the most similar documents.
# most_similar_indices is a (row, column) pair of positions in similarity_matrix.
# NOTE(review): the similarities were computed from filtered_df, but the rows are looked up
# by position in news_df; this shows the right texts only if both frames have the same
# rows in the same order — confirm.
similar_doc_1 = news_df.iloc[most_similar_indices[0]]
similar_doc_2 = news_df.iloc[most_similar_indices[1]]

print("\nMost Similar Documents:\n")
print("Document 1:")
print(similar_doc_1['content'])
print("\nDocument 2:")
print(similar_doc_2['content'])

In [None]:

# Inspect the least similar documents.
# least_similar_indices is a (row, column) pair of positions in similarity_matrix.
# NOTE(review): the similarities were computed from filtered_df, but the rows are looked up
# by position in news_df; this shows the right texts only if both frames have the same
# rows in the same order — confirm.
dissimilar_doc_1 = news_df.iloc[least_similar_indices[0]]
dissimilar_doc_2 = news_df.iloc[least_similar_indices[1]]

print("\n\nMost Dissimilar Documents:\n")
print("Document 1:")
print(dissimilar_doc_1['content'])
print("\nDocument 2:")
print(dissimilar_doc_2['content'])



## Word2Vec Embeddings

In [None]:
# Prepare the text data for Word2Vec
# presumably each entry of 'filtered_text' is a list of tokens — verify upstream
sentences = filtered_df['filtered_text'].tolist()

# Initializing Word2Vec model (CBOW) without training it yet.
# Passing `sentences` to the constructor would build the vocabulary AND train the model
# straight away; together with the explicit train() call in the next cell the model
# would then be trained twice. Here only the vocabulary is built, so the next cell
# performs the one and only training run.
word2vec_model = Word2Vec(vector_size=100, window=5, min_count=1, workers=4, sg=0)  # sg=0 for CBOW
word2vec_model.build_vocab(sentences)

In [None]:
# Training the model for 10 epochs over the prepared sentences.
# NOTE(review): if the model was constructed with the corpus passed in, gensim has already
# trained it during construction and this call continues training on top of that.
word2vec_model.train(sentences, total_examples=len(sentences), epochs=10)

# Saving the model for future use (written to the current working directory)
word2vec_model.save("word2vec_model.bin")

In [None]:

# Getting the word vector for a particular word (sanity check of the trained model).
# assumes 'nvidia' is in the model's vocabulary — the lookup fails otherwise
word_vector = word2vec_model.wv['nvidia']

# Converting a document to an embedding by averaging word vectors
def get_document_embedding(doc):
    """Return the mean Word2Vec vector of the tokens in ``doc``.

    Reads the module-level ``word2vec_model`` at call time. Tokens that are not
    in the model's vocabulary are skipped.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec_model.vector_size``. A zero vector is
        returned when ``doc`` is empty or contains no known token, so every
        document yields a vector of the same length and the results can be
        stacked into a matrix. (Averaging an empty list would instead give a
        scalar NaN plus a RuntimeWarning.)
    """
    keyed_vectors = word2vec_model.wv
    vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]
    if not vectors:
        # float32 matches the dtype gensim uses for its word vectors
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Applying to the dataset: one averaged embedding per document, stored in a new column.
# NOTE(review): this adds a column to filtered_df in place; if filtered_df is a slice of
# another frame pandas may emit SettingWithCopyWarning — confirm it is an independent copy.
filtered_df['doc_embedding'] = filtered_df['filtered_text'].apply(get_document_embedding)


## Train a simple logistic regression model for demonstration

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Feature Matrix and Target Variable.
# np.vstack stacks the per-document embedding vectors into one dense 2-D array,
# so every embedding must have the same length.
X = np.vstack(filtered_df['doc_embedding'].values)
y = filtered_df['target']

# Split the data into training and test sets (80% train, 20% test),
# stratified on the target and with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)

# Optional: scale the embedding features before the classifier.
# Initialize the scaler and create a pipeline with scaling and the classifier.
# NOTE(review): with_mean=False skips centering. That setting is needed for sparse input,
# but X here is a dense array built with np.vstack — confirm that skipping centering is intended.
scaler = StandardScaler(with_mean=False)  # scales to unit variance only, no centering

# Example with Logistic Regression
lr_pipeline = make_pipeline(scaler, LogisticRegression(max_iter=1000))

# Train the model
lr_pipeline.fit(X_train, y_train)

# Evaluate on test data (mean accuracy)
test_score = lr_pipeline.score(X_test, y_test)

print(f"Test accuracy of Logistic Regression: {test_score:.2f}")

# Optionally, use 5-fold cross-validation on the training split to check the stability of the score
cross_val_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-validated accuracy: {np.mean(cross_val_scores):.2f} ± {np.std(cross_val_scores):.2f}")


## Train Gaussian Naive Bayes and logistic regression models on the document embeddings

In [None]:
#from sklearn.naive_bayes import MultinomialNB ( Multinomial can't handle negative values)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the document embeddings (X_train is the embedding matrix).
# Default hyper-parameters are used.
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

# Logistic Regression on the same, unscaled embeddings
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)


## Evaluation Metrics

In [None]:
# Predictions for both models on the held-out test split
gnb_pred = gnb_model.predict(X_test)
lr_pred = lr_model.predict(X_test)

# Evaluation metrics: accuracy and F1 per model
# (the *_nb variables hold the Gaussian Naive Bayes results)
accuracy_nb = accuracy_score(y_test, gnb_pred)
accuracy_lr = accuracy_score(y_test, lr_pred)

# presumably the target is binary with 1 as the positive class — verify
f1_nb = f1_score(y_test, gnb_pred)
f1_lr = f1_score(y_test, lr_pred)

print(f"Naive Bayes Accuracy: {accuracy_nb:.2f}, F1 Score: {f1_nb:.2f}")
print(f"Logistic Regression Accuracy: {accuracy_lr:.2f}, F1 Score: {f1_lr:.2f}")

# Confusion matrices (rows: true labels, columns: predicted labels)
cm_nb = confusion_matrix(y_test, gnb_pred)
cm_lr = confusion_matrix(y_test, lr_pred)

print("\nNaive Bayes Confusion Matrix:")
print(cm_nb)

print("\nLogistic Regression Confusion Matrix:")
print(cm_lr)


In [None]:
# Initializing Word2Vec model (Skip-Gram, 200 dimensions) without training it yet.
# Passing `sentences` to the constructor would build the vocabulary AND train the model
# straight away; together with the explicit train() call in the next cell the model
# would then be trained twice. Here only the vocabulary is built, so the next cell
# performs the one and only training run.
# This rebinds word2vec_model, replacing the model created earlier in the notebook.
word2vec_model = Word2Vec(vector_size=200, window=5, min_count=1, workers=4, sg=1)  # sg=1 for Skip-Gram
word2vec_model.build_vocab(sentences)

In [None]:
# Training the model for 10 epochs over the prepared sentences.
# NOTE(review): if the model was constructed with the corpus passed in, gensim has already
# trained it during construction and this call continues training on top of that.
word2vec_model.train(sentences, total_examples=len(sentences), epochs=10)

# Saving the model for future use (written to the current working directory)
word2vec_model.save("word2vec_model_skipgram.bin")

In [None]:
# Getting the word vector for a particular word from the current word2vec_model.
# assumes 'nvidia' is in the model's vocabulary — the lookup fails otherwise
word_vector_2 = word2vec_model.wv['nvidia']

# Converting a document to an embedding by averaging word vectors
# (same name as the definition earlier in the notebook; this one replaces it)
def get_document_embedding(doc):
    """Return the mean Word2Vec vector of the tokens in ``doc``.

    Reads the module-level ``word2vec_model`` at call time, so it uses whichever
    model that name is currently bound to. Tokens that are not in the model's
    vocabulary are skipped.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec_model.vector_size``. A zero vector is
        returned when ``doc`` is empty or contains no known token, so every
        document yields a vector of the same length and the results can be
        stacked into a matrix. (Averaging an empty list would instead give a
        scalar NaN plus a RuntimeWarning.)
    """
    keyed_vectors = word2vec_model.wv
    vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]
    if not vectors:
        # float32 matches the dtype gensim uses for its word vectors
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Applying to the dataset.
# This overwrites the existing 'doc_embedding' column with embeddings from the model
# currently bound to word2vec_model (the Skip-Gram model if cells are run in order).
filtered_df['doc_embedding'] = filtered_df['filtered_text'].apply(get_document_embedding)

In [None]:
# Apply the TF-IDF-weighted embedding function to all documents in the filtered dataframe.
# NOTE(review): document_embedding_tfidf and tfidf_feature_names are not defined in the
# visible part of this notebook — confirm they exist before this cell on a fresh kernel.
# NOTE(review): the resulting 'doc_embedding_skip' column is not read by the cells visible
# after this one (they use 'doc_embedding') — confirm which column is intended.
filtered_df['doc_embedding_skip'] = filtered_df['filtered_text'].apply(
    lambda doc: document_embedding_tfidf(word2vec_model, doc, vectorizer, tfidf_feature_names)
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Prepare the feature matrix and target from the document embeddings.
# This rebinds X, y and the train/test splits used by the following cells.
X = filtered_df['doc_embedding'].tolist()  # list with one embedding vector per document
X = np.array(X)  # Convert list of embeddings to numpy array
y = filtered_df['target']

# Split the data (80% train, 20% test, stratified, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17, stratify=y)

# Define the parameter grid for var_smoothing:
# 100 logarithmically spaced values from 1e0 down to 1e-9
param_grid = {'var_smoothing': np.logspace(0, -9, num=100)}

# Initialize the GaussianNB model
gnb_model = GaussianNB()

# Initialize GridSearchCV with 5-fold cross-validation, selecting by accuracy
grid_search = GridSearchCV(gnb_model, param_grid, cv=5, scoring='accuracy')

# Fit the model (runs the search on the training split only)
grid_search.fit(X_train, y_train)

# Get the best parameters and best mean cross-validation score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {best_score:.4f}")

# Evaluate the best model on the test set.
# y_pred holds the tuned model's predictions and is reused by the confusion-matrix plot below.
best_gnb_model = grid_search.best_estimator_
y_pred = best_gnb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")


In [None]:
#from sklearn.naive_bayes import MultinomialNB ( Multinomial can't handle negative values)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the document embeddings (X_train is the embedding matrix).
# NOTE(review): this rebinds gnb_model to a GaussianNB with default hyper-parameters; the
# tuned estimator from the grid search remains available as best_gnb_model — confirm which
# one the evaluation below is meant to use.
gnb_model = GaussianNB()
gnb_model.fit(X_train, y_train)

# Logistic Regression on the same, unscaled embeddings
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)


In [None]:
# Predictions for both models on the held-out test split
# (same evaluation as earlier in the notebook, now for the models fitted just above)
gnb_pred = gnb_model.predict(X_test)
lr_pred = lr_model.predict(X_test)

# Evaluation metrics: accuracy and F1 per model
# (the *_nb variables hold the Gaussian Naive Bayes results)
accuracy_nb = accuracy_score(y_test, gnb_pred)
accuracy_lr = accuracy_score(y_test, lr_pred)

# presumably the target is binary with 1 as the positive class — verify
f1_nb = f1_score(y_test, gnb_pred)
f1_lr = f1_score(y_test, lr_pred)

print(f"Naive Bayes Accuracy: {accuracy_nb:.2f}, F1 Score: {f1_nb:.2f}")
print(f"Logistic Regression Accuracy: {accuracy_lr:.2f}, F1 Score: {f1_lr:.2f}")

# Confusion matrices (rows: true labels, columns: predicted labels)
cm_nb = confusion_matrix(y_test, gnb_pred)
cm_lr = confusion_matrix(y_test, lr_pred)

print("\nNaive Bayes Confusion Matrix:")
print(cm_nb)

print("\nLogistic Regression Confusion Matrix:")
print(cm_lr)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Step 1: Generate the confusion matrix.
# y_pred is whatever was last assigned to that name; in the visible cells that is the
# prediction of the tuned GaussianNB (best_gnb_model), not gnb_pred or lr_pred.
cm = confusion_matrix(y_test, y_pred)

# Step 2: Plot the confusion matrix with class labels 0 and 1
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap=plt.cm.Blues)

# NOTE(review): the title does not say which model is shown — consider naming it
plt.title('Confusion Matrix')
plt.show()