In [8]:
!pip install scikit-learn nltk spacy



In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# Fetch the NLTK data packages this notebook relies on: tokenizer models for
# word_tokenize and the stopword corpus read via nltk.corpus.stopwords.
nltk.download('punkt')
# NOTE(review): 'punkt_tab' is presumably what newer NLTK releases load for
# word_tokenize instead of 'punkt' — confirm against the installed version.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Load the analyst-ratings CSV and ask pandas to parse the 'date' column
# as datetimes while reading.
df = pd.read_csv(
    filepath_or_buffer='/content/raw_analyst_ratings.csv',
    parse_dates=['date'],
)

In [11]:
# Values of the 'headline' column as a plain Python list, in row order.
headlines = df['headline'].to_list()

# English stopword list from the NLTK corpus. It has to live at module level:
# preprocess_text looks it up as a global.
stopwords = nltk.corpus.stopwords.words('english')

# Headline normalisation applied before vectorisation.
def _stopword_lookup():
    """Return the module-level ``stopwords`` collection as a frozenset.

    The frozenset is cached on this function and rebuilt only when
    ``stopwords`` is rebound to another object or changes length, so each
    token costs one hash lookup instead of a linear scan of the list.
    """
    cached = getattr(_stopword_lookup, "_cache", None)
    if (
        cached is None
        or cached[0] is not stopwords
        or cached[1] != len(stopwords)
    ):
        cached = (stopwords, len(stopwords), frozenset(stopwords))
        _stopword_lookup._cache = cached
    return cached[2]


def preprocess_text(text):
    """Lower-case ``text``, strip punctuation, and drop stopwords.

    Args:
        text: The raw headline. ``None`` and float NaN (what pandas uses for
            a missing string cell) are treated as an empty headline; any
            other non-string value is converted with ``str()`` first.

    Returns:
        str: The remaining tokens joined by single spaces, or ``''`` when
        the input is missing or nothing is left after cleaning.
    """
    # Missing values would otherwise crash on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so "52-week" -> "52week".
    text = re.sub(r'[^\w\s]', '', text)
    if not text.strip():
        # Nothing to tokenize; skip the tokenizer call entirely.
        return ''

    stop_set = _stopword_lookup()
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in stop_set)

# Normalise every headline once; both vectorizers below are fitted on this list.
processed_headlines = list(map(preprocess_text, headlines))

# Single-word TF-IDF features, filtered by document frequency
# (max_df=0.95, min_df=5).
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.95)
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_headlines)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Bigram and trigram TF-IDF features with the same document-frequency
# cut-offs, used to surface recurring phrases.
ngram_vectorizer = TfidfVectorizer(min_df=5, max_df=0.95, ngram_range=(2, 3))
ngram_matrix = ngram_vectorizer.fit_transform(processed_headlines)
ngram_feature_names = ngram_vectorizer.get_feature_names_out()

# Function to display top N terms/phrases from a matrix
def display_top_terms(vectorizer, feature_matrix, n=10):
    """Print the ``n`` features with the largest column sums, highest first.

    Args:
        vectorizer: Object exposing ``get_feature_names_out()``; the names
            must be index-aligned with the columns of ``feature_matrix``.
        feature_matrix: Documents-by-features matrix supporting
            ``.sum(axis=0)``.
        n: Maximum number of features to print. If fewer features exist,
            all of them are printed; values <= 0 print only the header.

    Returns:
        None. Output goes to stdout, one feature name per line after a
        header line.
    """
    print(f"\nTop {n} common terms/phrases:")

    # The column sum may come back as a 1 x n_features np.matrix or as a
    # plain ndarray depending on the container; flatten either to 1-D.
    column_totals = np.asarray(feature_matrix.sum(axis=0)).ravel()
    feature_names = vectorizer.get_feature_names_out()

    # Ascending argsort, reversed to get highest totals first.
    ranked = np.argsort(column_totals)[::-1]

    # Slicing instead of range(n) avoids an IndexError when n exceeds the
    # vocabulary size.
    for term_index in ranked[:max(n, 0)]:
        print(f"{feature_names[term_index]}")

# Highest-scoring single terms.
display_top_terms(vectorizer=tfidf_vectorizer, feature_matrix=tfidf_matrix, n=20)

# Highest-scoring two- and three-word phrases.
display_top_terms(vectorizer=ngram_vectorizer, feature_matrix=ngram_matrix, n=20)


Top 20 common terms/phrases:
stocks
top
vs
benzingas
earnings
est
market
eps
downgrades
52week
upgrades
shares
reports
update
session
pt
moving
buy
premarket
scheduled

Top 20 common terms/phrases:
benzingas top
stocks moving
earnings scheduled
top upgrades
benzingas top upgrades
stocks hit
hit 52week
stocks hit 52week
price target
biggest movers
52week highs
premarket session
52week lows
movers yesterday
biggest movers yesterday
market update
morning market
raises pt
top downgrades
benzingas top downgrades
