In [1]:
import pandas as pd
import numpy as np

# Read the source CSV from the working directory and print its first five rows
# as a quick sanity check of what was loaded.
data = pd.read_csv(filepath_or_buffer='stock.csv')
print(data.head(n=5))


                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/amy-cooper-lose...   
3  https://www.huffpost.com/entry/belk-worker-fou...   
4  https://www.huffpost.com/entry/reporter-gets-a...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   
3  Cleaner Was Dead In Belk Bathroom For 4 Days B...  U.S. NEWS   
4  Reporter Gets Adorable Surprise From Her Boyfr...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  Amy Cooper accused inve

In [2]:
# Binary target: 1 when the '^GSPC' close is strictly above its open, else 0.
# A flat row (close == open) is therefore labelled 0.
closed_above_open = data['Close_^GSPC'].gt(data['Open_^GSPC'])
data['stock_trend'] = closed_above_open.astype(int)

In [3]:
# Show how many rows fall into each target class (1 vs 0).
trend_counts = data['stock_trend'].value_counts()
print(trend_counts)


stock_trend
1    934
0    825
Name: count, dtype: int64


In [4]:
# Parse the date column once, then derive the calendar features from the
# parsed values.
parsed_dates = pd.to_datetime(data['date'])
data['date'] = parsed_dates
data['day_of_week'] = parsed_dates.dt.dayofweek  # Monday is 0, Sunday is 6
data['month'] = parsed_dates.dt.month

In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Step 1: Create the VADER analyzer that the scoring function relies on
analyzer = SentimentIntensityAnalyzer()

# Step 2: Build one text field per row: headline, a single space, then the
# short description. Missing values are treated as empty strings.
headline_text = data['headline'].fillna('')
description_text = data['short_description'].fillna('')
data['combined_text'] = headline_text + ' ' + description_text

# Step 3: Define a function to extract compound sentiment score
def get_vader_sentiment(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``.

    Uses the module-level ``analyzer``; the compound score is VADER's single
    summary value, ranging from -1 (most negative) to +1 (most positive).
    """
    polarity = analyzer.polarity_scores(text)
    return polarity['compound']

# Step 4: Score every combined text, one row at a time
data['sentiment_score'] = data['combined_text'].map(get_vader_sentiment)

# Optional: show the text next to its score for the first few rows
preview_columns = ['combined_text', 'sentiment_score']
print(data[preview_columns].head())

                                       combined_text  sentiment_score
0  Over 4 Million Americans Roll Up Sleeves For O...          -0.1280
1  American Airlines Flyer Charged, Banned For Li...          -0.7269
2  Woman Who Called Cops On Black Bird-Watcher Lo...          -0.8957
3  Cleaner Was Dead In Belk Bathroom For 4 Days B...          -0.8779
4  Reporter Gets Adorable Surprise From Her Boyfr...           0.6486


In [6]:
# Re-inspect the first five rows of the frame as it stands at this point.
print(data.head(n=5))

                                                link  \
0  https://www.huffpost.com/entry/covid-boosters-...   
1  https://www.huffpost.com/entry/american-airlin...   
2  https://www.huffpost.com/entry/amy-cooper-lose...   
3  https://www.huffpost.com/entry/belk-worker-fou...   
4  https://www.huffpost.com/entry/reporter-gets-a...   

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   
3  Cleaner Was Dead In Belk Bathroom For 4 Days B...  U.S. NEWS   
4  Reporter Gets Adorable Surprise From Her Boyfr...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  Amy Cooper accused inve

In [7]:
# Length of the combined text in characters
data['text_length'] = data['combined_text'].str.len()

# (Optional) Length in words, where a word is any whitespace-separated token
data['word_count'] = [len(text.split()) for text in data['combined_text']]

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer (you can tune max_features or ngram_range)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit the vocabulary on combined_text and transform it in one pass; the result
# is a sparse matrix with one row per record and one column per retained term.
tfidf_matrix = tfidf.fit_transform(data['combined_text'])

# Convert the sparse matrix to a dense DataFrame, one column per term.
# Every TF-IDF column gets a 'tfidf_' prefix: the vocabulary is made of ordinary
# words, so a raw term such as 'date', 'month' or 'category' would otherwise
# duplicate an existing column label in `data`. Duplicate labels make
# data['month'] return a two-column DataFrame and get renamed on a CSV round-trip.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf.get_feature_names_out(),
).add_prefix('tfidf_')

# Append the TF-IDF features column-wise. Both frames are re-indexed 0..n-1 so
# rows are matched by position rather than by whatever index `data` carries.
data = pd.concat([data.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)


In [9]:
# Write the enriched frame to disk; the row index is left out of the file.
data.to_csv(path_or_buf="stock(updated).csv", index=False)