In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime

# Define stock symbol and date range
stock_symbol = 'AAPL'  # Example: Apple stock
start_date = '2022-10-01'
end_date = datetime.today().strftime('%Y-%m-%d')

# Download stock data
stock_data = yf.download(stock_symbol, start=start_date, end=end_date)
stock_data = stock_data[['Close']].reset_index()  # Keep only 'Date' and 'Close'

print(stock_data.head())
print(stock_data.tail())

[*********************100%***********************]  1 of 1 completed

Price        Date       Close
Ticker                   AAPL
0      2022-10-03  142.449997
1      2022-10-04  146.100006
2      2022-10-05  146.399994
3      2022-10-06  145.429993
4      2022-10-07  140.089996
Price        Date       Close
Ticker                   AAPL
539    2024-11-22  229.869995
540    2024-11-25  232.869995
541    2024-11-26  235.059998
542    2024-11-27  234.929993
543    2024-11-29  237.330002





In [None]:
# "1e78027b-d07c-4e35-9a0a-8f1d2b4e5549"    # "42650c85-dd10-4ccd-b974-a1832d8902ec"

In [2]:
import requests
from datetime import datetime, timedelta
import time  # Import the time module for sleep function

api_key = 'f3e342dc-477b-4784-bba2-a0916569947b'
base_url = 'https://content.guardianapis.com/search'
headers = {'Content-Type': 'application/json'}

def get_news_data(start_date, end_date):
    news_data = []
    current_date = datetime.strptime(start_date, '%Y-%m-%d')

    while current_date <= datetime.strptime(end_date, '%Y-%m-%d'):
        date_str = current_date.strftime('%Y-%m-%d')
        print(f"Fetching data for: {date_str}")
        
        params = {
            'section': 'business',
            'page-size': 200,
            'from-date': date_str,
            'to-date': date_str,
            'show-fields': 'body',
            'api-key': api_key,
        }
        response = requests.get(base_url, params=params, headers=headers)
        if response.status_code == 200:
            results = response.json().get("response", {}).get("results", [])
            for article in results:
                if "fields" in article and "body" in article["fields"]:
                    news_data.append({'date': date_str, 'content': article["fields"]["body"]})
        else:
            print(f"Failed to fetch data for {date_str}: {response.status_code}")
        
        current_date += timedelta(days=1)
        time.sleep(1)  # Respect API rate limits
    
    return pd.DataFrame(news_data)

news_df = get_news_data(start_date, end_date)

Fetching data for: 2022-10-01
Fetching data for: 2022-10-02
Fetching data for: 2022-10-03
Fetching data for: 2022-10-04
Fetching data for: 2022-10-05
Fetching data for: 2022-10-06
Fetching data for: 2022-10-07
Fetching data for: 2022-10-08
Fetching data for: 2022-10-09
Fetching data for: 2022-10-10
Fetching data for: 2022-10-11
Fetching data for: 2022-10-12
Fetching data for: 2022-10-13
Fetching data for: 2022-10-14
Fetching data for: 2022-10-15
Fetching data for: 2022-10-16
Fetching data for: 2022-10-17
Fetching data for: 2022-10-18
Fetching data for: 2022-10-19
Fetching data for: 2022-10-20
Fetching data for: 2022-10-21
Fetching data for: 2022-10-22
Fetching data for: 2022-10-23
Fetching data for: 2022-10-24
Fetching data for: 2022-10-25
Fetching data for: 2022-10-26
Fetching data for: 2022-10-27
Fetching data for: 2022-10-28
Fetching data for: 2022-10-29
Fetching data for: 2022-10-30
Fetching data for: 2022-10-31
Fetching data for: 2022-11-01
Fetching data for: 2022-11-02
Fetching d

In [5]:
news_df.head(), news_df.tail()

(         date                                            content
 0  2022-10-01  <p>Will it be a merry Christmas? It will certa...
 1  2022-10-01  <p>The owner of British Steel, the UK’s second...
 2  2022-10-01  <p>A former steelworks in Redcar has been pull...
 3  2022-10-01  <p>Thousands of supporters of Just Stop Oil ha...
 4  2022-10-01  <p>‘You haven’t been in the office this week. ...,
             date                                            content
 7841  2024-12-02  <div id="block-674d5f3f8f08613772568f0c" class...
 7842  2024-12-02  <p>Global carbon emissions would be 6% lower t...
 7843  2024-12-02  <p>Workers at Volkswagen factories in Germany ...
 7844  2024-12-02  <p>More than 1,500 Woolworths warehouse worker...
 7845  2024-12-02  <p>Growth expectations among UK companies have...)

In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Download the VADER lexicon (only needed once)
nltk.download('vader_lexicon')

# Initialize the Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Define the sentiment analysis function
def analyze_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']  # Use compound score for overall sentiment

# Apply the function to your news DataFrame
news_df['sentiment'] = news_df['content'].apply(analyze_sentiment)
print(news_df.head())


[nltk_data] Downloading package vader_lexicon to C:\Users\RUTHVIK
[nltk_data]     REDDY\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


         date                                            content  sentiment
0  2022-10-01  <p>Will it be a merry Christmas? It will certa...     0.6407
1  2022-10-01  <p>The owner of British Steel, the UK’s second...     0.9946
2  2022-10-01  <p>A former steelworks in Redcar has been pull...     0.9603
3  2022-10-01  <p>Thousands of supporters of Just Stop Oil ha...     0.9480
4  2022-10-01  <p>‘You haven’t been in the office this week. ...     0.9923


In [37]:
csv_filename = "news_data.csv"
news_df.to_csv(csv_filename, index=False)
print(f"News data saved to {csv_filename}")

News data saved to news_data.csv


In [8]:
news_sentiment_avg = news_df.groupby('date')['sentiment'].mean().reset_index()
news_sentiment_avg.rename(columns={'sentiment': 'avg_sentiment'}, inplace=True)
print(news_sentiment_avg.head())


         date  avg_sentiment
0  2022-10-01       0.534844
1  2022-10-02       0.156289
2  2022-10-03       0.150150
3  2022-10-04       0.754340
4  2022-10-05       0.475950


In [9]:
csv_filename = "news_avg_sentiment_data.csv"
news_sentiment_avg.to_csv(csv_filename, index=False)
print(f"News data saved to {csv_filename}")

News data saved to news_avg_sentiment_data.csv


In [33]:
print("Stock DataFrame columns:", stock_data.columns)
print("News Sentiment DataFrame columns:", news_sentiment_avg.columns)


Stock DataFrame columns: Index(['Date', 'Close'], dtype='object')
News Sentiment DataFrame columns: Index(['date', 'avg_sentiment'], dtype='object')


In [35]:
df = pd.read_csv("stock_sentiment_data.csv")

df.drop(columns=['date', 'avg_sentiment'], inplace=True)

In [36]:
df.to_csv("stock_sentiment_data.csv", index=False)

In [42]:
import pandas as pd

# Load the data
stocks_data = pd.read_csv('stock_data.csv', parse_dates=['Date'])
news_sentiment_data = pd.read_csv('news_avg_sentiment_data.csv', parse_dates=['date'])

# Rename the 'date' column in news_sentiment_data to 'Date' for consistency
news_sentiment_data.rename(columns={'date': 'Date'}, inplace=True)

# Ensure unique dates by dropping duplicates (if any)
stocks_data = stocks_data.drop_duplicates(subset=['Date'])
news_sentiment_data = news_sentiment_data.drop_duplicates(subset=['Date'])

# Merge the dataframes based on news dates ('Date' from news_sentiment_data)
merged_data = pd.merge(news_sentiment_data, stocks_data, on='Date', how='left')

# Forward fill missing stock values (using the previous day's stock value)
merged_data['Close'] = merged_data['Close'].fillna(method='ffill')

# Remove any duplicate dates that might have appeared after merging
merged_data = merged_data.drop_duplicates(subset=['Date'])

# Save the result to a new CSV file
merged_data.to_csv('merged_stock_sentiment_data.csv', index=False)

print("Merged data has been saved to 'merged_stock_sentiment_data.csv'.")


Merged data has been saved to 'merged_stock_sentiment_data.csv'.


  merged_data['Close'] = merged_data['Close'].fillna(method='ffill')
