In [1]:
!pip install yfinance schedule requests pandas textblob joblib




[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import time
from datetime import datetime

import joblib
import nltk
import pandas as pd
import requests
import schedule
import yfinance as yf
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from textblob import TextBlob

In [3]:
# Download the NLTK resources this notebook relies on.
# 'vader_lexicon' backs SentimentIntensityAnalyzer, and 'punkt_tab' is the
# tokenizer data that newer NLTK releases look up for word_tokenize; without
# them the tokenizing / sentiment cells raise LookupError on a fresh machine.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\muham\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read the NewsAPI key from the NEWSAPI_KEY environment variable instead of
# storing it in the notebook (falls back to an empty string when unset).
# NOTE(review): a literal key used to be committed on this line; treat it as
# leaked and rotate it.
my_api_keys = os.environ.get('NEWSAPI_KEY', '')

In [5]:
# Shared text-preprocessing resources, created once and reused by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased, tokenised with ``word_tokenize``, filtered
    against ``stop_words`` and lemmatised with ``lemmatizer``.

    Parameters
    ----------
    text : str or None
        Raw text to clean. Missing values (``None``, NaN) and any other
        non-string input are treated as empty text.

    Returns
    -------
    str
        The remaining lemmas joined by single spaces, or ``''`` when there
        is no text to process.
    """
    # Guard first: None / NaN would otherwise fail on .lower().
    if not isinstance(text, str) or not text:
        return ''

    # Single pass: drop stop words and lemmatise what is left.
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token not in stop_words
    ]
    return ' '.join(lemmas)

In [6]:
# Analyze sentiment
def analyze_sentiment(text):
    """Score ``text`` with NLTK's VADER sentiment analyzer.

    Parameters
    ----------
    text : str
        Text passed to ``SentimentIntensityAnalyzer.polarity_scores``.

    Returns
    -------
    tuple
        ``(neg, neu, pos)`` taken from the ``polarity_scores`` result.
    """
    # Constructing the analyzer loads the VADER lexicon, so build it once and
    # keep it on the function object instead of re-creating it on every call
    # (this function is invoked once per row by the caller).
    sia = getattr(analyze_sentiment, '_sia', None)
    if sia is None:
        sia = analyze_sentiment._sia = SentimentIntensityAnalyzer()

    sentiment = sia.polarity_scores(text)
    return sentiment['neg'], sentiment['neu'], sentiment['pos']

def update_sentiment_analysis(df):
    """Add sentiment score columns derived from the ``content`` column.

    ``df`` is modified in place and also returned. Rows whose ``content`` is
    missing, empty or not a string receive ``0.0`` for all three scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``content`` column. A completely empty frame (no rows,
        no columns) is accepted and simply gains the empty score columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with float columns ``Negative``, ``Nuetral``
        and ``Positive`` added.
    """
    # NOTE: 'Nuetral' is misspelled, but it is the column name that ends up in
    # the saved output; it is kept unchanged so the schema stays compatible.
    sentiment_columns = ['Negative', 'Nuetral', 'Positive']

    if df.empty and 'content' not in df.columns:
        # A frame built from zero records has no 'content' column to score.
        scores = []
    else:
        # isinstance check keeps NaN (truthy, but not text) away from VADER.
        scores = [
            analyze_sentiment(text) if isinstance(text, str) and text else (0.0, 0.0, 0.0)
            for text in df['content']
        ]

    # Assign column by column from plain lists: this avoids building one
    # pd.Series per row and also works when there are no rows at all.
    for position, column in enumerate(sentiment_columns):
        df[column] = pd.Series(
            [score[position] for score in scores], index=df.index, dtype='float64'
        )
    return df


In [7]:
def fetch_news(api_key, query, from_date, to_date, page_size=100):
    """Fetch articles from NewsAPI's ``/v2/everything`` endpoint and score them.

    Parameters
    ----------
    api_key : str
        NewsAPI key, sent in the ``X-Api-Key`` header.
    query : str
        Search phrase (sent as ``q``).
    from_date, to_date : str
        Bounds of the search window (sent as ``from`` / ``to``).
    page_size : int, default 100
        Number of articles requested (sent as ``pageSize``).

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``source``, ``author``,
        ``title``, ``description``, ``url``, ``publishedAt`` (UTC datetimes)
        and ``content``, sorted ascending by ``publishedAt``, plus the
        ``Negative`` / ``Nuetral`` / ``Positive`` sentiment columns. When no
        articles are returned the frame is empty but has the same columns.

    Raises
    ------
    requests.RequestException
        On network failures, timeouts, or an HTTP error without a JSON body.
    RuntimeError
        When NewsAPI answers with an error payload or a non-success status.
    """
    article_columns = ['source', 'author', 'title', 'description', 'url',
                       'publishedAt', 'content']

    # Let requests build the query string so values such as 'Visa Inc' are
    # URL-encoded, and send the key as a header so it is not part of the URL.
    # The timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(
        'https://newsapi.org/v2/everything',
        params={'q': query, 'from': from_date, 'to': to_date, 'pageSize': page_size},
        headers={'X-Api-Key': api_key},
        timeout=30,
    )

    try:
        data = response.json()
    except ValueError:
        # Body is not JSON: surface the HTTP error if there is one,
        # otherwise re-raise the decoding problem.
        response.raise_for_status()
        raise

    if not response.ok or data.get('status') == 'error':
        raise RuntimeError(
            f"NewsAPI request failed ({data.get('code', response.status_code)}): "
            f"{data.get('message', response.reason)}"
        )

    records = [
        {
            'source': (article.get('source') or {}).get('name'),
            'author': article.get('author'),
            'title': article.get('title'),
            'description': article.get('description'),
            'url': article.get('url'),
            'publishedAt': article.get('publishedAt'),
            'content': article.get('content'),
        }
        for article in data.get('articles') or []
    ]

    # Passing the column list keeps the schema stable even with zero records.
    news_df = pd.DataFrame(records, columns=article_columns)

    # utc=True converts tz-aware timestamps to UTC and localises naive ones.
    news_df['publishedAt'] = pd.to_datetime(news_df['publishedAt'], utc=True)
    news_df = news_df.sort_values(by='publishedAt').reset_index(drop=True)

    if news_df.empty:
        # Nothing to score; still expose the sentiment columns so callers
        # always receive the same set of columns.
        for column in ('Negative', 'Nuetral', 'Positive'):
            news_df[column] = pd.Series(dtype='float64')
        return news_df

    # Update sentiment analysis
    return update_sentiment_analysis(news_df)

In [8]:
def fetch_stock_price(stock_symbol, from_stock_date, current_date, interval='1wk'):
    """Fetch historical opening prices for a ticker via yfinance.

    Parameters
    ----------
    stock_symbol : str
        Ticker symbol passed to ``yf.Ticker``.
    from_stock_date, current_date : str or datetime-like
        Start and end of the requested range; both are parsed with
        ``pd.to_datetime`` before being handed to ``history``.
    interval : str, default '1wk'
        Sampling interval forwarded to ``history``. The default keeps the
        weekly frequency this function has always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (UTC datetimes) and ``Stock Price`` (the ``Open``
        value), sorted ascending by ``Date``. Empty, with the same two
        columns, when no price data comes back.
    """
    start = pd.to_datetime(from_stock_date)
    end = pd.to_datetime(current_date)

    history = yf.Ticker(stock_symbol).history(start=start, end=end, interval=interval)

    # No rows (or no 'Open' column) would make the column selection below
    # fail; hand back an empty frame with the expected schema instead.
    if history.empty or 'Open' not in history.columns:
        return pd.DataFrame({
            'Date': pd.Series(dtype='datetime64[ns, UTC]'),
            'Stock Price': pd.Series(dtype='float64'),
        })

    # Keep only the opening price and turn the date index into a column.
    prices = history[['Open']].reset_index()
    prices.columns = ['Date', 'Stock Price']

    # utc=True handles both tz-aware and tz-naive dates, whereas
    # .dt.tz_convert raises on a tz-naive column.
    prices['Date'] = pd.to_datetime(prices['Date'], utc=True)

    return prices.sort_values(by='Date').reset_index(drop=True)

In [9]:
def clean_data(df):
    """Return a cleaned copy of a merged news / stock-price frame.

    Steps, in order: drop rows with a missing value in ``publishedAt``,
    ``Date``, ``content`` or ``Stock Price``; drop fully duplicated rows;
    convert ``publishedAt`` and ``Date`` to UTC datetimes; sort by
    ``publishedAt`` descending and renumber the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four critical columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    critical_columns = ['publishedAt', 'Date', 'content', 'Stock Price']
    return (
        df.dropna(subset=critical_columns)
        .drop_duplicates()
        # assign() builds a new frame rather than writing columns into the
        # filtered result, which avoids pandas' chained-assignment warning.
        .assign(
            publishedAt=lambda frame: pd.to_datetime(frame['publishedAt'], utc=True),
            Date=lambda frame: pd.to_datetime(frame['Date'], utc=True),
        )
        .sort_values(by='publishedAt', ascending=False)
        .reset_index(drop=True)
    )

In [10]:
# Persist a DataFrame as a CSV file
def save_to_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=file_path, index=False)

### Usage Examples


In order for the program to work you need an API Key. Get your API Key from here: https://newsapi.org/

You can change the query, for example to a different company name.

You can change the stock symbol

In [12]:
# Example usage
# The key is read from the NEWSAPI_KEY environment variable so it never has to
# be typed into the notebook; register for one at https://newsapi.org/
api_key = os.environ.get('NEWSAPI_KEY', '')
if not api_key:
    raise ValueError(
        'Set the NEWSAPI_KEY environment variable to your NewsAPI key '
        '(https://newsapi.org/).'
    )

query = 'Visa Inc'  # You can change query e.g. company name
stock_symbol = 'V'  # You can change the stock symbol

# The news window must run from the older date to the newer one:
# `from_date` is the fixed start and `to_date` is today.
# NOTE(review): NewsAPI plans may limit how far back `from` can reach;
# confirm against the plan of the key in use.
current_date = datetime.now().strftime('%Y-%m-%d')
from_date = '2020-06-26'
to_date = current_date
from_stock_date = '2020-06-26'
output_csv = f'Update{stock_symbol}__{current_date}_data.csv'

# Fetch news data
news_df = fetch_news(api_key, query, from_date, to_date)

# Fetch stock price data
stock_price_df = fetch_stock_price(stock_symbol, from_stock_date, current_date)

# Attach to each article the most recent stock price dated at or before its
# publication time (merge_asof needs the left frame sorted on the key).
merged_df = pd.merge_asof(news_df.sort_values('publishedAt'), stock_price_df,
                          left_on='publishedAt', right_on='Date', direction='backward')

# Ensure final sort order by publishedAt (newest first)
merged_df = merged_df.sort_values(by='publishedAt', ascending=False).reset_index(drop=True)

# Save merged data to CSV
save_to_csv(merged_df, output_csv)