In [1]:
import urllib.request
import bs4 as bs
import time
import datetime

import pandas as pd
import numpy as np
import yfinance as yf

In [2]:
t_df = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

num_tickers = 500

tickers = list(t_df["Symbol"][0:num_tickers])

#tickers = ["TSLA",  "EBAY", "META" ]   # for test
#print(tickers)

In [None]:
#STOCK

sleep_time  = 5

def extract_dates(parsed_doc):
    from math import floor
    today = datetime.datetime.now().date()
    dates = parsed_doc.find_all('pubdate')
    list_date = []
    for date  in dates:
        d = datetime.datetime.strptime( date.text, "%a, %d %b %Y %H:%M:%S GMT" )
        list_date += [floor((today - d.date()).days / 1)]                          #.days
    return list_date


def get_titles(ticker, sleep_time):
    '''
    This function return the url response from google news and parsed html for a stock ticker.
    The time parameter is the sleep time before each request
    '''
    url = 'https://news.google.com/rss/search?hl=en-US&q='+ticker+'%20stock&gl=US&ceid=US:en'

    time.sleep(sleep_time) ## wait 15 seconds between each request. This is SUPER IMPORTANT otherwise your IP-address will be banned for sending too frequent requests.

    doc = urllib.request.urlopen(url).read()
    parsed_doc = bs.BeautifulSoup(doc,'lxml')
    titles = parsed_doc.find_all('title')[1:]
    list_date = extract_dates(parsed_doc)                 # we  assume/have verified that the date are extracted in the same order as the titles
    dct = {"Ticker": [ ticker ]*len(titles) , "Titles" : titles, "Date" : list_date}
    title_df = pd.DataFrame(dct)
    title_df = title_df.sort_values(by="Date")
    title_df = title_df[title_df["Date"]==0]
    return title_df

list_df = []                        #dictionary with input data
for ticker in tickers:
    title = get_titles(ticker, sleep_time)
    list_df.append(title)
    print(title)

#input_df = pd.DataFrame(dct.values(),index = tickers,columns=["Titles"])

In [4]:
input_df = pd.concat(list_df)
input_df.shape
print(input_df)

   Ticker                                             Titles  Date
0     MMM  [3M Stock: A Deep Dive Into Analyst Perspectiv...     0
2     MMM  [Advisory Services Network LLC Sells 7897 Shar...     0
8     MMM  [Barclays Boosts 3M (NYSE:MMM) Price Target to...     0
0     AOS  [Why A.O. Smith (AOS) is a Top Momentum Stock ...     0
1     AOS  [A. O. Smith Co. (NYSE:AOS) Holdings Trimmed b...     0
..    ...                                                ...   ...
0     XYL  [D.A. Davidson & CO. Increases Stock Position ...     0
0     YUM  [TD Cowen reaffirms Yum! Brands stock buy rati...     0
1     YUM  [TD Cowen Reiterates Buy Rating for Yum! Brand...     0
2     YUM  [(YUM) Proactive Strategies - Stock Traders Da...     0
10    YUM  [Commerce Bank Has $2.67 Million Stock Positio...     0

[1530 rows x 3 columns]


In [5]:
sleep_time = 5
threshold = 0.01

def get_var_prc(ticker, sleep_time):
    tickerData = yf.Ticker(ticker)
    time.sleep(sleep_time)  # Wait between requests
    df = tickerData.history()
    if len(df) < 2:
        print(f"Insufficient data for {ticker}")
        return None
    df['Change'] = df['Close'].diff()
    return df["Change"].iloc[-1] / df["Close"].iloc[-2]  # Use iloc for integer indexing

def get_target(x):
    if x < -threshold:
        return -1
    elif abs(x) < threshold:
        return 0
    else:
        return 1



# Add a new column 'Var_prc' and assign values to it
for ticker in input_df['Ticker']:
    var_prc = get_var_prc(ticker, sleep_time)
    if var_prc is not None:  # Check if data is available
        input_df.loc[input_df["Ticker"] == ticker, 'Var_prc'] = var_prc
        input_df.loc[input_df["Ticker"] == ticker, 'Target'] = input_df[input_df["Ticker"] == ticker]['Var_prc'].apply(get_target)
print(input_df)

BF.B: No price data found, symbol may be delisted (period=1mo)


Insufficient data for BF.B
   Ticker                                             Titles  Date   Var_prc  \
0     MMM  [3M Stock: A Deep Dive Into Analyst Perspectiv...     0 -0.009933   
2     MMM  [Advisory Services Network LLC Sells 7897 Shar...     0 -0.009933   
8     MMM  [Barclays Boosts 3M (NYSE:MMM) Price Target to...     0 -0.009933   
0     AOS  [Why A.O. Smith (AOS) is a Top Momentum Stock ...     0 -0.026421   
1     AOS  [A. O. Smith Co. (NYSE:AOS) Holdings Trimmed b...     0 -0.026421   
..    ...                                                ...   ...       ...   
0     XYL  [D.A. Davidson & CO. Increases Stock Position ...     0 -0.026967   
0     YUM  [TD Cowen reaffirms Yum! Brands stock buy rati...     0 -0.013004   
1     YUM  [TD Cowen Reiterates Buy Rating for Yum! Brand...     0 -0.013004   
2     YUM  [(YUM) Proactive Strategies - Stock Traders Da...     0 -0.013004   
10    YUM  [Commerce Bank Has $2.67 Million Stock Positio...     0 -0.013004   

    Target  

In [6]:
input_df.to_csv("data_2805_comma.csv")
input_df.to_csv("data_2805_semicolon.csv",sep=";")


In [7]:
#from google.colab import files
#files.download("data_2705.csv")
