In [120]:
import requests
import time
import pandas as pd
from api_keys import api_key
import pprint as pprint

In [121]:
# Read the keyword spreadsheet; it also carries the list of tickers to query
file_path = 'Resourses/newsKeyWords.csv'  # adjust if the CSV lives somewhere else
df = pd.read_csv(file_path)

# Tickers are taken positionally from the second column (index 1).
# De-duplicate them so every symbol is requested from the API only once.
ticker_series = df.iloc[:, 1]
tickers = ticker_series.unique()

tickers
# To process only a later batch of tickers, slice the array, e.g.:
# tickers_100 = tickers[200:]
# tickers_100

array(['WMT', 'AMZN', 'XOM', 'AAPL', 'UNH', 'CVS', 'BRKA', 'GOOGL', 'MCK',
       'CVX', 'ABC', 'COST', 'MSFT', 'CAH', 'CI', 'MPC', 'PSX', 'VLO',
       'F', 'HD', 'GM', 'ELV', 'JPM', 'KR', 'CNC', 'VZ', 'WBA', 'FNMA',
       'CMCSA', 'T', 'META', 'BAC', 'TGT', 'DELL', 'ADM', 'C', 'UPS',
       'PFE', 'LOW', 'JNJ', 'FDX', 'HUM', 'ET', 'FMCC', 'PEP', 'WFC',
       'DIS', 'COP', 'TSLA', 'PG', 'GE', 'ACI', 'MET', 'GS', 'SYY', 'RTX',
       'BA', 'SNEX', 'LMT', 'MS', 'INTC', 'HPQ', 'SNX', 'IBM', 'HCA',
       'PRU', 'CAT', 'MRK', 'INT', 'EPD', 'ABBV', 'PAGP', 'DOW', 'AIG',
       'AXP', 'CHTR', 'TSN', 'DE', 'CSCO', 'ALL', 'DAL', 'TJX', 'PGR',
       'AAL', 'PFGC', 'PBF', 'NKE', 'BBY', 'BMY', 'UAL', 'TMO', 'QCOM',
       'ABT', 'KO', 'ORCL', 'NUE', 'GD', 'COF', 'DINO', 'DG', 'ARW',
       'OXY', 'TRV', 'NOC', 'HON', 'MMM', 'USFD', 'WBD', 'LEN', 'DHI',
       'JBL', 'LNG', 'AVGO', 'KMX', 'SBUX', 'MOH', 'UBER', 'PM', 'NFLX',
       'NRG', 'MDLZ', 'DHR', 'CRM', 'PARA', 'CBRE', 'MU', 'V', 'SO',


In [122]:
# First business day of every month from January to December 2023, as 'YYYY-MM-DD' strings.
# Offset objects are used instead of the 'BMS'/'BM' string aliases: the 'BM' alias is
# deprecated in recent pandas releases (renamed 'BME'), whereas the offset classes work
# on old and new versions alike.
business_start_2023 = (
    pd.date_range(start='2023-01-01', end='2023-12-31', freq=pd.offsets.BMonthBegin())
    .strftime('%Y-%m-%d')
    .tolist()
)

# Last business day of every month from January to December 2023, same format
business_last_2023 = (
    pd.date_range(start='2023-01-01', end='2023-12-31', freq=pd.offsets.BMonthEnd())
    .strftime('%Y-%m-%d')
    .tolist()
)

# Display the generated dates
print(business_start_2023)
print(business_last_2023)


['2023-01-02', '2023-02-01', '2023-03-01', '2023-04-03', '2023-05-01', '2023-06-01', '2023-07-03', '2023-08-01', '2023-09-01', '2023-10-02', '2023-11-01', '2023-12-01']
['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-28', '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31', '2023-09-29', '2023-10-31', '2023-11-30', '2023-12-29']


In [123]:
# Assumes tickers, business_start_2023 and business_last_2023 were defined by the cells above

# Optional: limit to the first 2 tickers for a quick trial run
#tickers = tickers[:2]

# Optional: limit to the first 4 months
#business_start_2023 = business_start_2023[:4]
#business_last_2023 = business_last_2023[:4]
# Spot-check a single ticker by position
tickers[101]

'OXY'

In [124]:


def _collect_keywords_and_homepages(articles):
    """Gather keywords (in order, duplicates kept) and publisher homepages from article dicts."""
    keywords = []
    homepages = set()
    for article in articles or []:
        # 'keywords' / 'publisher' may be absent or explicitly null in the payload
        keywords.extend(article.get('keywords') or [])
        homepage_url = (article.get('publisher') or {}).get('homepage_url', '')
        if homepage_url:
            homepages.add(homepage_url)
    return keywords, homepages


def fetch_news_data(tickers, start_dates, end_dates, api_key, df,
                    request_delay=12, timeout=30, session=None):
    """Fetch news per ticker and month and store keywords/homepages in ``df``.

    For every ticker and every (start_date, end_date) pair, the Polygon news
    endpoint is queried (at most 5 articles per request). Keywords of the
    returned articles are appended, separated by '; ', to the column named
    after the month abbreviation of the start date ('Jan', 'Feb', ...).
    Publisher homepage URLs seen for a ticker are merged into the
    space-separated 'homepages' column.

    Parameters
    ----------
    tickers : iterable
        Ticker symbols to query. Symbols that do not occur in ``df['Ticker']``
        are skipped, since there would be no row to store their results in.
    start_dates, end_dates : sequence of str
        Parallel sequences of date strings; pairs are formed with ``zip``.
    api_key : str
        API key sent with every request.
    df : pandas.DataFrame
        Frame with a 'Ticker' column. It is modified in place (missing month
        and 'homepages' columns are created) and also returned.
    request_delay : float, default 12
        Seconds to sleep after each request (rate limiting). 0 disables it.
    timeout : float, default 30
        Per-request timeout in seconds, so a stalled connection cannot hang
        the whole run.
    session : object, optional
        Anything exposing ``get(url, params=..., timeout=...)``, for example a
        ``requests.Session``. Defaults to the ``requests`` module itself.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If ``df`` has no 'Ticker' column.
    """
    # Month abbreviation ('Jan', 'Feb', ...) of each start date = target column name
    short_month_names = [date.strftime("%b") for date in pd.to_datetime(start_dates)]

    # Create the result columns that do not exist yet
    for column in [*short_month_names, 'homepages']:
        if column not in df.columns:
            df[column] = ''

    http = session if session is not None else requests
    news_url = "https://api.polygon.io/v2/reference/news"

    for ticker in tickers:
        rows = df['Ticker'] == ticker
        if not rows.any():
            # Nothing could be stored for this symbol; don't spend API calls on it
            print(f"Skipping {ticker}: not found in the 'Ticker' column")
            continue

        homepages_combined = set()
        for start_date, end_date, short_month in zip(start_dates, end_dates, short_month_names):
            # NOTE(review): '.gt' / '.lt' look like exclusive bounds, so articles
            # stamped exactly on a boundary may be left out — confirm with the API docs.
            params = {
                'ticker': ticker,
                'published_utc.gt': start_date,
                'published_utc.lt': end_date,
                'limit': 5,
                'apiKey': api_key,
            }
            response = None
            try:
                # Passing params lets the HTTP library do the URL encoding
                response = http.get(news_url, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Same best-effort policy as a non-200 answer: report and move on
                print(f"Failed to fetch news for {ticker}")
                print(exc)

            if response is not None and response.status_code == 200:
                articles = response.json().get('results') or []
                keywords, homepages = _collect_keywords_and_homepages(articles)
                homepages_combined |= homepages

                # Cells loaded from CSV may be NaN instead of a string
                existing_keywords = df.loc[rows, short_month].values[0]
                existing_keywords = (
                    existing_keywords.strip('; ') if isinstance(existing_keywords, str) else ''
                )

                # Join only non-empty parts so no stray separators are produced
                updated_keywords = '; '.join(
                    part for part in [existing_keywords, *keywords] if part
                )
                df.loc[rows, short_month] = updated_keywords
            elif response is not None:
                print(f"Failed to fetch news for {ticker}")
                print(response.text)

            if request_delay:
                time.sleep(request_delay)  # Rate limiting

        # Merge newly seen homepages with the ones already stored for the ticker
        existing_homepages = df.loc[rows, 'homepages'].values[0]
        if not isinstance(existing_homepages, str):
            existing_homepages = ''
        merged_homepages = set(existing_homepages.split()) | homepages_combined
        # Sorted so the stored value is deterministic from run to run
        df.loc[rows, 'homepages'] = ' '.join(sorted(merged_homepages))

    return df

# Start from an empty 'homepages' column; it is filled in by fetch_news_data
df['homepages'] = ''

# Query the news API for every ticker and every month of 2023,
# storing keywords and publisher homepages in df
df = fetch_news_data(
    tickers=tickers,
    start_dates=business_start_2023,
    end_dates=business_last_2023,
    api_key=api_key,
    df=df,
)




In [125]:
# Write the enriched frame back to disk without the index column.
# NOTE: this is the same path the data was loaded from, so the input file is overwritten.
df.to_csv("Resourses/newsKeyWords.csv", index=False)

In [126]:
#df = df.drop(['homepages', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], axis=1)