## Step 1 News Data Cleaning

In [1]:
import pandas as pd



In [3]:
news_df = pd.read_csv('/Users/qlin/Desktop/sp500_focus_sources_1_23_dedup.csv')

In [6]:
# Count the number of unique 'content' values among the entire dataset
unique_entries = news_df['summary'].nunique()
print(f'There are {unique_entries} unique entries in the "summary" column.')

There are 64492 unique entries in the "summary" column.


In [5]:
# Drop the duplicated rows but keep the first occurence of each unique 'summary' value based on 'addDate' column
# First, sort the DataFrame by 'addDate'
df_sorted = news_df.sort_values('addDate')

# Then, drop duplicates based on 'content', keeping the first occurrence
df_dedup = df_sorted.drop_duplicates(subset='summary', keep='first')

# Check the number of rows in the deduplicated DataFrame
print(f'The deduplicated DataFrame has {df_dedup.shape[0]} rows.')

The deduplicated DataFrame has 64493 rows.


## Step 2 Filtering SP100 News Only

In [8]:
import pandas as pd
import ast

# Assume df_dedup is your DataFrame containing deduplicated news articles

# Step 1: Load the top 100 companies' tickers
with open('Data/Top_100_Companies_Tickers.txt', 'r') as file:
    top_100_tickers = file.read().splitlines()

# Step 2: Modify the function
def contains_top_100_company(companies_str):
    companies = ast.literal_eval(companies_str)  # Convert the string to a list
    for company in companies:
        if any(ticker in company['symbols'] for ticker in top_100_tickers):
            return True  # Return True if any of the top 100 companies' tickers is found
    return False  # Return False otherwise

In [9]:
# Select rows where 'symbols' contains any of the top 100 companies' tickers
df_top_100 = df_dedup[df_dedup['companies'].apply(contains_top_100_company)]

print(f'There are {len(df_top_100)} news articles related to the top 100 companies.')

There are 33925 news articles related to the top 100 companies.


In [45]:
# Filter only this columns: 'addDate', 'title', 'description', 'content', 'keywords', 'topics', 'entities', 'companies', 'summary'
news_top_100 = df_top_100[['addDate', 'title', 'entities', 'companies', 'summary']]

## Step 3 Find the Main Company Per News

In [60]:
import ast

def extract_companies_and_symbols(companies_str, top_100_tickers):
    companies = ast.literal_eval(companies_str)
    result = []
    for company in companies:
        matching_symbols = [symbol for symbol in company['symbols'] if symbol in top_100_tickers]
        if matching_symbols:
            result.append({'name': company['name'], 'symbols': matching_symbols})
    return result

with open('Data/Top_100_Companies_Tickers.txt', 'r') as file:
    top_100_tickers = file.read().splitlines()
    
# Apply the function to the 'companies' column
news_top_100['comp_ticker'] = news_top_100['companies'].apply(lambda x: extract_companies_and_symbols(x, top_100_tickers))

In [20]:
top_100_company = pd.read_csv("Data/Top_100_Companies.csv")
top_100_company

Unnamed: 0.1,Unnamed: 0,Ticker,Description,Sector,Market Capitalization
0,1,AAPL,Apple Inc.,Electronic Technology,2728017215293
1,2,MSFT,Microsoft Corporation,Technology Services,2351371643107
2,3,GOOG,Alphabet Inc.,Technology Services,1611856497958
3,4,GOOGL,Alphabet Inc.,Technology Services,1610343852181
4,5,AMZN,"Amazon.com, Inc.",Retail Trade,1366884251763
...,...,...,...,...,...
95,96,AMT,American Tower Corporation (REIT),Finance,82187878166
96,97,CB,Chubb Limited,Finance,82105914715
97,98,CI,The Cigna Group,Health Services,81693476157
98,99,C,"Citigroup, Inc.",Finance,81360929490


#### Calculate Mentions 

In [73]:
def calculate_mentions(comp_ticker, entities):
    mentions = {company['name']: 0 for company in comp_ticker}
    entity_counts = {}
    ''' 
    creates a dictionary mentions that maps each company name to a mention count of 0. Then, it iterates over the entities. 
    If an entity is of type 'ORG', it checks if the entity data is a substring of a company name or is in the company's symbols. 
    If it is, it updates the mention count for that company in the mentions dictionary with the maximum of the current mention count and the entity's mention count.
    '''
    # Normalize company names and symbols for comparison
    def normalize_name(name):
        return name.lower().replace('.', '').replace(',', '').replace('inc', '').replace('corp', '').strip()

    # Collect mentions counts for normalization
    for entity in entities:
        normalized_entity = normalize_name(entity['data'])
        entity_counts[normalized_entity] = entity['mentions']

    # Attempt to match entities to companies more flexibly
    for company in comp_ticker:
        company_name_normalized = normalize_name(company['name'])
        potential_matches = [company_name_normalized] + [normalize_name(symbol) for symbol in company['symbols']]
        
        # Initialize a list to store all matched mention counts
        matched_mentions = []

        for entity, count in entity_counts.items():
            # Check for direct match or if the entity name is a substring of the company name
            if entity in potential_matches or any(entity in match for match in potential_matches):
                matched_mentions.append(count)
                
        # If there are matched mentions, set the company's mentions to the highest one
        if matched_mentions:
            mentions[company['name']] = max(matched_mentions)
        # Otherwise, keep the default 0 value

    return mentions

# Given 'comp_ticker' and 'entities', apply the function
comp_ticker = [{'name': 'The Boeing Company', 'symbols': ['BA']}]
entities = [{'data': 'Boeing', 'type': 'ORG', 'mentions': 9}, {'data': 'NYSE', 'type': 'ORG', 'mentions': 1}, {'data': 'Bank of America', 'type': 'ORG', 'mentions': 2}, {'data': 'Susquehanna Financial Group', 'type': 'ORG', 'mentions': 3}, {'data': 'Morgan Stanley', 'type': 'ORG', 'mentions': 1}, {'data': '737 MAX', 'type': 'PRODUCT', 'mentions': 6}, {'data': '787', 'type': 'PRODUCT', 'mentions': 4}, {'data': 'Dreamliners', 'type': 'PRODUCT', 'mentions': 1}, {'data': 'Ronald Epstein', 'type': 'PERSON', 'mentions': 2}, {'data': 'Charles Minervino', 'type': 'PERSON', 'mentions': 1}]

# Calculate mentions
mentions = calculate_mentions(comp_ticker, entities)
mentions

# Apply the function to the 'comp_ticker' and 'entities' columns
news_top_100['mentions'] = news_top_100.apply(lambda row: calculate_mentions(row['comp_ticker'], row['entities']), axis=1)

In [79]:
news_top_100 = news_top_100[['addDate', 'title', 'entities', 'summary','parse_companies', 'comp_ticker', 'mentions']]

In [80]:
# save to csv
news_top_100.to_csv('/Users/qlin/Desktop/news_top_100.csv', index=False)

In [81]:
# Corrected function to handle ties without duplicating non-tie rows
def expand_rows_with_ties(news_top_100):
    expanded_data = []

    for index, row in news_top_100.iterrows():
        max_mentions = max(row['mentions'].values())
        tied_companies = [company for company, mentions in row['mentions'].items() if mentions == max_mentions]
        
        if len(tied_companies) > 1:
            # If there's a tie, duplicate the row for each company involved in the tie
            for company in tied_companies:
                new_row = row.copy()
                new_row['main_com'] = company
                expanded_data.append(new_row)
        else:
            # If there's no tie, just append the row as it is
            new_row = row.copy()
            new_row['main_com'] = tied_companies[0] if tied_companies else None
            expanded_data.append(new_row)
    
    return pd.DataFrame(expanded_data)


In [82]:
# Now apply the function to your DataFrame
expanded_df = expand_rows_with_ties(news_top_100)
len(expanded_df)

36406

In [85]:
# Sorting the expanded DataFrame alphabetically by the 'main_com' column
expanded_df['addDate'] = pd.to_datetime(expanded_df['addDate'])

expanded_news = expanded_df.sort_values(by=['main_com', 'addDate'], ascending=[True, True]).reset_index(drop=True)


In [88]:
# limit the data from 2023-01-01 to 2023-12-31 based on addDate column
expanded_news_limted = expanded_news[(expanded_news['addDate'] >= '2023-01-01') & 
                              (expanded_news['addDate'] <= '2023-12-31')]

In [93]:
expanded_news_limted = expanded_news_limted[['addDate', 'title', 'summary', 'main_com']]

In [102]:
# add a Month column to the expanded_news_limted DataFrame based on the 'addDate' column
expanded_news_limted['Month'] = expanded_news_limted['addDate'].dt.strftime('%B')

# save to csv
expanded_news_limted.to_csv('/Users/qlin/Desktop/expanded_news.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  expanded_news_limted['Month'] = expanded_news_limted['addDate'].dt.strftime('%B')


## Step 4 Using GPT to Summarize

In [135]:
expanded_news_limted

Unnamed: 0,addDate,title,summary,main_com,Month
42,2023-01-03 17:05:09.585710+00:00,AT&T Inc. (T) Is a Trending Stock: Facts to Kn...,"And if earnings estimates go up for a company,...",AT&T Inc.,January
43,2023-01-04 20:00:50.317817+00:00,AT&T rises as CFO Pascal says fiber join ventu...,AT&T rises as CFO Pascal says fiber join ventu...,AT&T Inc.,January
44,2023-01-05 12:22:49.284029+00:00,1 Top Dividend Stock to Buy for 2023 and Beyond,1 Top Dividend Stock to Buy for 2023 and Beyon...,AT&T Inc.,January
45,2023-01-05 20:09:04.026523+00:00,The Best Investing Advice for 2023,"\n\nEvery week, host and Zacks stock strategis...",AT&T Inc.,January
46,2023-01-08 13:08:33.480586+00:00,Top Picks 2023- ATT T,Management raised the mid-point of its 2022 ea...,AT&T Inc.,January
...,...,...,...,...,...
36401,2023-11-26 12:32:35.545066+00:00,Is Zoetis (NYSE:ZTS) Using Too Much Debt?,Zoetis Inc. (NYSE:ZTS) is using too much debt ...,Zoetis Inc.,November
36402,2023-12-12 16:28:12.516179+00:00,The Zacks Analyst Blog Highlights Invitation H...,Zacks.com has highlighted five stocks featured...,Zoetis Inc.,December
36403,2023-12-14 10:19:55.065065+00:00,Zoetis' (NYSE:ZTS) Upcoming Dividend Will Be L...,Zoetis Inc.'s (NYSE:ZTS) will increase its div...,Zoetis Inc.,December
36404,2023-12-22 12:24:57.671503+00:00,Is Now The Time To Put Zoetis (NYSE:ZTS) On Yo...,Zoetis (NYSE:ZTS) has been growing its earning...,Zoetis Inc.,December


In [136]:
# expanded_news_limted data exclude main_com = AT&T Inc.	AbbVie, Abbott Laboratories,Accenture plc, Adobe Inc., Advanced Micro Devices, Inc.
expanded_news_limted_exclude = expanded_news_limted[~expanded_news_limted['main_com'].isin(['AT&T Inc.', 'AbbVie Inc.', 'Abbott Laboratories','Accenture plc', 'Adobe Inc.', 'Advanced Micro Devices, Inc.'])]
len(expanded_news_limted_exclude)

29495

In [140]:
# collect only 'Alphabet Inc.' data from expanded_news_limted_exclude
expanded_news_limted_exclude_Alphabet = expanded_news_limted_exclude[expanded_news_limted_exclude['main_com'] == 'Alphabet Inc.']
expanded_news_limted_exclude_Alphabet 

Unnamed: 0,addDate,title,summary,main_com,Month
1807,2023-01-02 16:30:20.260512+00:00,"Big Tech will ‘have a better year’ in 2023, an...",\n\nThough it's been a bleak year for the sect...,Alphabet Inc.,January
1808,2023-01-03 10:47:55.766191+00:00,2 Remarkable Growth Stocks Set to Soar in 2023...,2 Remarkable Growth Stocks Set to Soar in 2023...,Alphabet Inc.,January
1809,2023-01-03 15:28:22.933077+00:00,Alphabet (GOOGL) Enhances Google Home App With...,Alphabet (GOOGL) Enhances Google Home App With...,Alphabet Inc.,January
1810,2023-01-03 16:46:08.645001+00:00,"Alphabet (GOOGL) Ups YouTube Efforts, Boosts G...","Alphabet (GOOGL) Ups YouTube Efforts, Boosts G...",Alphabet Inc.,January
1811,2023-01-03 16:53:14.148248+00:00,Wall Street Bulls Look Optimistic About Alphab...,"According to several studies, brokerage recomm...",Alphabet Inc.,January
...,...,...,...,...,...
2997,2023-12-29 13:03:33.061477+00:00,"Quiet Markets, Ugly Treasury Auction, Fed's Ag...",The S&P 500 gained less than two points or 0.0...,Alphabet Inc.,December
2998,2023-12-29 15:44:06.969417+00:00,If You Invested $1000 in Alphabet a Decade Ago...,Alphabet (GOOGL) is one of the most innovative...,Alphabet Inc.,December
2999,2023-12-29 15:52:36.569663+00:00,UCLA Wants to Buy Google’s Westside Pavilion O...,UCLA is reportedly planning to buy a former sh...,Alphabet Inc.,December
3000,2023-12-29 21:41:09.130704+00:00,'Trillion-dollar club' companies reach combine...,The combined market cap of US companies valued...,Alphabet Inc.,December


In [141]:
# Group by 'main_com' and 'month'
grouped = expanded_news_limted_exclude_Alphabet.groupby(['main_com', 'Month'])

In [129]:
# importing library
from openai._client import OpenAI

# Initialize the OpenAI client
key = input("Enter your OpenAI API key: ")

# instantiate the OpenAI client
client = OpenAI(api_key=key)


#### Custom Prompt

In [145]:
# Function to generate summaries using GPT-4
def generate_summary(client, company, news_summaries):

    # Define the system persona
    systemPersona = "You are an expert in finance and the technology industry."

    # Generate the prompt
    user_message = f"""
    As an expert in technology and financial industry news, your task is to distill key information from a collection of monthly news article summaries about a specific company. 
    You will provide a brief, integrated summary that captures the essence of the company's activities, milestones, and market presence during the month.

    The company in focus for this month's synthesis is {company}. Below are the key points extracted from news articles over the past month:

    {news_summaries}

    Please amalgamate this information into a comprehensive summary that includes:

    - Major events or announcements made by the company
    - Noteworthy financial occurrences or business dealings
    - Any significant changes in the company's strategy or market standing
    - Public or investor sentiment if mentioned in the articles

    Your summary should provide a clear overview of the company's news footprint for the month, highlighting any developments that could have a lasting impact on the company's trajectory.

    Generate a comprehensive summary no more than 150 words, ensuring it is succinct yet thorough enough to inform stakeholders of the company's monthly news highlights.

    Rely only on the details provided from the news summaries without incorporating external information.
    """
    # Call the API
    response = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[
            {"role": "system", "content": systemPersona},
            {"role": "user", "content": user_message}
        ]
    )
    # Return the summary provided by GPT-4
    return response.choices[0].message.content

### Collecting Output

In [146]:
# Collect the summaries into a new DataFrame
summary_records = []

for (company, month), group in grouped:
    # Concatenate all summaries for the company and month
    all_summaries = " ".join(group['summary'].tolist())
    # Generate the summary with GPT-4
    monthly_summary = generate_summary(client, company, all_summaries)
    # Append to the summary records
    summary_records.append({
        'company': company,
        'month': month,
        'monthly_summary': monthly_summary
    })

In [147]:
# Create a DataFrame from the summary records
summary_df = pd.DataFrame(summary_records)

In [149]:
comp6 = pd.read_csv("/Users/qlin/Desktop/summary.csv")
comp6

Unnamed: 0,company,month,monthly_summary
0,AT&T Inc.,April,AT&T Inc. faced potential market disruption bu...
1,AT&T Inc.,August,AT&T Inc. recently increased the monthly charg...
2,AT&T Inc.,December,"Over the past month, AT&T Inc. announced major..."
3,AT&T Inc.,February,AT&T has had a mixed month with various import...
4,AT&T Inc.,January,"AT&T, through its partnership with BlackRock, ..."
...,...,...,...
67,"Advanced Micro Devices, Inc.",March,"Advanced Micro Devices, Inc. (AMD) has had a n..."
68,"Advanced Micro Devices, Inc.",May,Advanced Micro Devices (AMD) released its Q1 r...
69,"Advanced Micro Devices, Inc.",November,Advanced Micro Devices (AMD) experienced mixed...
70,"Advanced Micro Devices, Inc.",October,Advanced Micro Devices (AMD) experienced a rob...


In [150]:
# combine com6 and summary_df
combined = pd.concat([comp6, summary_df], axis=0)

In [155]:
# Define the order
months_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Convert the 'month' column to a categorical type
combined['month'] = pd.Categorical(combined['month'], categories=months_order, ordered=True)

# Sort the DataFrame by the 'month' column but keep the 'company
combined = combined.sort_values(by=['company', 'month']).reset_index(drop=True)

# Reset the index of the DataFrame
combined = combined.reset_index(drop=True)

In [157]:
combined.to_csv('/Users/qlin/Desktop/summary.csv', index=False)