<a href="https://colab.research.google.com/github/Kussil/Financial_Sentiment_LLM/blob/main/02_Cleaned_Data/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries and Clone GitHub Repository

In [1]:
# Import Libraries
import os
from google.colab import userdata
import pandas as pd

In [2]:
# Fetch the GitHub access token stored under the Colab secret named 'github'
# and clone the project repository into ./Financial_Sentiment_LLM.
GITHUB_TOKEN = userdata.get('github')
# Store the same token in the GITHUB_TOKEN environment variable as well.
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN
# IPython substitutes {GITHUB_TOKEN} into the command line before the shell runs it.
# NOTE(review): the token becomes part of the clone URL, so it presumably ends up in the
# clone's .git/config remote entry -- confirm that is acceptable for this runtime.
# NOTE(review): re-running this cell without deleting the existing checkout will most
# likely make git refuse to clone into the non-empty directory -- verify before re-running.
!git clone https://{GITHUB_TOKEN}@github.com/Kussil/Financial_Sentiment_LLM.git

Cloning into 'Financial_Sentiment_LLM'...
remote: Enumerating objects: 1620, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 1620 (delta 20), reused 12 (delta 12), pack-reused 1595[K
Receiving objects: 100% (1620/1620), 274.58 MiB | 8.26 MiB/s, done.
Resolving deltas: 100% (1103/1103), done.
Updating files: 100% (1105/1105), done.


## Upload Investment Research Articles into DF

In [3]:
# Column layout shared by every source table built in this notebook
new_order = ['Source', 'Unique_ID', 'Ticker', 'Date', 'Article Headline', 'Article Text', 'URL']

# Read both halves of the investment research export
invest_df1 = pd.read_csv('Financial_Sentiment_LLM/01_Raw_Data/Investment_Research_ALLV2_Final_Trimmed_1.csv')
invest_df2 = pd.read_csv('Financial_Sentiment_LLM/01_Raw_Data/Investment_Research_ALLV2_Final_Trimmed_2.csv')

# Stack the halves, then rename, drop, tag, and reorder columns in one pass.
# reindex() creates any column in new_order that the raw data lacks (filled with NaN).
invest_df = (
    pd.concat([invest_df1, invest_df2], axis=0, ignore_index=True)
    .rename(columns={'Headline': 'Article Headline', 'Text': 'Article Text'})
    .drop(columns=['Contributor', 'Date/Time'])
    .assign(Source='Investment Research')
    .reindex(columns=new_order)
)

# Normalise tickers: remove literal periods, then trim surrounding whitespace
ticker_clean = invest_df['Ticker'].str.replace('.', '', regex=False)
invest_df['Ticker'] = ticker_clean.str.strip()

display(invest_df.shape)
display(invest_df.head())

(4896, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,,MRO,2024-05-16,Marathon Oil Corporation,"Stock Report | May 16, 2024 | NYSESymbol: MRO ...",
1,Investment Research,,EOG,2024-05-14,"EOG Resources, Inc.","Stock Report | May 14, 2024 | NYSESymbol: EOG ...",
2,Investment Research,,EOG,2024-05-11,"EOG Resources, Inc.","Stock Report | May 11, 2024 | NYSESymbol: EOG ...",
3,Investment Research,,DVN,2024-05-11,Devon Energy Corporation,"Stock Report | May 11, 2024 | NYSESymbol: DVN ...",
4,Investment Research,,COP,2024-05-07,ConocoPhillips,"Stock Report | May 07, 2024 | NYSESymbol: COP ...",


## Upload Proquest News Articles into DF and Clean Data

In [4]:
# Load the four raw ProQuest exports
proquest_df_cvx = pd.read_csv('Financial_Sentiment_LLM/01_Raw_Data/Proquest_Articles_CVX.csv')
proquest_df_xom = pd.read_csv('Financial_Sentiment_LLM/01_Raw_Data/ProQuest_Articles_XOM.csv')
proquest_df_others = pd.read_csv('Financial_Sentiment_LLM/01_Raw_Data/ProQuest_Articles_FINAL_BP_COP_OXY_SHEL.csv')
proquest_df_others2 = pd.read_csv('Financial_Sentiment_LLM/01_Raw_Data/ProQuest_Articles_FINAL_CXO_DVN_EOG_EQNR_HES_MPC_MRO_PSX_PXD_TTL.csv')

# Stack them into a single frame with a fresh 0..n-1 index
proquest_parts = [proquest_df_cvx, proquest_df_xom, proquest_df_others, proquest_df_others2]
proquest_df = pd.concat(proquest_parts, ignore_index=True)

# Show the size of each part followed by the size of the combined frame
for part in proquest_parts + [proquest_df]:
    display(part.shape)

# Rename, tag with the source name, and put columns into the shared layout
proquest_df = (
    proquest_df
    .rename(columns={'Title': 'Article Headline', 'Full Article Text': 'Article Text'})
    .assign(Source='ProQuest')
    .reindex(columns=new_order)
)

# Ticker clean-up: map TTL to TTE, remove literal periods, trim whitespace
proquest_df['Ticker'] = (
    proquest_df['Ticker']
    .str.replace('TTL', 'TTE')
    .str.replace('.', '', regex=False)
    .str.strip()
)

display(proquest_df.head())

(261, 6)

(1478, 6)

(1445, 6)

(932, 6)

(4116, 6)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,ProQuest,2703147472,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-ge...
1,ProQuest,2702197510,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
2,ProQuest,2697049245,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
3,ProQuest,2696371267,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/senators-a...
4,ProQuest,2695835797,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/shell-trie...


In [5]:
# Delete rows with missing article text.
# Marker strings that identify rows whose article text is missing.
search_text_1 = 'Failed to load content: Message:'
search_text_2 = 'Relevant content not found within the specified range.'
search_text_3 = 'TVM:UNDEFINED'

# Build the "missing text" mask once so the reported count and the filter cannot drift apart.
# Rows whose text is NaN are treated as missing too; a plain substring check skips them and
# the later clean_text step calls .startswith() on every value.
article_text = proquest_df['Article Text']
missing_text_mask = article_text.isna()
for marker in (search_text_1, search_text_2, search_text_3):
    # regex=False: the markers are literal text. str.contains defaults to regex matching,
    # where the '.' in search_text_2 would act as a wildcard.
    missing_text_mask = missing_text_mask | article_text.str.contains(marker, regex=False, na=False)

# Count the number of rows flagged as missing
count_rows = int(missing_text_mask.sum())

# Keep everything else; .copy() gives an independent frame so the column assignment
# performed in the next cell does not operate on a slice of the old frame.
proquest_df = proquest_df[~missing_text_mask].copy()

# Print the number of rows with missing article text and the new shape of the DataFrame
print(f"Number of rows with missing article text: {count_rows}")
print(f"New DataFrame shape: {proquest_df.shape}")
print()

# Confirm data is good by looking for short article headlines
shortest_headline = proquest_df.loc[proquest_df['Article Headline'].str.len().idxmin(), 'Article Headline']
print(f"The shortest article headline is: '{shortest_headline}'")
print()

# Confirm data is good by looking for short article text
shortest_text = proquest_df.loc[proquest_df['Article Text'].str.len().idxmin(), 'Article Text']
print(f"The shortest article text is: '{shortest_text}'")

Number of rows with missing article text: 233
New DataFrame shape: (3883, 7)

The shortest article headline is: 'BP PLC'

The shortest article text is: 'Turn on search term navigationTurn on search term navigation
| Jump to first hitOKLAHOMA CITY Devon Energy Corp.’s board of directors has declared a quarterly cash dividend of 9 cents per share on common stock for the first quarter of 2020. The dividend is payable on March 31 based on a record date of March 13.
'


In [6]:
# Clean extra text from the beginning of the article
def clean_text(text):
    """
    Remove the leading "Turn on search term navigation" banner from an article and trim whitespace.

    Texts that start with "Turn on search term" carry a boilerplate banner made of one or more
    repetitions of "Turn on search term navigation", optional whitespace, and an optional
    "| Jump to first hit" link. In its full form (two repetitions, a newline, and the link) the
    banner is 80 characters long. Only the banner pieces that are actually present are removed,
    so a shorter or differently spaced banner never costs real article text. The remaining text
    is stripped of leading and trailing whitespace.

    Texts that do not start with "Turn on search term" are returned unchanged (not stripped).
    Non-string values such as NaN or None are also returned unchanged instead of raising.

    Parameters:
    text (str): The text to be cleaned.

    Returns:
    str: The cleaned text, or the input itself when it is not a string or has no banner.
    """
    if not isinstance(text, str) or not text.startswith("Turn on search term"):
        return text

    navigation_banner = "Turn on search term navigation"
    jump_link = "| Jump to first hit"

    body = text
    # The banner phrase can be repeated back to back; peel off every repetition.
    while body.startswith(navigation_banner):
        body = body[len(navigation_banner):]

    # Whitespace (a newline in the full banner) separates the phrase from the jump link.
    body = body.lstrip()
    if body.startswith(jump_link):
        body = body[len(jump_link):]

    return body.strip()
# Run clean_text over every article body
cleaned_text = proquest_df['Article Text'].apply(clean_text)
proquest_df['Article Text'] = cleaned_text

# Sanity check: the shortest headline should still look like a real headline
headline_lengths = proquest_df['Article Headline'].str.len()
shortest_headline = proquest_df['Article Headline'].loc[headline_lengths.idxmin()]
print(f"The shortest article headline is: '{shortest_headline}'")
print()

# Sanity check: the shortest article body should still look like a real article
text_lengths = proquest_df['Article Text'].str.len()
shortest_text = proquest_df['Article Text'].loc[text_lengths.idxmin()]
print(f"The shortest article text is: '{shortest_text}'")

display(proquest_df.head())

The shortest article headline is: 'BP PLC'

The shortest article text is: 'OKLAHOMA CITY Devon Energy Corp.’s board of directors has declared a quarterly cash dividend of 9 cents per share on common stock for the first quarter of 2020. The dividend is payable on March 31 based on a record date of March 13.'


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,ProQuest,2703147472,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Chevron Corp.\nSpending projections are slidin...,https://www.proquest.com/newspapers/chevron-ge...
1,ProQuest,2702197510,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Chevron Corp. said it would cut its annual cap...,https://www.proquest.com/newspapers/chevron-sl...
2,ProQuest,2697049245,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Chevron Corp. said it would cut its annual cap...,https://www.proquest.com/newspapers/chevron-sl...
3,ProQuest,2696371267,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,SENATORS aired misgivings on Tuesday over pote...,https://www.proquest.com/newspapers/senators-a...
4,ProQuest,2695835797,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",LONDON—Royal Dutch Shell PLC said it would sta...,https://www.proquest.com/newspapers/shell-trie...


In [7]:
# Rows count as duplicates when ticker, date, and headline all match
# NOTE(review): the preview printed by this cell shows rows that share this key but differ in
# Unique_ID and Article Text -- confirm the key is not discarding distinct articles.
dedupe_keys = ['Ticker', 'Date', 'Article Headline']

# keep=False flags every member of a duplicate group, not only the repeats
is_duplicate = proquest_df.duplicated(subset=dedupe_keys, keep=False)
duplicates = proquest_df[is_duplicate]
duplicate_count = len(duplicates)
print(f"Number of duplicate rows: {duplicate_count}")
display(duplicates.head(6))
print()

# Retain only the first row of each duplicate group
proquest_df = proquest_df.drop_duplicates(subset=dedupe_keys, keep='first')
display(proquest_df.shape)

Number of duplicate rows: 279


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
69,ProQuest,2774491352,CVX,27-May-21,Chevron corp. investors back climate proposal;...,Russia is still finding a home for much of its...,https://www.proquest.com/newspapers/business-n...
70,ProQuest,2533063981,CVX,27-May-21,Chevron corp. investors back climate proposal;...,Russia is still finding a home for much of its...,https://www.proquest.com/newspapers/chevron-ce...
94,ProQuest,2483968356,CVX,30-Jan-21,Chevron posts fourth-quarter loss on weak refi...,[Financial Analysis and Commentary]\nChevron w...,https://www.proquest.com/newspapers/exchange-h...
95,ProQuest,2483951085,CVX,30-Jan-21,Chevron posts fourth-quarter loss on weak refi...,Chevron Corp. had its most profitable year sin...,https://www.proquest.com/newspapers/exchange-c...
105,ProQuest,2455124806,CVX,28-Oct-20,Chevron to lay off about 25% of noble energy e...,Michael Wirth Chevron Corporation has issued a...,https://www.proquest.com/newspapers/chevron-se...
106,ProQuest,2455119828,CVX,28-Oct-20,Chevron to lay off about 25% of noble energy e...,"Chevron Corp. committed to an ""aspiration"" of ...",https://www.proquest.com/newspapers/chevron-em...





(3708, 7)

## Upload SEC Filings into DF

In [8]:
# Load the SEC filings CSV and reshape it into the shared column layout:
# the accession number becomes the unique ID and the form type becomes the headline
sec_df = (
    pd.read_csv('Financial_Sentiment_LLM/01_Raw_Data/SEC_filings.csv')
    .rename(columns={'AccessionNumber': 'Unique_ID', 'Form': 'Article Headline', 'Text': 'Article Text'})
    .assign(Source='SEC Filings')
    .reindex(columns=new_order)
)

# Normalise tickers: remove literal periods, then trim surrounding whitespace
sec_tickers = sec_df['Ticker'].str.replace('.', '', regex=False)
sec_df['Ticker'] = sec_tickers.str.strip()

display(sec_df.shape)
display(sec_df.head())

(1031, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,SEC Filings,114036119010687,BP,2019-06-07,11-K,\nSECURITIES AND EXCHANGE COMMISSION\nWashingt...,https://www.sec.gov/Archives/edgar/data/000031...
1,SEC Filings,31380720000008,BP,2020-06-23,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...
2,SEC Filings,31380721000009,BP,2021-06-11,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...
3,SEC Filings,31380722000025,BP,2022-06-10,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...
4,SEC Filings,31380723000020,BP,2023-06-13,11-K,UNITED STATESSECURITIES AND EXCHANGE COMMISSIO...,https://www.sec.gov/Archives/edgar/data/000031...


## Upload Earnings Call Transcripts into DF

In [9]:
# Read the three earnings transcript exports
earnings1 = pd.read_csv('/content/Financial_Sentiment_LLM/01_Raw_Data/Earnings Transcripts/Chevron Corporation NYSE CVX Transcripts_ALL_Final.csv')
earnings2 = pd.read_csv('/content/Financial_Sentiment_LLM/01_Raw_Data/Earnings Transcripts/EarningsTranscripts_DVN_HES_MPC_MRO_OXY_VLO_PSX_PDC_CXO_ALL_Final.csv')
earnings3 = pd.read_csv('/content/Financial_Sentiment_LLM/01_Raw_Data/Earnings Transcripts/EarningsTranscripts_XOM_TTE_COP_EOG_BP_SHEL_ALL_Final.csv')

# Stack them and normalise tickers once (remove literal periods, trim whitespace)
# so that both tables derived below inherit the cleaned values
earnings_all = pd.concat([earnings1, earnings2, earnings3], axis=0, ignore_index=True)
earnings_all['Ticker'] = earnings_all['Ticker'].str.replace('.', '', regex=False).str.strip()

# Each transcript row carries two text columns; split them into two tables
# by dropping the column that does not belong
earnings_presentations = earnings_all.drop(columns=['Question and Answer Text'])
earnings_qa = earnings_all.drop(columns=['Presentation Text'])

display(earnings_presentations.head(1))
display(earnings_qa.head(1))

Unnamed: 0,Date / Time,Title,Type,Date,Company,Ticker,Presentation Text
0,Apr-26-2024 11:00 AM,"Chevron Corporation, Q1 2024 Earnings Call, Ap...",Earnings Call,Apr-26-2024,Chevron Corporation,CVX,Chevron Corporation NYSE:CVX\nFQ1 2024 Earning...


Unnamed: 0,Date / Time,Title,Type,Date,Company,Ticker,Question and Answer Text
0,Apr-26-2024 11:00 AM,"Chevron Corporation, Q1 2024 Earnings Call, Ap...",Earnings Call,Apr-26-2024,Chevron Corporation,CVX,Question and Answer\nOperator\n[Operator Instr...


In [10]:
# Bring the presentation table into the shared column layout
earnings_presentations = (
    earnings_presentations
    .rename(columns={'Title': 'Article Headline', 'Presentation Text': 'Article Text'})
    .drop(columns=['Type', 'Date / Time'])
    .assign(Source='Earnings Calls')
    .reindex(columns=new_order)
)

# Same treatment for the Q&A table; only the name of the text column differs
earnings_qa = (
    earnings_qa
    .rename(columns={'Title': 'Article Headline', 'Question and Answer Text': 'Article Text'})
    .drop(columns=['Type', 'Date / Time'])
    .assign(Source='Earnings Calls')
    .reindex(columns=new_order)
)

# Show the size and first rows of each table, presentations first
for earnings_table in (earnings_presentations, earnings_qa):
    display(earnings_table.shape)
    display(earnings_table.head())

(342, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Earnings Calls,,CVX,Apr-26-2024,"Chevron Corporation, Q1 2024 Earnings Call, Ap...",Chevron Corporation NYSE:CVX\nFQ1 2024 Earning...,
1,Earnings Calls,,CVX,Feb-02-2024,"Chevron Corporation, Q4 2023 Earnings Call, Fe...",Chevron Corporation NYSE:CVX\nFQ4 2023 Earning...,
2,Earnings Calls,,CVX,Oct-27-2023,"Chevron Corporation, Q3 2023 Earnings Call, Oc...",Chevron Corporation NYSE:CVX\nFQ3 2023 Earning...,
3,Earnings Calls,,CVX,Jul-28-2023,"Chevron Corporation, Q2 2023 Earnings Call, Ju...",Chevron Corporation NYSE:CVX\nFQ2 2023 Earning...,
4,Earnings Calls,,CVX,Apr-28-2023,"Chevron Corporation, Q1 2023 Earnings Call, Ap...",Chevron Corporation NYSE:CVX\nFQ1 2023 Earning...,


(342, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Earnings Calls,,CVX,Apr-26-2024,"Chevron Corporation, Q1 2024 Earnings Call, Ap...",Question and Answer\nOperator\n[Operator Instr...,
1,Earnings Calls,,CVX,Feb-02-2024,"Chevron Corporation, Q4 2023 Earnings Call, Fe...",Question and Answer\nOperator\n[Operator Instr...,
2,Earnings Calls,,CVX,Oct-27-2023,"Chevron Corporation, Q3 2023 Earnings Call, Oc...",Question and Answer\nOperator\n[Operator Instr...,
3,Earnings Calls,,CVX,Jul-28-2023,"Chevron Corporation, Q2 2023 Earnings Call, Ju...",Question and Answer\nOperator\n[Operator Instr...,
4,Earnings Calls,,CVX,Apr-28-2023,"Chevron Corporation, Q1 2023 Earnings Call, Ap...",Question and Answer\nOperator\n[Operator Instr...,


## Concatenate Sources and Export to CSV

In [11]:
# Stack every source table into one frame with a fresh 0..n-1 index
source_frames = [invest_df, proquest_df, sec_df, earnings_presentations, earnings_qa]
text_df = pd.concat(source_frames, ignore_index=True)

# Show overall size plus the first and last rows
for preview in (text_df.shape, text_df.head(), text_df.tail()):
    display(preview)

(10319, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,,MRO,2024-05-16,Marathon Oil Corporation,"Stock Report | May 16, 2024 | NYSESymbol: MRO ...",
1,Investment Research,,EOG,2024-05-14,"EOG Resources, Inc.","Stock Report | May 14, 2024 | NYSESymbol: EOG ...",
2,Investment Research,,EOG,2024-05-11,"EOG Resources, Inc.","Stock Report | May 11, 2024 | NYSESymbol: EOG ...",
3,Investment Research,,DVN,2024-05-11,Devon Energy Corporation,"Stock Report | May 11, 2024 | NYSESymbol: DVN ...",
4,Investment Research,,COP,2024-05-07,ConocoPhillips,"Stock Report | May 07, 2024 | NYSESymbol: COP ...",


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
10314,Earnings Calls,,XOM,Feb-02-2021,"Exxon Mobil Corporation, Q4 2020 Earnings Call...",Question and Answer\nOperator\n[Operator Instr...,
10315,Earnings Calls,,COP,Feb-02-2021,"ConocoPhillips, Q4 2020 Earnings Call, Feb 02,...",Question and Answer\nOperator\n[Operator Instr...,
10316,Earnings Calls,,EOG,May-03-2019,"EOG Resources, Inc., Q1 2019 Earnings Call, Ma...",Question and Answer\nOperator\n[Operator Instr...,
10317,Earnings Calls,,SHEL,May-02-2019,"Royal Dutch Shell plc, Q1 2019 Earnings Call, ...",Question and Answer\nOperator\n[Operator Instr...,
10318,Earnings Calls,,COP,Apr-30-2019,"ConocoPhillips, Q1 2019 Earnings Call, Apr 30,...",Question and Answer\nOperator\n[Operator Instr...,


In [12]:
# Count rows for every (source, ticker) combination
grouped_counts = (
    text_df
    .groupby(['Source', 'Ticker'])
    .size()
    .reset_index(name='Count')
)

# One row per ticker, one column per source; combinations with no rows become 0
pivot_df = (
    grouped_counts
    .pivot(index='Ticker', columns='Source', values='Count')
    .fillna(0)
)
display(pivot_df)

Source,Earnings Calls,Investment Research,ProQuest,SEC Filings
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BP,44.0,277.0,499.0,5.0
COP,44.0,283.0,335.0,80.0
CVX,44.0,363.0,220.0,100.0
CXO,0.0,143.0,28.0,0.0
DVN,44.0,279.0,109.0,75.0
EOG,44.0,296.0,6.0,87.0
EQNR,0.0,5.0,422.0,0.0
HES,38.0,279.0,18.0,77.0
MPC,44.0,346.0,25.0,104.0
MRO,44.0,277.0,0.0,79.0


In [13]:
# Export each source table as its own CSV.
# The single consolidated file is now too big, so consumers need to rebuild the
# combined DataFrame from the per-source files. Former export, kept for reference:
# text_df.to_csv('/content/Consolidated_Text_Data.csv', index=False)

# The investment research table is split into two halves due to size
midpoint = len(invest_df) // 2
invest_df1 = invest_df.iloc[:midpoint]
invest_df2 = invest_df.iloc[midpoint:]

# Destination path -> table; files are written in the order listed
exports = {
    '/content/Investment_Research_Part1.csv': invest_df1,
    '/content/Investment_Research_Part2.csv': invest_df2,
    '/content/ProQuest_Articles.csv': proquest_df,
    '/content/SEC_Filings.csv': sec_df,
    '/content/Earnings_Presentations.csv': earnings_presentations,
    '/content/Earnings_QA.csv': earnings_qa,
}
for csv_path, table in exports.items():
    table.to_csv(csv_path, index=False)

# Note: these exports need to be uploaded to GitHub manually.