<a href="https://colab.research.google.com/github/Kussil/CVX_Rice_project/blob/main/02_Cleaned_Data/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries and Clone Github

In [1]:
# Import Libraries
import os
from google.colab import userdata
import pandas as pd

In [2]:
# Import github token with google secrets thingy and clone git repository
GITHUB_TOKEN = userdata.get('github')
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN
!git clone https://{GITHUB_TOKEN}@github.com/Kussil/CVX_Rice_project.git

Cloning into 'CVX_Rice_project'...
remote: Enumerating objects: 230, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 230 (delta 17), reused 14 (delta 9), pack-reused 200[K
Receiving objects: 100% (230/230), 85.23 MiB | 6.72 MiB/s, done.
Resolving deltas: 100% (117/117), done.
Updating files: 100% (19/19), done.


## Upload Investment Research Articles into DF

In [3]:
# Import Chevron and Other Majors CSVs
invest_df_cvx = pd.read_csv('CVX_Rice_project/01_Raw_Data/Investment Research-CVX.csv')
invest_df_majors = pd.read_csv('CVX_Rice_project/01_Raw_Data/Investment Research-Majors.csv')

# Append dfs
invest_df = pd.concat([invest_df_cvx, invest_df_majors], ignore_index=True)
display(invest_df_cvx.shape)
display(invest_df_majors.shape)
display(invest_df.shape)

# Rename drop, and reorder columns
invest_df = invest_df.rename(columns={'Date/Time': 'Date', 'Company': 'Ticker', 'Headline': 'Article Headline', 'Text': 'Article Text'})
invest_df = invest_df.drop(['Contributor', 'Analyst', 'Pages'], axis=1)
invest_df['Source'] = 'Investment Research'
new_order = ['Source', 'Unique_ID', 'Ticker', 'Date', 'Article Headline', 'Article Text', 'URL']
invest_df = invest_df.reindex(columns=new_order)
display(invest_df.head())

(362, 7)

(2630, 7)

(2992, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,,CVX,"May 13, 2024 10:05 PM",Chevron Corporation,"Stock Report | March 12, 2022 | NYSE Symbol: C...",
1,Investment Research,,CVX,"May 13, 2024 03:29 PM",CFRA LIFTS VIEW ON SHARES OF CHEVRON CORPORATI...,"Stock Report | August 05, 2023 | NYSE Symbol: ...",
2,Investment Research,,CVX,"May 11, 2024 06:00 PM",Chevron Corporation,"Stock Report | March 04, 2023 | NYSE Symbol: C...",
3,Investment Research,,CVX,"May 04, 2024 05:49 PM",Chevron Corporation,"Stock Report | October 17, 2020 | NYSE Symbol:...",
4,Investment Research,,CVX,"May 01, 2024 10:03 PM",Chevron Corporation,"Stock Report | December 25, 2021 | NYSE Symbol...",


## Upload Proquest News Articles into DF and Clean Data

In [4]:
# Import Files
proquest_df_cvx = pd.read_csv('CVX_Rice_project/01_Raw_Data/Proquest_Articles_CVX.csv')
proquest_df_xom = pd.read_csv('CVX_Rice_project/01_Raw_Data/ProQuest_Articles_XOM.csv')
# proquest_df_others = pd.read_csv('CVX_Rice_project/01_Raw_Data/proquest_newsarticles_all_v2.csv')

# Append dfs
proquest_df = pd.concat([proquest_df_cvx,
                         proquest_df_xom,
                        #  proquest_df_others
                         ], ignore_index=True)
display(proquest_df_cvx.shape)
display(proquest_df_xom.shape)
# display(proquest_df_others.shape)
display(proquest_df.shape)

# Rename drop, and reorder columns
proquest_df = proquest_df.rename(columns={'Title': 'Article Headline', 'Full Article Text': 'Article Text'})
proquest_df['Source'] = 'ProQuest'
proquest_df = proquest_df.reindex(columns=new_order)

display(proquest_df.head())

(261, 6)

(1478, 6)

(1739, 6)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,ProQuest,2703147472,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-ge...
1,ProQuest,2702197510,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
2,ProQuest,2697049245,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
3,ProQuest,2696371267,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/senators-a...
4,ProQuest,2695835797,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/shell-trie...


In [5]:
# Delete rows with missing article text
search_text_1 = 'Failed to load content: Message:'
search_text_2 = 'Relevant content not found within the specified range.'
search_text_3 = 'TVM:UNDEFINED'

# Count the number of rows containing either search text
count_rows = proquest_df[
    proquest_df['Article Text'].str.contains(search_text_1, na=False) |
    proquest_df['Article Text'].str.contains(search_text_2, na=False) |
    proquest_df['Article Text'].str.contains(search_text_3, na=False)
].shape[0]

# Delete the rows containing either search text
proquest_df = proquest_df[
    ~proquest_df['Article Text'].str.contains(search_text_1, na=False) &
    ~proquest_df['Article Text'].str.contains(search_text_2, na=False) &
    ~proquest_df['Article Text'].str.contains(search_text_3, na=False)
]

# Print the number of rows with missing article text and the new shape of the DataFrame
print(f"Number of rows with missing article text: {count_rows}")
print(f"New DataFrame shape: {proquest_df.shape}")
print()

# Confirm data is good by looking for short article headlines
shortest_headline = proquest_df.loc[proquest_df['Article Headline'].str.len().idxmin(), 'Article Headline']
print(f"The shortest article headline is: '{shortest_headline}'")
print()

# Confirm data is good by looking for short article text
shortest_text = proquest_df.loc[proquest_df['Article Text'].str.len().idxmin(), 'Article Text']
print(f"The shortest article text is: '{shortest_text}'")

Number of rows with missing article text: 82
New DataFrame shape: (1657, 7)

The shortest article headline is: 'Correction'

The shortest article text is: 'Turn on search term navigationTurn on search term navigation
| Jump to first hitEXXON Mobil has revealed full-year profits of $23bn (£17bn), following a strong fourthquarter of trading amid successful cost cutting measures and share buyback plans. The results will be relief for the fossil fuel giant amid sustained activist pressure.'


In [6]:
# Clean extra text from the beginning of the article
def clean_text(text):
    if text.startswith("Turn on search term"):
            return text[80:].strip()
    return text
proquest_df['Article Text'] = proquest_df['Article Text'].apply(clean_text)

# Confirm data is good by looking for short article headlines
shortest_headline = proquest_df.loc[proquest_df['Article Headline'].str.len().idxmin(), 'Article Headline']
print(f"The shortest article headline is: '{shortest_headline}'")
print()

# Confirm data is good by looking for short article text
shortest_text = proquest_df.loc[proquest_df['Article Text'].str.len().idxmin(), 'Article Text']
print(f"The shortest article text is: '{shortest_text}'")

display(proquest_df.head())

The shortest article headline is: 'Correction'

The shortest article text is: 'EXXON Mobil has revealed full-year profits of $23bn (£17bn), following a strong fourthquarter of trading amid successful cost cutting measures and share buyback plans. The results will be relief for the fossil fuel giant amid sustained activist pressure.'


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,ProQuest,2703147472,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Chevron Corp.\nSpending projections are slidin...,https://www.proquest.com/newspapers/chevron-ge...
1,ProQuest,2702197510,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Chevron Corp. said it would cut its annual cap...,https://www.proquest.com/newspapers/chevron-sl...
2,ProQuest,2697049245,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Chevron Corp. said it would cut its annual cap...,https://www.proquest.com/newspapers/chevron-sl...
3,ProQuest,2696371267,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,SENATORS aired misgivings on Tuesday over pote...,https://www.proquest.com/newspapers/senators-a...
4,ProQuest,2695835797,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",LONDON—Royal Dutch Shell PLC said it would sta...,https://www.proquest.com/newspapers/shell-trie...


In [7]:
# Look for duplicate rows
duplicates = proquest_df[proquest_df.duplicated(subset=['Ticker', 'Date', 'Article Headline'], keep=False)]
duplicate_count = duplicates.shape[0]
print(f"Number of duplicate rows: {duplicate_count}")
display(duplicates.head(6))
print()

# Drop duplicates and keep the first occurrence
proquest_df = proquest_df.drop_duplicates(subset=['Ticker', 'Date', 'Article Headline'], keep='first')
display(proquest_df.shape)

Number of duplicate rows: 193


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
69,ProQuest,2774491352,CVX,27-May-21,Chevron corp. investors back climate proposal;...,Russia is still finding a home for much of its...,https://www.proquest.com/newspapers/business-n...
70,ProQuest,2533063981,CVX,27-May-21,Chevron corp. investors back climate proposal;...,Russia is still finding a home for much of its...,https://www.proquest.com/newspapers/chevron-ce...
94,ProQuest,2483968356,CVX,30-Jan-21,Chevron posts fourth-quarter loss on weak refi...,[Financial Analysis and Commentary]\nChevron w...,https://www.proquest.com/newspapers/exchange-h...
95,ProQuest,2483951085,CVX,30-Jan-21,Chevron posts fourth-quarter loss on weak refi...,Chevron Corp. had its most profitable year sin...,https://www.proquest.com/newspapers/exchange-c...
105,ProQuest,2455124806,CVX,28-Oct-20,Chevron to lay off about 25% of noble energy e...,Michael Wirth Chevron Corporation has issued a...,https://www.proquest.com/newspapers/chevron-se...
106,ProQuest,2455119828,CVX,28-Oct-20,Chevron to lay off about 25% of noble energy e...,"Chevron Corp. committed to an ""aspiration"" of ...",https://www.proquest.com/newspapers/chevron-em...





(1530, 7)

In [8]:
# Tail by itself so you can examine the table with the colab table button
display(proquest_df.tail(20))

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
1716,ProQuest,2467551252,XOM,7-Dec-20,Kodak Lyft Exxon Mobil What Watch When Stock,Here's what we are watching as markets kick in...,https://www.proquest.com/newspapers/kodak-lyft...
1717,ProQuest,2467525973,XOM,7-Dec-20,Exxon Faces Activist Pressure,Exxon Mobil Corp. is facing the threat of a pr...,https://www.proquest.com/newspapers/exxon-face...
1718,ProQuest,2467506874,XOM,7-Dec-20,Exxon Under Pressure New Activist Fund Engine No,"An Exxon Mobil plant in Baytown, Texas. PHOTO:...",https://www.proquest.com/newspapers/exxon-unde...
1719,ProQuest,2466258188,XOM,3-Dec-20,Chevron Slashes Spending Plans As Coronavirus,Chevron Corp. said it would cut its annual cap...,https://www.proquest.com/newspapers/chevron-sl...
1720,ProQuest,2466104230,XOM,3-Dec-20,Exxonmobil Talks Buy Into Producing Oil Fields,first hitEnergy giant ExxonMobil is in talks t...,https://www.proquest.com/newspapers/exxonmobil...
1721,ProQuest,2466104062,XOM,3-Dec-20,Exxonmobil Talks Buy Into Producing Oil Amp Gas,first hitEnergy giant ExxonMobil is in talks t...,https://www.proquest.com/newspapers/exxonmobil...
1722,ProQuest,2465948139,XOM,2-Dec-20,Exxonmobil Plans 25Bn Spend Per Year 2025,ExxonMobil said in its business plan that it p...,https://www.proquest.com/newspapers/exxonmobil...
1723,ProQuest,2474392357,XOM,1-Dec-20,Exxon Slashes Capex Plans Will Write Off Assets,"ExxonMobil slashed its spending plans, postpon...",https://www.proquest.com/newspapers/exxon-slas...
1727,ProQuest,2465901936,XOM,1-Dec-20,Imperial Oil Take Up 1 2B Charge On Oilsands,Imperial Oil Ltd. will take an impairment char...,https://www.proquest.com/newspapers/imperial-o...
1728,ProQuest,2465821388,XOM,1-Dec-20,Exxon Is Retrenching Top Executive Defends,Exxon Mobil said late on Monday\nthat it would...,https://www.proquest.com/newspapers/exxon-is-r...


## Concatenate both DF Sources and Export to CSV

In [9]:
# Concatenate
text_df = pd.concat([invest_df, proquest_df], ignore_index=True)
display(text_df.shape)
display(text_df.head())

(4522, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,,CVX,"May 13, 2024 10:05 PM",Chevron Corporation,"Stock Report | March 12, 2022 | NYSE Symbol: C...",
1,Investment Research,,CVX,"May 13, 2024 03:29 PM",CFRA LIFTS VIEW ON SHARES OF CHEVRON CORPORATI...,"Stock Report | August 05, 2023 | NYSE Symbol: ...",
2,Investment Research,,CVX,"May 11, 2024 06:00 PM",Chevron Corporation,"Stock Report | March 04, 2023 | NYSE Symbol: C...",
3,Investment Research,,CVX,"May 04, 2024 05:49 PM",Chevron Corporation,"Stock Report | October 17, 2020 | NYSE Symbol:...",
4,Investment Research,,CVX,"May 01, 2024 10:03 PM",Chevron Corporation,"Stock Report | December 25, 2021 | NYSE Symbol...",


In [10]:
# Get a count of articles by source and ticker
grouped_counts = text_df.groupby(['Source', 'Ticker']).size().reset_index(name='Count')
print(grouped_counts)

                 Source Ticker  Count
0   Investment Research    BP.    278
1   Investment Research    CVX    362
2   Investment Research   EQNR      6
3   Investment Research    HES    278
4   Investment Research    MPC    345
5   Investment Research    OXY    279
6   Investment Research    PSX    277
7   Investment Research   SHEL    282
8   Investment Research    TTE    256
9   Investment Research    VLO    292
10  Investment Research    XOM    337
11             ProQuest    CVX    220
12             ProQuest    XOM   1310


Adds????
*   ConocoPhillips (COP)
*   Devon Energy Corporation (DVN)
*   Pioneer Natural Resources Company (PXD)
*   Marathon Oil Corporation (MRO)

In [11]:
# Export as CSV
text_df.to_csv('/content/Consolidated_Text_Data.csv', index=False)

# Note: This export needs to be manually uploaded to Github.