<a href="https://colab.research.google.com/github/Kussil/CVX_Rice_project/blob/main/02_Cleaned_Data/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries and Clone the GitHub Repository

In [1]:
# Import Libraries
import os
from google.colab import userdata
import pandas as pd

In [2]:
# Read the GitHub access token stored in Colab's user secrets under the name 'github',
# copy it into the process environment, and clone the project repository with it.
GITHUB_TOKEN = userdata.get('github')
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN
# NOTE(review): the token is interpolated directly into the clone URL, so it is part of the
# shell command and presumably of the remote URL recorded for the clone — confirm this is acceptable.
!git clone https://{GITHUB_TOKEN}@github.com/Kussil/CVX_Rice_project.git

Cloning into 'CVX_Rice_project'...
remote: Enumerating objects: 218, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 218 (delta 13), reused 10 (delta 8), pack-reused 200[K
Receiving objects: 100% (218/218), 72.32 MiB | 7.97 MiB/s, done.
Resolving deltas: 100% (113/113), done.
Updating files: 100% (21/21), done.


## Load Investment Research Articles into a DataFrame

In [3]:
# Read the investment-research exports: one file for Chevron, one for the other majors
invest_df_cvx = pd.read_csv('CVX_Rice_project/01_Raw_Data/Investment Research-CVX.csv')
invest_df_majors = pd.read_csv('CVX_Rice_project/01_Raw_Data/Investment Research-Majors.csv')

# Stack the two frames and show the shape of each input followed by the combined result
invest_df = pd.concat((invest_df_cvx, invest_df_majors), ignore_index=True)
for frame in (invest_df_cvx, invest_df_majors, invest_df):
    display(frame.shape)

# Target column layout; columns listed here that the frame lacks are created empty by reindex
new_order = ['Unique_ID', 'Ticker', 'Date', 'Article Headline', 'Article Text', 'URL']

# Rename to the shared column names, discard the unused columns, then apply the target layout
invest_df = (
    invest_df
    .rename(columns={
        'Date/Time': 'Date',
        'Company': 'Ticker',
        'Headline': 'Article Headline',
        'Text': 'Article Text',
    })
    .drop(columns=['Contributor', 'Analyst', 'Pages'])
    .reindex(columns=new_order)
)
display(invest_df.head())

# NOTE: need to fill in a unique identifier for these rows if one exists

(362, 7)

(2630, 7)

(2992, 7)

Unnamed: 0,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,,CVX,"May 13, 2024 10:05 PM",Chevron Corporation,"Stock Report | March 12, 2022 | NYSE Symbol: C...",
1,,CVX,"May 13, 2024 03:29 PM",CFRA LIFTS VIEW ON SHARES OF CHEVRON CORPORATI...,"Stock Report | August 05, 2023 | NYSE Symbol: ...",
2,,CVX,"May 11, 2024 06:00 PM",Chevron Corporation,"Stock Report | March 04, 2023 | NYSE Symbol: C...",
3,,CVX,"May 04, 2024 05:49 PM",Chevron Corporation,"Stock Report | October 17, 2020 | NYSE Symbol:...",
4,,CVX,"May 01, 2024 10:03 PM",Chevron Corporation,"Stock Report | December 25, 2021 | NYSE Symbol...",


## Load ProQuest News Articles into a DataFrame and Clean the Data

In [4]:
# Read the three ProQuest exports: Chevron, the XOM file, and the combined "all" file
proquest_df_cvx = pd.read_csv('CVX_Rice_project/01_Raw_Data/Proquest_Articles_CVX.csv')
proquest_df_xom = pd.read_csv('CVX_Rice_project/01_Raw_Data/ProQuest_Articles_XOM.csv')
proquest_df_others = pd.read_csv('CVX_Rice_project/01_Raw_Data/proquest_newsarticles_all_v2.csv')

# Stack the frames and show the shape of each input followed by the combined result
proquest_frames = [proquest_df_cvx, proquest_df_xom, proquest_df_others]
proquest_df = pd.concat(proquest_frames, ignore_index=True)
for frame in proquest_frames + [proquest_df]:
    display(frame.shape)

# Rename to the shared column names and apply the column layout in new_order
# (new_order is defined in the investment-research cell, which must run first)
proquest_df = (
    proquest_df
    .rename(columns={'Title': 'Article Headline', 'Full Article Text': 'Article Text'})
    .reindex(columns=new_order)
)
display(proquest_df.head())

(261, 6)

(1478, 6)

(1500, 4)

(3239, 6)

Unnamed: 0,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,2703147000.0,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-ge...
1,2702198000.0,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
2,2697049000.0,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
3,2696371000.0,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/senators-a...
4,2695836000.0,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/shell-trie...


In [5]:
# Delete rows whose article text is a scraper failure message instead of real content
search_text_1 = 'Failed to load content: Message:'
search_text_2 = 'Relevant content not found within the specified range.'

# Build the "failed scrape" mask once and reuse it for both the count and the filter.
# regex=False matches the markers literally; with the default regex matching, the trailing
# '.' in search_text_2 would act as a wildcard. na=False means rows with no text at all
# are not flagged here and therefore stay in the frame.
article_text = proquest_df['Article Text']
missing_mask = (
    article_text.str.contains(search_text_1, na=False, regex=False)
    | article_text.str.contains(search_text_2, na=False, regex=False)
)

# Count the number of rows containing either search text
count_rows = int(missing_mask.sum())

# Delete the rows containing either search text
proquest_df = proquest_df[~missing_mask]

# Print the number of rows with missing article text and the new shape of the DataFrame
print(f"Number of rows with missing article text: {count_rows}")
print(f"New DataFrame shape: {proquest_df.shape}")
print()

# Sanity check: show the shortest remaining headline so obviously broken rows stand out
shortest_headline = proquest_df.loc[proquest_df['Article Headline'].str.len().idxmin(), 'Article Headline']
print(f"The shortest article headline is: '{shortest_headline}'")
print()

# Sanity check: show the shortest remaining article text for the same reason
shortest_text = proquest_df.loc[proquest_df['Article Text'].str.len().idxmin(), 'Article Text']
print(f"The shortest article text is: '{shortest_text}'")

Number of rows with missing article text: 122
New DataFrame shape: (3117, 6)

The shortest article headline is: 'Lng Dash'

The shortest article text is: 'Turn on search term navigationTurn on search term navigation
| Jump to first hit_TVM:UNDEFINED_'


In [6]:
# Rows count as duplicates when they share the same ticker, date and headline
dedupe_keys = ['Ticker', 'Date', 'Article Headline']

# Collect every member of each duplicate group (keep=False flags all occurrences)
is_duplicate = proquest_df.duplicated(subset=dedupe_keys, keep=False)
duplicates = proquest_df[is_duplicate]
duplicate_count = len(duplicates)
print(f"Number of duplicate rows: {duplicate_count}")
print(duplicates)
print()

# Retain only the first row of each duplicate group, then preview the result
proquest_df = proquest_df.drop_duplicates(subset=dedupe_keys, keep='first')
for preview in (proquest_df.shape, proquest_df.head(20), proquest_df.tail(20)):
    display(preview)

Number of duplicate rows: 433
         Unique_ID Ticker          Date  \
69    2.774491e+09    CVX     27-May-21   
70    2.533064e+09    CVX     27-May-21   
94    2.483968e+09    CVX     30-Jan-21   
95    2.483951e+09    CVX     30-Jan-21   
105   2.455125e+09    CVX     28-Oct-20   
...            ...    ...           ...   
3185           NaN    NaN  Mar 11, 2022   
3213           NaN    NaN   Mar 2, 2022   
3214           NaN    NaN   Mar 2, 2022   
3215           NaN    NaN   Mar 2, 2022   
3216           NaN    NaN   Mar 2, 2022   

                                       Article Headline  \
69    Chevron corp. investors back climate proposal;...   
70    Chevron corp. investors back climate proposal;...   
94    Chevron posts fourth-quarter loss on weak refi...   
95    Chevron posts fourth-quarter loss on weak refi...   
105   Chevron to lay off about 25% of noble energy e...   
...                                                 ...   
3185                           Russian G

(2831, 6)

Unnamed: 0,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,2703147000.0,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-ge...
1,2702198000.0,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
2,2697049000.0,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
3,2696371000.0,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/senators-a...
4,2695836000.0,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/shell-trie...
5,2680500000.0,CVX,25-Jun-22,EXCHANGE --- Business News: Chevron Plans to S...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-la...
6,2680392000.0,CVX,24-Jun-22,"Chevron to Cut Back in California, Remain Base...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-la...
7,2675050000.0,CVX,10-Jun-22,Is Chevron Stock A Buy Right Now? Here's What ...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-la...
8,2673506000.0,CVX,7-Jun-22,Business News: Chevron CEO Sees Russian Oil Ou...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/conoco-mov...
9,2672985000.0,CVX,4-Jun-22,Chevron CEO Sees Russian Oil Output Falling Af...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/conoco-mov...


Unnamed: 0,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
3213,,,"Mar 2, 2022",Cheniere Lng Super Cool,"""Anybody who wants gas in 2021-22 had better s...",https://www.proquest.com/newspapers/cheniere-l...
3217,,,"Mar 2, 2022",War Threatens Gas Cheap Energy Supply Brazil,The war in Ukraine may hinder the opening of t...,https://www.proquest.com/newspapers/war-threat...
3218,,,"Mar 2, 2022",United States Natural Gas Monthly Data December,"HighlightsDecember 2021In December 2021, dry n...",https://www.proquest.com/newspapers/united-sta...
3219,,,"Mar 1, 2022",Natural Gas Investments Hit 8 7Trn 2050,Natural gas can become the fuel of choice in s...,https://www.proquest.com/newspapers/natural-ga...
3220,,,"Mar 1, 2022",Lng Import At 32 Month Low On High Prices Rising,"New Delhi, February 28 STATES\nIndia’s liquefi...",https://www.proquest.com/newspapers/lng-import...
3221,,,"Feb 28, 2022",Europe Is Pivoting Away Russian Gas Why Cheniere,Russia's invasion of Ukraine has underscored m...,https://www.proquest.com/newspapers/europe-is-...
3222,,,"Feb 28, 2022",Energy Sanctions Are Weapon Putin Would,Sanctions against Vladimir Putin's war machine...,https://www.proquest.com/newspapers/energy-san...
3225,,,"Feb 28, 2022",Why Europe Must Break Dependency On Russian Gas,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/why-europe...
3226,,,"Feb 28, 2022",World Europes Reliance On Russian Fossil Fuels,Germany's vice chancellor is calling Russia's ...,https://www.proquest.com/newspapers/world-euro...
3227,,,"Feb 27, 2022",Russia Sends Natural Gas Tankers Kaliningrad,Russia holds most of the cards when it comes t...,https://www.proquest.com/newspapers/russia-sen...


## Concatenate Both DataFrames and Export to CSV

In [7]:
# Stack the investment-research rows on top of the ProQuest rows with a fresh 0..n-1 index
text_df = pd.concat((invest_df, proquest_df), ignore_index=True)
for preview in (text_df.shape, text_df.head()):
    display(preview)

(5823, 6)

Unnamed: 0,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,,CVX,"May 13, 2024 10:05 PM",Chevron Corporation,"Stock Report | March 12, 2022 | NYSE Symbol: C...",
1,,CVX,"May 13, 2024 03:29 PM",CFRA LIFTS VIEW ON SHARES OF CHEVRON CORPORATI...,"Stock Report | August 05, 2023 | NYSE Symbol: ...",
2,,CVX,"May 11, 2024 06:00 PM",Chevron Corporation,"Stock Report | March 04, 2023 | NYSE Symbol: C...",
3,,CVX,"May 04, 2024 05:49 PM",Chevron Corporation,"Stock Report | October 17, 2020 | NYSE Symbol:...",
4,,CVX,"May 01, 2024 10:03 PM",Chevron Corporation,"Stock Report | December 25, 2021 | NYSE Symbol...",


In [8]:
# Write the consolidated frame to a CSV file under /content, leaving out the index column
export_path = '/content/Consolidated_Text_Data.csv'
text_df.to_csv(export_path, index=False)

# TODO: the exported file currently has to be uploaded to GitHub by hand; automate this later