<a href="https://colab.research.google.com/github/Kussil/CVX_Rice_project/blob/main/02_Cleaned_Data/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries and Clone Github

In [1]:
# Import Libraries
import os
from google.colab import userdata
import pandas as pd

In [2]:
# Import github token with google secrets thingy and clone git repository
GITHUB_TOKEN = userdata.get('github')
os.environ['GITHUB_TOKEN'] = GITHUB_TOKEN
!git clone https://{GITHUB_TOKEN}@github.com/Kussil/CVX_Rice_project.git

Cloning into 'CVX_Rice_project'...
remote: Enumerating objects: 1398, done.[K
remote: Counting objects: 100% (387/387), done.[K
remote: Compressing objects: 100% (195/195), done.[K
remote: Total 1398 (delta 207), reused 353 (delta 185), pack-reused 1011[K
Receiving objects: 100% (1398/1398), 149.83 MiB | 9.13 MiB/s, done.
Resolving deltas: 100% (798/798), done.
Updating files: 100% (1062/1062), done.


## Upload Investment Research Articles into DF

In [3]:
# Import Chevron and Other Majors CSVs
invest_df = pd.read_csv('CVX_Rice_project/01_Raw_Data/Investment_Research_ALLV2_Final.csv')

# Rename drop, and reorder columns
invest_df = invest_df.rename(columns={'Headline': 'Article Headline', 'Text': 'Article Text'})
invest_df = invest_df.drop(['Contributor', 'Date/Time'], axis=1)
invest_df['Source'] = 'Investment Research'
new_order = ['Source', 'Unique_ID', 'Ticker', 'Date', 'Article Headline', 'Article Text', 'URL']
invest_df = invest_df.reindex(columns=new_order)
display(invest_df.shape)
display(invest_df.head())

(4896, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,,MRO,2024-05-16,Marathon Oil Corporation,"Stock Report | May 16, 2024 | NYSESymbol: MRO ...",
1,Investment Research,,EOG,2024-05-14,"EOG Resources, Inc.","Stock Report | May 14, 2024 | NYSESymbol: EOG ...",
2,Investment Research,,EOG,2024-05-11,"EOG Resources, Inc.","Stock Report | May 11, 2024 | NYSESymbol: EOG ...",
3,Investment Research,,DVN,2024-05-11,Devon Energy Corporation,"Stock Report | May 11, 2024 | NYSESymbol: DVN ...",
4,Investment Research,,COP,2024-05-07,ConocoPhillips,"Stock Report | May 07, 2024 | NYSESymbol: COP ...",


## Upload Proquest News Articles into DF and Clean Data

In [4]:
# Import Files
proquest_df_cvx = pd.read_csv('CVX_Rice_project/01_Raw_Data/Proquest_Articles_CVX.csv')
proquest_df_xom = pd.read_csv('CVX_Rice_project/01_Raw_Data/ProQuest_Articles_XOM.csv')
proquest_df_others = pd.read_csv('CVX_Rice_project/01_Raw_Data/ProQuest_Articles_FINAL_BP_COP_OXY_SHEL.csv')
proquest_df_others2 = pd.read_csv('CVX_Rice_project/01_Raw_Data/ProQuest_Articles_FINAL_CXO_DVN_EOG_EQNR_HES_MPC_MRO_PSX_PXD_TTL.csv')

# Append dfs
proquest_df = pd.concat([proquest_df_cvx,
                         proquest_df_xom,
                         proquest_df_others,
                         proquest_df_others2
                         ], ignore_index=True)
display(proquest_df_cvx.shape)
display(proquest_df_xom.shape)
display(proquest_df_others.shape)
display(proquest_df_others2.shape)
display(proquest_df.shape)

# Rename drop, and reorder columns
proquest_df = proquest_df.rename(columns={'Title': 'Article Headline', 'Full Article Text': 'Article Text'})
proquest_df['Source'] = 'ProQuest'
proquest_df = proquest_df.reindex(columns=new_order)

display(proquest_df.head())

(261, 6)

(1478, 6)

(1445, 6)

(932, 6)

(4116, 6)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,ProQuest,2703147472,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-ge...
1,ProQuest,2702197510,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
2,ProQuest,2697049245,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/chevron-sl...
3,ProQuest,2696371267,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/senators-a...
4,ProQuest,2695835797,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",Turn on search term navigationTurn on search t...,https://www.proquest.com/newspapers/shell-trie...


In [5]:
# Delete rows with missing article text
search_text_1 = 'Failed to load content: Message:'
search_text_2 = 'Relevant content not found within the specified range.'
search_text_3 = 'TVM:UNDEFINED'

# Count the number of rows containing either search text
count_rows = proquest_df[
    proquest_df['Article Text'].str.contains(search_text_1, na=False) |
    proquest_df['Article Text'].str.contains(search_text_2, na=False) |
    proquest_df['Article Text'].str.contains(search_text_3, na=False)
].shape[0]

# Delete the rows containing either search text
proquest_df = proquest_df[
    ~proquest_df['Article Text'].str.contains(search_text_1, na=False) &
    ~proquest_df['Article Text'].str.contains(search_text_2, na=False) &
    ~proquest_df['Article Text'].str.contains(search_text_3, na=False)
]

# Print the number of rows with missing article text and the new shape of the DataFrame
print(f"Number of rows with missing article text: {count_rows}")
print(f"New DataFrame shape: {proquest_df.shape}")
print()

# Confirm data is good by looking for short article headlines
shortest_headline = proquest_df.loc[proquest_df['Article Headline'].str.len().idxmin(), 'Article Headline']
print(f"The shortest article headline is: '{shortest_headline}'")
print()

# Confirm data is good by looking for short article text
shortest_text = proquest_df.loc[proquest_df['Article Text'].str.len().idxmin(), 'Article Text']
print(f"The shortest article text is: '{shortest_text}'")

Number of rows with missing article text: 233
New DataFrame shape: (3883, 7)

The shortest article headline is: 'BP PLC'

The shortest article text is: 'Turn on search term navigationTurn on search term navigation
| Jump to first hitOKLAHOMA CITY Devon Energy Corp.’s board of directors has declared a quarterly cash dividend of 9 cents per share on common stock for the first quarter of 2020. The dividend is payable on March 31 based on a record date of March 13.
'


In [6]:
# Clean extra text from the beginning of the article
def clean_text(text):
    if text.startswith("Turn on search term"):
            return text[80:].strip()
    return text
proquest_df['Article Text'] = proquest_df['Article Text'].apply(clean_text)

# Confirm data is good by looking for short article headlines
shortest_headline = proquest_df.loc[proquest_df['Article Headline'].str.len().idxmin(), 'Article Headline']
print(f"The shortest article headline is: '{shortest_headline}'")
print()

# Confirm data is good by looking for short article text
shortest_text = proquest_df.loc[proquest_df['Article Text'].str.len().idxmin(), 'Article Text']
print(f"The shortest article text is: '{shortest_text}'")

display(proquest_df.head())

The shortest article headline is: 'BP PLC'

The shortest article text is: 'OKLAHOMA CITY Devon Energy Corp.’s board of directors has declared a quarterly cash dividend of 9 cents per share on common stock for the first quarter of 2020. The dividend is payable on March 31 based on a record date of March 13.'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  proquest_df['Article Text'] = proquest_df['Article Text'].apply(clean_text)


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,ProQuest,2703147472,CVX,17-Aug-22,Oil Giants Must Face Climate-Liability Suits i...,Chevron Corp.\nSpending projections are slidin...,https://www.proquest.com/newspapers/chevron-ge...
1,ProQuest,2702197510,CVX,15-Aug-22,Warren Buffett's Berkshire Hathaway Keeps Spen...,Chevron Corp. said it would cut its annual cap...,https://www.proquest.com/newspapers/chevron-sl...
2,ProQuest,2697049245,CVX,2-Aug-22,"Investors Put Forward More Proposals, Dialing ...",Chevron Corp. said it would cut its annual cap...,https://www.proquest.com/newspapers/chevron-sl...
3,ProQuest,2696371267,CVX,30-Jul-22,Soaring Fuel Prices Catapult Oil Giants to Rec...,SENATORS aired misgivings on Tuesday over pote...,https://www.proquest.com/newspapers/senators-a...
4,ProQuest,2695835797,CVX,29-Jul-22,"Exxon, Chevron, Shell Report Record Profits on...",LONDON—Royal Dutch Shell PLC said it would sta...,https://www.proquest.com/newspapers/shell-trie...


In [7]:
# Look for duplicate rows
duplicates = proquest_df[proquest_df.duplicated(subset=['Ticker', 'Date', 'Article Headline'], keep=False)]
duplicate_count = duplicates.shape[0]
print(f"Number of duplicate rows: {duplicate_count}")
display(duplicates.head(6))
print()

# Drop duplicates and keep the first occurrence
proquest_df = proquest_df.drop_duplicates(subset=['Ticker', 'Date', 'Article Headline'], keep='first')
display(proquest_df.shape)

Number of duplicate rows: 279


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
69,ProQuest,2774491352,CVX,27-May-21,Chevron corp. investors back climate proposal;...,Russia is still finding a home for much of its...,https://www.proquest.com/newspapers/business-n...
70,ProQuest,2533063981,CVX,27-May-21,Chevron corp. investors back climate proposal;...,Russia is still finding a home for much of its...,https://www.proquest.com/newspapers/chevron-ce...
94,ProQuest,2483968356,CVX,30-Jan-21,Chevron posts fourth-quarter loss on weak refi...,[Financial Analysis and Commentary]\nChevron w...,https://www.proquest.com/newspapers/exchange-h...
95,ProQuest,2483951085,CVX,30-Jan-21,Chevron posts fourth-quarter loss on weak refi...,Chevron Corp. had its most profitable year sin...,https://www.proquest.com/newspapers/exchange-c...
105,ProQuest,2455124806,CVX,28-Oct-20,Chevron to lay off about 25% of noble energy e...,Michael Wirth Chevron Corporation has issued a...,https://www.proquest.com/newspapers/chevron-se...
106,ProQuest,2455119828,CVX,28-Oct-20,Chevron to lay off about 25% of noble energy e...,"Chevron Corp. committed to an ""aspiration"" of ...",https://www.proquest.com/newspapers/chevron-em...





(3708, 7)

## Upload SEC Filings into DF

In [8]:
# Import Chevron and Other Majors CSVs
sec_df = pd.read_csv('CVX_Rice_project/01_Raw_Data/SEC_filings.csv')

# # Rename drop, and reorder columns
sec_df = sec_df.rename(columns={'AccessionNumber': 'Unique_ID', 'Form': 'Article Headline', 'Text': 'Article Text'})
sec_df['Source'] = 'SEC Filings'
sec_df = sec_df.reindex(columns=new_order)
display(sec_df.shape)
display(sec_df.head())

(1036, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,SEC Filings,114036119010687,BP,2019-06-07,11-K,\n\n11-K\n\n1\n\nform11k.htm\n\n11-K\n\n\n\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
1,SEC Filings,31380720000008,BP,2020-06-23,11-K,\n\n11-K\n\n1\n\na2019form11-k.htm\n\n11-K\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
2,SEC Filings,31380721000009,BP,2021-06-11,11-K,\n\n11-K\n\n1\n\na2020form11-k.htm\n\n11-K\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
3,SEC Filings,31380722000025,BP,2022-06-10,11-K,\n\n11-K\n\n1\n\na2021form11-k.htm\n\n11-K\n\n...,https://www.sec.gov/Archives/edgar/data/000031...
4,SEC Filings,31380723000020,BP,2023-06-13,11-K,\n\n11-K\n\n1\n\na2022form11-k1.htm\n\n11-K\n\...,https://www.sec.gov/Archives/edgar/data/000031...


## Concatenate Sources and Export to CSV

In [9]:
# Concatenate
text_df = pd.concat([invest_df, proquest_df, sec_df], ignore_index=True)
display(text_df.shape)
display(text_df.head())
display(text_df.tail())

(9640, 7)

Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
0,Investment Research,,MRO,2024-05-16,Marathon Oil Corporation,"Stock Report | May 16, 2024 | NYSESymbol: MRO ...",
1,Investment Research,,EOG,2024-05-14,"EOG Resources, Inc.","Stock Report | May 14, 2024 | NYSESymbol: EOG ...",
2,Investment Research,,EOG,2024-05-11,"EOG Resources, Inc.","Stock Report | May 11, 2024 | NYSESymbol: EOG ...",
3,Investment Research,,DVN,2024-05-11,Devon Energy Corporation,"Stock Report | May 11, 2024 | NYSESymbol: DVN ...",
4,Investment Research,,COP,2024-05-07,ConocoPhillips,"Stock Report | May 07, 2024 | NYSESymbol: COP ...",


Unnamed: 0,Source,Unique_ID,Ticker,Date,Article Headline,Article Text,URL
9635,SEC Filings,3408824000000.0,XOM,2024-04-03,8-K,\n\n\n\n\n\n\n\n\n\nxom-20240403\nFALSE\n00000...,https://www.sec.gov/Archives/edgar/data/000003...
9636,SEC Filings,3408824000000.0,XOM,2024-04-26,8-K,\n\n\n\n\n\n\n\n\n\nxom-20240426\nFALSE\n00000...,https://www.sec.gov/Archives/edgar/data/000003...
9637,SEC Filings,3408824000000.0,XOM,2024-04-29,10-Q,\n\n\n\n\n\n\n\n\n\nxom-20240331\nFALSE\n00000...,https://www.sec.gov/Archives/edgar/data/000003...
9638,SEC Filings,95010320000000.0,XOM,2024-05-03,8-K,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.sec.gov/Archives/edgar/data/000003...
9639,SEC Filings,95010320000000.0,XOM,2024-05-10,8-K,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,https://www.sec.gov/Archives/edgar/data/000003...


In [10]:
# Get a count of articles by source and ticker
grouped_counts = text_df.groupby(['Source', 'Ticker']).size().reset_index(name='Count')
print(grouped_counts)

                 Source                 Ticker  Count
0   Investment Research                     BP    277
1   Investment Research                    COP    283
2   Investment Research                    CVX    363
3   Investment Research  Concho Resources Inc.    143
4   Investment Research                    DVN    279
5   Investment Research                    EOG    296
6   Investment Research                   EQNR      5
7   Investment Research                    HES    279
8   Investment Research                    MPC    346
9   Investment Research                    MRO    277
10  Investment Research                    OXY    279
11  Investment Research                   PDCE    305
12  Investment Research                    PSX    277
13  Investment Research                    PXD    320
14  Investment Research                   SHEL    281
15  Investment Research                    TTE    255
16  Investment Research                    VLO    293
17  Investment Research     

In [11]:
# Export as CSV
text_df.to_csv('/content/Consolidated_Text_Data.csv', index=False)

# Note: This export needs to be manually uploaded to Github.