In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import socket
import pandas as pd
import pdfplumber
import os
from tqdm import tqdm
import xlrd
import xlwt

In [2]:
# Load the combined search output CSV file
df = pd.read_csv('Investment Research/Investment_Research_All_V2.csv')

# Display the resulting DataFrame
print(df.head())

# Save the DataFrame to a new CSV file
df.to_csv('Investment Research/Investment_Research_All_V2.csv', index=False)

            Contributor              Analyst              Date/Time Company  \
0  CFRA Equity Research  Handshoe, Jonnathan  May 16, 2024 10:10 PM     MRO   
1  CFRA Equity Research    Glickman, Stewart  May 14, 2024 10:02 PM     EOG   
2  CFRA Equity Research    Glickman, Stewart  May 11, 2024 06:08 PM     EOG   
3  CFRA Equity Research    Glickman, Stewart  May 11, 2024 05:57 PM     DVN   
4  CFRA Equity Research    Glickman, Stewart  May 07, 2024 10:03 PM     COP   

                   Headline  Pages         Date               Headline2  \
0  Marathon Oil Corporation      9  16-May-2024  MarathonOilCorporation   
1       EOG Resources, Inc.      9  14-May-2024       EOGResources,Inc.   
2       EOG Resources, Inc.      9  11-May-2024       EOGResources,Inc.   
3  Devon Energy Corporation      9  11-May-2024  DevonEnergyCorporation   
4            ConocoPhillips      9   7-May-2024          ConocoPhillips   

   Unnamed: 8  
0         NaN  
1         NaN  
2         NaN  
3         

In [3]:
def extract_text_and_title(pdf_path):
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = ""
            # Iterate over all pages
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return text
    except Exception as e:
        print("Error reading " + pdf_path + ": " + str(e))
        return None

def extract_date_from_filename(filename):
    # Regex to match the date format in the filename
    date_pattern = r'(\w{3})_(\d{1,2}),_(\d{4})\.pdf'
    match = re.search(date_pattern, filename)
    if match:
        # Construct date string and return it
        return match.group(1) + " " + match.group(2) + ", " + match.group(3)
    else:
        return 'Date not found'
    
def extract_document_id(title):
    # Regex to match the document ID pattern
    doc_id_pattern = r'(\d+)[^_]*'
    match = re.search(doc_id_pattern, title)
    if match:
        return match.group(1)
    else:
        return 'ID not found'
    
def extract_join_key(filename):
    # Extract the part of the filename following the first _ and ending just prior to .pdf
    join_key_pattern = r'_(.*?)\.pdf'
    match = re.search(join_key_pattern, filename)
    if match:
        return match.group(1)
    else:
        return 'Join_Key not found'

# Directory containing PDFs
pdf_dir = 'Investment Research/PDFs_All_V2'
pdf_files = [f for f in os.listdir(pdf_dir) if f.endswith('.pdf')]

# Extract data from each PDF
data = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_dir, pdf_file)
    text = extract_text_and_title(pdf_path)
    if text is None:
        continue  # Skip files that couldn't be read
    title = pdf_file  # Use file name as title
    source = pdf_file.split('_')[0]  # Extract source from file name
    date = extract_date_from_filename(pdf_file)  # Extract date from filename
    unique_id = extract_document_id(title)
    join_key = extract_join_key(pdf_file)  # Extract join key from filename
    data.append({'Unique_ID': unique_id,'Join_Key': join_key, 'Date': date, 'Title': title, 'Source': source, 'Text': text})

# Create a DataFrame
df = pd.DataFrame(data)

# Ensure Join_Key is a string and clean it
def clean_join_key(key):
    key = re.sub(r'[^\w\s]', '', key.lower())
    key = re.sub(r'_(0)', '_', key)  # Remove '0' following the second underscore
    return key

df['Join_Key'] = df['Join_Key'].astype(str).apply(clean_join_key)

# Save to CSV for new reference file
csv_path = 'Investment Research/Investment_Research_All_DF.csv'
df.to_csv(csv_path, index=False)
print('Data saved to ' + csv_path)

# Load existing CSV from search .csv
existing_csv_path = 'Investment Research/Investment_Research_All_V2_Formatted.csv'
existing_df = pd.read_csv(existing_csv_path)

# Ensure Join_Key is a string and clean it in the existing DataFrame
existing_df['Join_Key'] = existing_df['Join_Key'].astype(str).apply(clean_join_key)

# Merge the 'Text' column from new data to the existing DataFrame based on 'Join_Key'
merged_df = existing_df.merge(df[['Join_Key', 'Text']], on='Join_Key', how='left')

# Select only the desired columns
columns_to_keep = ['Contributor', 'Date/Time', 'Date', 'Company', 'Headline', 'Text']
final_df = merged_df[columns_to_keep]

# Replace values in the "Company" column
replacements = {
    "Pioneer Natural Resources Company": "PXD",
    "Concho Resources Inc.": "CXO",
    "BP.": "BP",
    "PDC Energy, Inc.": "PDCE"
}

final_df['Company'] = final_df['Company'].replace(replacements)

# Rename the "Company" column to "Ticker"
final_df.rename(columns={"Company": "Ticker"}, inplace=True)

# Save to CSV
csv_path2 = 'Investment Research/Investment_Research_ALLV2_Final.csv'
final_df.to_csv(csv_path2, index=False)
print('Updated data saved to ' + csv_path2)

Error reading Investment Research/PDFs_All_V2/ConocoPhillips_ConocoPhillipsCompany_Feb_22,_2020.pdf: Unexpected EOF
Error reading Investment Research/PDFs_All_V2/PioneerNaturalResourcesCompany_PioneerNaturalResourcesCompany_Feb_22,_2020.pdf: Unexpected EOF
Error reading Investment Research/PDFs_All_V2/EOGResources,Inc_EOGResources,Inc_Feb_09,_2020.pdf: Unexpected EOF
Error reading Investment Research/PDFs_All_V2/ValeroEnergyCorporation_ValeroEnergyCorporation_Feb_22,_2020.pdf: Unexpected EOF
Data saved to Investment Research/Investment_Research_All_DF.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Company'] = final_df['Company'].replace(replacements)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.rename(columns={"Company": "Ticker"}, inplace=True)


Updated data saved to Investment Research/Investment_Research_ALLV2_Final.csv


In [4]:
df

Unnamed: 0,Unique_ID,Join_Key,Date,Title,Source,Text
0,02,occidentalpetroleumcorporation_may_2_2020,"May 02, 2020",OccidentalPetroleumCorporation_OccidentalPetro...,OccidentalPetroleumCorporation,"Stock Report | May 02, 2020 | NYSE Symbol: OXY..."
1,28,chevroncorporation_aug_28_2021,"Aug 28, 2021","ChevronCorporation_ChevronCorporation_Aug_28,_...",ChevronCorporation,"Stock Report | August 28, 2021 | NYSE Symbol: ..."
2,28,bpplc_oct_28_2023,"Oct 28, 2023","BPplc_BPplc_Oct_28,_2023.pdf",BPplc,"Stock Report | October 28, 2023 | NYSE Symbol:..."
3,25,marathonpetroleumcorporation_jan_25_2020,"Jan 25, 2020",MarathonPetroleumCorporation_MarathonPetroleum...,MarathonPetroleumCorporation,"Stock Report | January 25, 2020 | NYSE Symbol:..."
4,02,chevroncorporation_nov_2_2020,"Nov 02, 2020","ChevronCorporation_ChevronCorporation_Nov_02,_...",ChevronCorporation,"Stock Report | November 02, 2020 | NYSE Symbol..."
...,...,...,...,...,...,...
4836,05,shellplc_aug_5_2023,"Aug 05, 2023","Shellplc_Shellplc_Aug_05,_2023.pdf",Shellplc,"Stock Report | August 05, 2023 | NYSE Symbol: ..."
4837,27,exxonmobilcorporation_feb_27_2021,"Feb 27, 2021",ExxonMobilCorporation_ExxonMobilCorporation_Fe...,ExxonMobilCorporation,"Stock Report | February 27, 2021 | NYSE Symbol..."
4838,09,marathonoilcorporation_jan_9_2022,"Jan 09, 2022",MarathonOilCorporation_MarathonOilCorporation_...,MarathonOilCorporation,"Stock Report | January 08, 2022 | NYSE Symbol:..."
4839,22,energytransitionclimatealignmentaproxyfortrans...,"Apr 22, 2024","BPplc_EnergyTransition-ClimateAlignment,aproxy...",BPplc,PARIS\nMALIGNED\nII\nClimate alignment assessm...


In [5]:
merged_df

Unnamed: 0,Contributor,Analyst,Date/Time,Company,Headline,Pages,Date,Headline2,Month,Day,Year,Join_Key,Text
0,CFRA Equity Research,"Handshoe, Jonnathan",5/16/24 22:10,MRO,Marathon Oil Corporation,9,2024-05-16,MarathonOilCorporation,May,16,2024,marathonoilcorporation_may_16_2024,"Stock Report | May 16, 2024 | NYSESymbol: MRO ..."
1,CFRA Equity Research,"Glickman, Stewart",5/14/24 22:02,EOG,"EOG Resources, Inc.",9,2024-05-14,"EOGResources,Inc.",May,14,2024,eogresourcesinc_may_14_2024,"Stock Report | May 14, 2024 | NYSESymbol: EOG ..."
2,CFRA Equity Research,"Glickman, Stewart",5/11/24 18:08,EOG,"EOG Resources, Inc.",9,2024-05-11,"EOGResources,Inc.",May,11,2024,eogresourcesinc_may_11_2024,"Stock Report | May 11, 2024 | NYSESymbol: EOG ..."
3,CFRA Equity Research,"Glickman, Stewart",5/11/24 17:57,DVN,Devon Energy Corporation,9,2024-05-11,DevonEnergyCorporation,May,11,2024,devonenergycorporation_may_11_2024,"Stock Report | May 11, 2024 | NYSESymbol: DVN ..."
4,CFRA Equity Research,"Glickman, Stewart",5/7/24 22:03,COP,ConocoPhillips,9,2024-05-07,ConocoPhillips,May,7,2024,conocophillips_may_7_2024,"Stock Report | May 07, 2024 | NYSESymbol: COP ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4891,Argus Research Company,"Selesky, William V.",2/14/19 7:21,PSX,Reaffirming BUY but lowering price target to $116,5,2019-02-14,ReaffirmingBUYbutloweringpricetargetto$116,Feb,14,2019,reaffirmingbuybutloweringpricetargetto116_feb_...,"NYSE: PSX\nPHILLIPS 66\nReport created Feb 13,..."
4892,Argus Research Company,"Selesky, William V.",2/13/19 13:27,SHEL,Reaffirming BUY and target price of $80,5,2019-02-13,ReaffirmingBUYandtargetpriceof$80,Feb,13,2019,reaffirmingbuyandtargetpriceof80_feb_13_2019,NYSE: RDS/A\nROYAL DUTCH SHELL PLC\nReport cre...
4893,Argus Research Company,"Selesky, William V.",2/6/19 7:37,XOM,Strong 4Q18; reaffirming $104 target,5,2019-02-06,Strong4Q18;reaffirming$104target,Feb,6,2019,strong4q18reaffirming104target_feb_6_2019,NYSE: XOM\nEXXON MOBIL CORP\nReport created Fe...
4894,Argus Research Company,"Selesky, William V.",2/5/19 13:39,VLO,Reiterating BUY but lowering price target to $110,5,2019-02-05,ReiteratingBUYbutloweringpricetargetto$110,Feb,5,2019,reiteratingbuybutloweringpricetargetto110_feb_...,NYSE: VLO\nVALERO ENERGY CORP\nReport created ...


In [6]:
existing_df


Unnamed: 0,Contributor,Analyst,Date/Time,Company,Headline,Pages,Date,Headline2,Month,Day,Year,Join_Key
0,CFRA Equity Research,"Handshoe, Jonnathan",5/16/24 22:10,MRO,Marathon Oil Corporation,9,2024-05-16,MarathonOilCorporation,May,16,2024,marathonoilcorporation_may_16_2024
1,CFRA Equity Research,"Glickman, Stewart",5/14/24 22:02,EOG,"EOG Resources, Inc.",9,2024-05-14,"EOGResources,Inc.",May,14,2024,eogresourcesinc_may_14_2024
2,CFRA Equity Research,"Glickman, Stewart",5/11/24 18:08,EOG,"EOG Resources, Inc.",9,2024-05-11,"EOGResources,Inc.",May,11,2024,eogresourcesinc_may_11_2024
3,CFRA Equity Research,"Glickman, Stewart",5/11/24 17:57,DVN,Devon Energy Corporation,9,2024-05-11,DevonEnergyCorporation,May,11,2024,devonenergycorporation_may_11_2024
4,CFRA Equity Research,"Glickman, Stewart",5/7/24 22:03,COP,ConocoPhillips,9,2024-05-07,ConocoPhillips,May,7,2024,conocophillips_may_7_2024
...,...,...,...,...,...,...,...,...,...,...,...,...
4887,Argus Research Company,"Selesky, William V.",2/14/19 7:21,PSX,Reaffirming BUY but lowering price target to $116,5,2019-02-14,ReaffirmingBUYbutloweringpricetargetto$116,Feb,14,2019,reaffirmingbuybutloweringpricetargetto116_feb_...
4888,Argus Research Company,"Selesky, William V.",2/13/19 13:27,SHEL,Reaffirming BUY and target price of $80,5,2019-02-13,ReaffirmingBUYandtargetpriceof$80,Feb,13,2019,reaffirmingbuyandtargetpriceof80_feb_13_2019
4889,Argus Research Company,"Selesky, William V.",2/6/19 7:37,XOM,Strong 4Q18; reaffirming $104 target,5,2019-02-06,Strong4Q18;reaffirming$104target,Feb,6,2019,strong4q18reaffirming104target_feb_6_2019
4890,Argus Research Company,"Selesky, William V.",2/5/19 13:39,VLO,Reiterating BUY but lowering price target to $110,5,2019-02-05,ReiteratingBUYbutloweringpricetargetto$110,Feb,5,2019,reiteratingbuybutloweringpricetargetto110_feb_...


In [10]:
# Load the CSV file
csv_path2 = 'Investment Research/Investment_Research_ALLV2_Final.csv'
df = pd.read_csv(csv_path2)

# Function to truncate text at "Glossary"
def truncate_text(text):
    if isinstance(text, str):
        glossary_index = text.find("Glossary")
        if glossary_index != -1:
            return text[:glossary_index]
    return text

# Apply the function to rows where "Contributor" contains "CFRA"
df.loc[df['Contributor'].str.contains('CFRA', na=False), 'Text'] = df.loc[df['Contributor'].str.contains('CFRA', na=False), 'Text'].apply(truncate_text)

# Save the modified dataframe to a new CSV file
new_csv_path = csv_path2.replace('.csv', '_Trimmed.csv')
df.to_csv(new_csv_path, index=False)

print("File saved as " + new_csv_path)

File saved as Investment Research/Investment_Research_ALLV2_Final_Trimmed.csv


In [11]:
# Load the CSV file
csv_path2 = 'Investment Research/Investment_Research_ALLV2_Final_Trimmed.csv'
df = pd.read_csv(csv_path2)

# Function to find the nearest closing quote around the midpoint
def find_nearest_closing_quote(df, midpoint):
    # Search for the closing quote in the "Text" column
    closing_quotes = df['Text'].str.endswith('"', na=False)
    
    # Find the index of the nearest closing quote before or after the midpoint
    before_midpoint = closing_quotes[:midpoint][::-1].idxmax()
    after_midpoint = closing_quotes[midpoint:].idxmax() + midpoint
    
    # Choose the nearest one
    if midpoint - before_midpoint <= after_midpoint - midpoint:
        return before_midpoint + 1  # +1 to include the closing quote line
    else:
        return after_midpoint + 1  # +1 to include the closing quote line

# Calculate the midpoint of the dataframe
midpoint = len(df) // 2

# Find the nearest closing quote around the midpoint
split_point = find_nearest_closing_quote(df, midpoint)

# Split the dataframe into two halves
df1 = df.iloc[:split_point]
df2 = df.iloc[split_point:]

# Save each half to a new CSV file
csv_path1 = csv_path2.replace('.csv', '_1.csv')
csv_path2 = csv_path2.replace('.csv', '_2.csv')

df1.to_csv(csv_path1, index=False)
df2.to_csv(csv_path2, index=False)

print("Files saved as " + csv_path1 + " and " + csv_path2)

Files saved as Investment Research/Investment_Research_ALLV2_Final_Trimmed_1.csv and Investment Research/Investment_Research_ALLV2_Final_Trimmed_2.csv
