In [8]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
#!pip install nltk # can install on terminal or by uncommenting this line
#import nltk; nltk.download('punkt'); nltk.download('stopwords')
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer

## lda
#!pip install gensim # can install by uncommenting this line
from gensim import corpora
import gensim

## visualizing LDA--likely need to install
#!pip install pyLDAvis # can install by uncommenting this line
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook()

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random
import string; punctlist = [char for char in string.punctuation] # list of english punctuation marks

!pip install PyMuPDF
import fitz  # PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.23.6-cp39-none-macosx_10_9_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.23.6 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.6-py3-none-macosx_10_9_x86_64.whl.metadata (1.3 kB)
Downloading PyMuPDF-1.23.6-cp39-none-macosx_10_9_x86_64.whl (4.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading PyMuPDFb-1.23.6-py3-none-macosx_10_9_x86_64.whl (30.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.1/30.1 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.6 PyMuPDFb-1.23.6


In [1]:
from pdfminer.high_level import extract_text

extract_text("Data/Barclays/1st indy Barclays to pay £290m penalties as Bob Diamond forgoes bonus.pdf")

'User Name: AYCEID ServicetoService\nDate and Time: Monday, May 11, 2020 7:43:00 AM EDT\nJob Number: 116541938\n\n1. Barclays to pay £290m penalties as Bob Diamond forgoes bonus\n\nDocument (1)\n\n| About LexisNexis | Privacy Policy | Terms & Conditions | Copyright © 2020 LexisNexis\n\nAYCEID ServicetoService\n\n\x0cBarclays to pay £290m penalties as Bob Diamond forgoes bonus\n\nIndependent.co.uk\n\nJune 27, 2012 Wednesday 6:18 PM GMT\n\nCopyright 2012 Independent News & Media plc All Rights Reserved\n\nSection: BUSINESS NEWS\n\nLength: 718 words\n\nByline: Peter Cripps\nBody\n\nThe banking industry was engulfed in a fresh scandal today after Barclays paid £290 million to settle claims that it \nused underhand tactics to try to rig financial markets. \n\nThe penalties from UK and US regulators, including a record £59.5 million fine from the Financial Services Authority \n(FSA),  followed  allegations  it  manipulated  Libor  and  Euribor  interbank  lending,  which  govern  the  rates 

In [9]:
def read_data(folder_path):
    data = {}
    for subdir, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            if file.endswith('.pdf'):
                with fitz.open(file_path) as pdf_document:
                    text = ""
                    for page_num in range(pdf_document.page_count):
                        page = pdf_document[page_num]
                        text += page.get_text()

                data[file] = text
            else:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data[file] = f.read()
    return data


In [4]:
# Function for text preprocessing
def preprocess_text(text):
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Tokenization
    tokens = word_tokenize(text.lower())
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Stemming
    porter = PorterStemmer()
    tokens = [porter.stem(word) for word in tokens]
    
    return tokens

In [5]:
# Function to perform text analysis
def text_analysis(data):
    # Convert data to a pandas DataFrame
    df = pd.DataFrame(list(data.items()), columns=['File', 'Content'])

    # Preprocess text
    df['Processed_Content'] = df['Content'].apply(preprocess_text)

    # Create a dictionary and a corpus for the LDA model
    dictionary = corpora.Dictionary(df['Processed_Content'])
    corpus = [dictionary.doc2bow(text) for text in df['Processed_Content']]

    # Apply Latent Dirichlet Allocation (LDA) for topic modeling
    lda_model = gensim.models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)

    # Add topics to the DataFrame
    df['Topic'] = df['Processed_Content'].apply(lambda x: lda_model[dictionary.doc2bow(x)][0][0])

    return df, lda_model, dictionary

In [6]:
# Function to visualize topics using pyLDAvis
def visualize_topics(lda_model, dictionary, df):
    vis_data = gensimvis.prepare(lda_model,
                                 df['Processed_Content'].apply(lambda x: dictionary.doc2bow(x)), dictionary)
    pyLDAvis.display(vis_data)

In [18]:
# Example usage
folder_path = '.../Data/Barclays'
data = read_data(folder_path)
print(data)
#df, lda_model, dictionary = text_analysis(data)
#visualize_topics(lda_model, dictionary, df)

{}


In [37]:
!pip install slate3k

Collecting slate3k
  Downloading slate3k-0.5.3-py2.py3-none-any.whl (7.9 kB)
Collecting pdfminer3k (from slate3k)
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ply (from pdfminer3k->slate3k)
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ply, pdfminer3k, slate3k
Successfully installed pdfminer3k-1.3.4 ply-3.11 slate3k-0.5.3


In [40]:
import os
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import slate3k as slate

# Folder path containing PDF files
folder_path = "/Users/angel_jo/Documents/GitHub/QSS20_Personal/Final Project/Data"

# Function to extract text from PDF files using slate3k
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as file:
        doc = slate.PDF(file)
        for page in doc:
            text += page.text
    return text

# Read text from PDF files in the specified folder
pdf_texts = []
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)
        if text:
            pdf_texts.append(text)
            print(f"Length of text in {filename}: {len(text)}")

# Create a DataFrame from the extracted text
df = pd.DataFrame({"Text": pdf_texts})

# Check if there is non-empty text
if not df.empty:
    # Create a Document-Term Matrix (DTM)
    list_stopwords = stopwords.words("english")
    custom_words_to_add = ['bank', 'barclays', 'telegraph', 'guardian', 'independent', 'lexisnexis']
    list_stopwords_new = list_stopwords + custom_words_to_add
    
    vectorizer = CountVectorizer(stop_words=list_stopwords_new)
    dtm = vectorizer.fit_transform(df['Text'])

    # Example: Display the DTM
    print(dtm)
else:
    print("No non-empty text found in the PDFs.")


No non-empty text found in the PDFs.


In [39]:
!pip install textract

Collecting textract
  Downloading textract-1.6.5-py3-none-any.whl (23 kB)
Collecting argcomplete~=1.10.0 (from textract)
  Downloading argcomplete-1.10.3-py2.py3-none-any.whl (36 kB)
Collecting beautifulsoup4~=4.8.0 (from textract)
  Downloading beautifulsoup4-4.8.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.9/106.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting chardet==3.* (from textract)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docx2txt~=0.8 (from textract)
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting extract-msg<=0.29.* (from textract)
  Downloading extract_msg-0.28.7-py2.py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.0/69.0 kB[0m [31m2.5 M

In [41]:
# Folder path containing PDF files
folder_path = "/Users/angel_jo/Documents/GitHub/QSS20_Personal/Final Project/Data"

# Function to extract text from PDF files using Textract
def extract_text_from_pdf(pdf_path):
    try:
        text = textract.process(pdf_path).decode("utf-8")
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

# Read text from PDF files in the specified folder
pdf_texts = []
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)
        if text:
            pdf_texts.append(text)
            print(f"Length of text in {filename}: {len(text)}")

# Create a DataFrame from the extracted text
df = pd.DataFrame({"Text": pdf_texts})

# Check if there is non-empty text
if not df.empty:
    # Create a Document-Term Matrix (DTM)
    list_stopwords = stopwords.words("english")
    custom_words_to_add = ['bank', 'barclays', 'telegraph', 'guardian', 'independent', 'lexisnexis']
    list_stopwords_new = list_stopwords + custom_words_to_add
    
    vectorizer = CountVectorizer(stop_words=list_stopwords_new)
    dtm = vectorizer.fit_transform(df['Text'])

    # Example: Display the DTM
    print(dtm)
else:
    print("No non-empty text found in the PDFs.")


No non-empty text found in the PDFs.


In [42]:
import os
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import fitz  # PyMuPDF

# Folder path containing PDF files
folder_path = "/Users/angel_jo/Documents/GitHub/QSS20_Personal/Final Project/Data"

# Function to extract text from PDF files using PyMuPDF
def extract_text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as pdf_doc:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc[page_num]
            text += page.get_text()
    return text

# Read text from PDF files in the specified folder
pdf_texts = []
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)
        if text:
            pdf_texts.append(text)
            print(f"Length of text in {filename}: {len(text)}")

# Create a DataFrame from the extracted text
df = pd.DataFrame({"Text": pdf_texts})

# Check if there is non-empty text
if not df.empty:
    # Create a Document-Term Matrix (DTM)
    list_stopwords = stopwords.words("english")
    custom_words_to_add = ['apartment', 'new york', 'nyc', 'bronx', 'brooklyn', 'manhattan', 'queens', 'staten island']
    list_stopwords_new = list_stopwords + custom_words_to_add
    
    vectorizer = CountVectorizer(stop_words=list_stopwords_new)
    dtm = vectorizer.fit_transform(df['Text'])

    # Example: Display the DTM
    print(dtm)
else:
    print("No non-empty text found in the PDFs.")


No non-empty text found in the PDFs.


In [43]:
!pip install pdf2image pytesseract


Collecting pdf2image
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.16.3 pytesseract-0.3.10


In [44]:
import os
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from pdf2image import convert_from_path
import pytesseract

# Set the path to the Tesseract executable (change accordingly)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Folder path containing PDF files
folder_path = "/Users/angel_jo/Documents/GitHub/QSS20_Personal/Final Project/Data"

# Function to extract text from PDF files using OCR
def extract_text_from_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for i, image in enumerate(images):
        text += pytesseract.image_to_string(image)
    return text

# Read text from PDF files in the specified folder
pdf_texts = []
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)
        if text:
            pdf_texts.append(text)
            print(f"Length of text in {filename}: {len(text)}")

# Create a DataFrame from the extracted text
df = pd.DataFrame({"Text": pdf_texts})

# Check if there is non-empty text
if not df.empty:
    # Create a Document-Term Matrix (DTM)
    list_stopwords = stopwords.words("english")
    custom_words_to_add = ['apartment', 'new york', 'nyc', 'bronx', 'brooklyn', 'manhattan', 'queens', 'staten island']
    list_stopwords_new = list_stopwords + custom_words_to_add
    
    vectorizer = CountVectorizer(stop_words=list_stopwords_new)
    dtm = vectorizer.fit_transform(df['Text'])

    # Example: Display the DTM
    print(dtm)
else:
    print("No non-empty text found in the PDFs.")


No non-empty text found in the PDFs.


## Assuming text model and visualisation worked

In [None]:
#look at pocentage drop