In [1]:
!pip install google-cloud-storage pdfplumber nltk langdetect pandas

Collecting pdfplumber
  Downloading pdfplumber-0.10.3-py3-none-any.whl.metadata (38 kB)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
      --------------------------------------- 0.0/1.5 MB 640.0 kB/s eta 0:00:03
     - -------------------------------------- 0.0/1.5 MB 487.6 kB/s eta 0:00:04
     -- ------------------------------------- 0.1/1.5 MB 762.6 kB/s eta 0:00:02
     --- ------------------------------------ 0.1/1.5 MB 717.5 kB/s eta 0:00:02
     ---- ----------------------------------- 0.2/1.5 MB 817.0 kB/s eta 0:00:02
     ----- ---------------------------------- 0.2/1.5 MB 915.1 kB/s eta 0:00:02
     --------- ------------------------------ 0.4/1.5 MB 1.4 MB/s eta 0:00:01
     --------- ------------------------------ 0.4/1.5 MB 1.4 MB/s eta 0:00:01
     --------- ------------------------------ 0.4/1.5 MB 1.1 MB/s e

In [28]:
import os
from dotenv import load_dotenv, find_dotenv
import pdfplumber
from google.cloud import storage
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from langdetect import detect
from nltk.probability import FreqDist
from collections import Counter

load_dotenv(find_dotenv())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\koush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Runtime configuration, read from environment variables.
# Each value is None when its variable is not set.
_env = os.environ

SERVICE_ACCOUNT_KEY = _env.get("GCS_KEY_PATH")      # value of GCS_KEY_PATH
GCP_PROJECT_ID = _env.get("GCP_PROJECT_ID")         # value of GCP_PROJECT_ID
GCS_BUCKET_NAME = _env.get("GCS_BUCKET_NAME")       # bucket name passed to download_pdfs
WATCH_DIRECTORY = _env.get("WATCH_DIRECTORY")       # local folder scanned for PDFs

In [7]:
def download_pdfs(bucket_name, destination_folder, prefix="raw_data/"):
    """Download every ``.pdf`` object under a bucket prefix into a local folder.

    Args:
        bucket_name: Name of the Google Cloud Storage bucket to read from.
        destination_folder: Local directory that receives the files. It is
            created (including missing parents) if it does not exist yet.
        prefix: Object-name prefix to list. Defaults to ``"raw_data/"``, the
            location this notebook assumes the PDFs live under.

    Returns:
        None. One "Downloaded ..." line is printed per file.
    """
    # Make sure the target directory exists before any file is written into it.
    os.makedirs(destination_folder, exist_ok=True)

    # The client is built with no arguments, so credentials and project are
    # whatever the library resolves by default.
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=prefix):
        if not blob.name.endswith('.pdf'):
            continue
        # Keep only the last path component so the local folder stays flat.
        # NOTE(review): objects sharing a basename under different sub-prefixes
        # would overwrite each other locally.
        destination_file_name = os.path.join(destination_folder, blob.name.split('/')[-1])
        blob.download_to_filename(destination_file_name)
        print(f"Downloaded {blob.name} to {destination_file_name}")

# Usage
# download_pdfs(GCS_BUCKET_NAME, 'local-folder')

# Text Extraction

In [9]:
def extract_text_from_pdfs(folder_path):
    """Extract the text of every ``.pdf`` file directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping each PDF filename to its page texts joined with newlines.
        Pages for which ``extract_text()`` returns nothing are skipped.
    """
    texts = {}
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        with pdfplumber.open(file_path) as pdf:
            # Run the extraction once per page and filter afterwards, instead
            # of calling extract_text() both in the filter and for the value.
            page_texts = [page.extract_text() for page in pdf.pages]
        texts[filename] = '\n'.join(text for text in page_texts if text)
    return texts

# Usage: extract the text of every PDF in WATCH_DIRECTORY into a
# {filename: text} dictionary that the later cells reuse.
texts = extract_text_from_pdfs(WATCH_DIRECTORY)


In [10]:
texts

{'Article 39 - Exempt Officers and Sergeants Modified Duty Program.pdf': 'Office of Employee Relations\nExempt Officers’ and Sergeants’ Modified Duty Program\nBACKGROUND\n1. The City and San Jose Police Officers’ Association (SJPOA) recognize that, despite best\nefforts to promote safety, police officers and sergeants are injured in the line of duty. Such\ninjuries are unfortunate but can be a consequence of police work. The Exempt Officers’\nand Sergeants’ Modified Duty Program (“Program”) is available to any police officer or\nsergeant that has work-related or non-work related injuries or illnesses which preclude\nhim or her from performing the full scope of his or her duties without accommodation.\n2. The City and SJPOA recognize that police officers and sergeants exist to enforce the law\nand protect public safety. Some essential job duties of a police officer and sergeant\ninclude, but are not limited to, engaging in physically strenuous tasks, such as making\nforcible arrests, pu

In [14]:
# Build a two-column frame with one row per PDF: the filename and its text.
df_texts = pd.DataFrame({
    'Filename': list(texts.keys()),
    'Text': list(texts.values()),
})

# Show the plain-text rendering of the frame.
print(df_texts)

                                             Filename  \
0   Article 39 - Exempt Officers and Sergeants Mod...   
1   Association of Building Mechanical and Electri...   
2   Association of Engineers and Architects IFPTE ...   
3   Association of Engineers and Architects IFTPE ...   
4   Association of Legal Professionals of San Jose...   
5   Association of Maintenance Supervisory Personn...   
6   City Association of Management Personnel IFPTE...   
7   International Brotherhood of Electrical Worker...   
8   Municipal Employees Federation (MEF) AFSCME Lo...   
9   Municipal Employees Federation (MEF) Part Time...   
10                       Officers Transfer Policy.pdf   
11                         Overtime Staffing Plan.pdf   
12                                 POPRA 2023 MOA.pdf   
13            Promotional Memorandum of Agreement.pdf   
14  San Jos Fire Fighters - International Associat...   
15  San Jose Police Officers Association (POA) MOA...   
16                      Sergean

# Descriptive Statistics

In [16]:
def descriptive_statistics(texts):
    """Count the tokens in each document.

    Args:
        texts: dict mapping a filename to that document's text.

    Returns:
        list of ``{'filename': ..., 'word_count': ...}`` dicts, one per entry
        of ``texts`` and in the same order. ``word_count`` is the number of
        tokens ``word_tokenize`` produces for the text.
    """
    return [
        {'filename': name, 'word_count': len(word_tokenize(body))}
        for name, body in texts.items()
    ]

# Usage: one {'filename', 'word_count'} record per extracted document.
stats = descriptive_statistics(texts)


In [17]:
stats

[{'filename': 'Article 39 - Exempt Officers and Sergeants Modified Duty Program.pdf',
  'word_count': 3381},
 {'filename': 'Association of Building Mechanical and Electrical Inspectors (ABMEI) MOA.pdf',
  'word_count': 19181},
 {'filename': 'Association of Engineers and Architects IFPTE Local 21 Unit 43 MOA.pdf',
  'word_count': 17633},
 {'filename': 'Association of Engineers and Architects IFTPE Local 21 Units 4142 MOA.pdf',
  'word_count': 17930},
 {'filename': 'Association of Legal Professionals of San Jose (ALP).pdf',
  'word_count': 12159},
 {'filename': 'Association of Maintenance Supervisory Personnel IFPTE Local 21 (AMSP) MOA.pdf',
  'word_count': 15324},
 {'filename': 'City Association of Management Personnel IFPTE Local 21 (CAMP) MOA.pdf',
  'word_count': 15521},
 {'filename': 'International Brotherhood of Electrical Workers (IBEW) Local No. 332 MOA.pdf',
  'word_count': 21843},
 {'filename': 'Municipal Employees Federation (MEF) AFSCME Local 101 MOA.pdf',
  'word_count': 401

In [27]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def extract_text_and_analyze(folder_path):
    """Compute word-count statistics for every ``.pdf`` file in a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A 3-tuple ``(analysis_report, avg_words_per_document, avg_words_per_page)``:

        * ``analysis_report``: list of dicts with keys ``'filename'``,
          ``'total_words'`` and ``'average_words_per_page'``. The per-document
          average divides by *all* pages of that document.
        * ``avg_words_per_document``: total words divided by the number of PDFs.
        * ``avg_words_per_page``: total words divided by the number of pages
          that yielded text (pages without extractable text are not counted).

        Any average whose denominator is zero (no PDFs, no pages with text, or
        a PDF without pages) is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    analysis_report = []
    total_words_across_documents = 0
    total_documents = 0
    total_pages = 0  # pages that produced text, summed over all documents

    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        with pdfplumber.open(file_path) as pdf:
            total_pages_in_document = len(pdf.pages)
            total_words_in_document = 0

            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    total_words_in_document += len(word_tokenize(text))
                    total_pages += 1

        total_documents += 1
        total_words_across_documents += total_words_in_document
        analysis_report.append({
            'filename': filename,
            'total_words': total_words_in_document,
            'average_words_per_page': _safe_ratio(total_words_in_document, total_pages_in_document),
        })

    overall_average_words_per_document = _safe_ratio(total_words_across_documents, total_documents)
    overall_average_words_per_page = _safe_ratio(total_words_across_documents, total_pages)

    return analysis_report, overall_average_words_per_document, overall_average_words_per_page

# Usage: run the word-count analysis over the watched folder.
folder_path = WATCH_DIRECTORY
report, avg_words_doc, avg_words_page = extract_text_and_analyze(folder_path)

# Report the two corpus-wide averages, one per line.
print(
    f'Average Words per Document: {avg_words_doc}',
    f'Average Words per Page: {avg_words_page}',
    sep='\n',
)

Average Words per Document: 15971.578947368422
Average Words per Page: 448.905325443787


# Language Detection and Standardization

In [19]:
def detect_languages(texts):
    """Detect the language of each document.

    Args:
        texts: dict mapping a filename to that document's text.

    Returns:
        list of ``{'filename': ..., 'language': ...}`` dicts, one per entry of
        ``texts``. ``language`` is whatever ``detect`` returns, or the string
        ``"Error"`` when detection raises an exception.
    """
    language_stats = []
    for filename, text in texts.items():
        try:
            language = detect(text)
        except Exception:
            # Best effort: a document that cannot be classified is recorded as
            # "Error". Catching Exception rather than using a bare `except`
            # lets KeyboardInterrupt / SystemExit propagate, so the cell can
            # still be interrupted.
            # NOTE(review): langdetect's README says results can vary between
            # runs unless DetectorFactory.seed is set. Consider seeding it.
            language = "Error"
        language_stats.append({'filename': filename, 'language': language})
    return language_stats

# Usage: one {'filename', 'language'} record per extracted document.
languages = detect_languages(texts)

In [20]:
languages

[{'filename': 'Article 39 - Exempt Officers and Sergeants Modified Duty Program.pdf',
  'language': 'en'},
 {'filename': 'Association of Building Mechanical and Electrical Inspectors (ABMEI) MOA.pdf',
  'language': 'en'},
 {'filename': 'Association of Engineers and Architects IFPTE Local 21 Unit 43 MOA.pdf',
  'language': 'en'},
 {'filename': 'Association of Engineers and Architects IFTPE Local 21 Units 4142 MOA.pdf',
  'language': 'en'},
 {'filename': 'Association of Legal Professionals of San Jose (ALP).pdf',
  'language': 'en'},
 {'filename': 'Association of Maintenance Supervisory Personnel IFPTE Local 21 (AMSP) MOA.pdf',
  'language': 'en'},
 {'filename': 'City Association of Management Personnel IFPTE Local 21 (CAMP) MOA.pdf',
  'language': 'en'},
 {'filename': 'International Brotherhood of Electrical Workers (IBEW) Local No. 332 MOA.pdf',
  'language': 'en'},
 {'filename': 'Municipal Employees Federation (MEF) AFSCME Local 101 MOA.pdf',
  'language': 'en'},
 {'filename': 'Munici

# Data Quality Report

In [21]:
def extract_text_with_quality_check(folder_path):
    """Extract text from every ``.pdf`` in a folder, recording per-file outcomes.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A 2-tuple ``(extracted_texts, quality_report)``:

        * ``extracted_texts``: dict mapping filename to the newline-joined text
          of its pages, for files that were processed without an exception.
        * ``quality_report``: list of ``{'filename', 'status', 'error'}`` dicts.
          ``status`` is ``'Success'`` or ``'Failed'``; ``error`` is ``None`` on
          success and ``str(exception)`` on failure.
    """
    quality_report = []
    extracted_texts = {}

    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with pdfplumber.open(file_path) as pdf:
                # Extract each page once and filter afterwards, instead of
                # calling extract_text() both in the filter and for the value.
                page_texts = [page.extract_text() for page in pdf.pages]
            extracted_texts[filename] = '\n'.join(text for text in page_texts if text)
            # NOTE(review): a file whose pages yield no text at all is still
            # reported as 'Success' with an empty string.
            quality_report.append({'filename': filename, 'status': 'Success', 'error': None})
        except Exception as e:
            # Deliberate best effort: one unreadable file must not abort the run.
            quality_report.append({'filename': filename, 'status': 'Failed', 'error': str(e)})

    return extracted_texts, quality_report

# Usage: re-extract every PDF in WATCH_DIRECTORY, capturing per-file failures.
# NOTE(review): this rebinds both `texts` and `report`. `report` previously
# held the per-document word statistics returned by extract_text_and_analyze.
folder_path = WATCH_DIRECTORY
texts, report = extract_text_with_quality_check(folder_path)

In [22]:
report

[{'filename': 'Article 39 - Exempt Officers and Sergeants Modified Duty Program.pdf',
  'status': 'Success',
  'error': None},
 {'filename': 'Association of Building Mechanical and Electrical Inspectors (ABMEI) MOA.pdf',
  'status': 'Success',
  'error': None},
 {'filename': 'Association of Engineers and Architects IFPTE Local 21 Unit 43 MOA.pdf',
  'status': 'Success',
  'error': None},
 {'filename': 'Association of Engineers and Architects IFTPE Local 21 Units 4142 MOA.pdf',
  'status': 'Success',
  'error': None},
 {'filename': 'Association of Legal Professionals of San Jose (ALP).pdf',
  'status': 'Success',
  'error': None},
 {'filename': 'Association of Maintenance Supervisory Personnel IFPTE Local 21 (AMSP) MOA.pdf',
  'status': 'Success',
  'error': None},
 {'filename': 'City Association of Management Personnel IFPTE Local 21 (CAMP) MOA.pdf',
  'status': 'Success',
  'error': None},
 {'filename': 'International Brotherhood of Electrical Workers (IBEW) Local No. 332 MOA.pdf',
  

# Preliminary Text Analysis

In [24]:
def preliminary_text_analysis(texts, top_n=50):
    """Find the most frequent tokens across all documents.

    Args:
        texts: dict mapping a filename to that document's text. Only the
            values are used.
        top_n: How many of the most common tokens to return. Defaults to 50.

    Returns:
        list of ``(token, count)`` tuples as produced by
        ``FreqDist.most_common(top_n)``. Tokens are counted exactly as
        ``word_tokenize`` emits them (case-sensitive, punctuation included).
    """
    fdist = FreqDist()
    for text in texts.values():
        # Feed each document straight into the counter rather than first
        # concatenating every token of the corpus into one list.
        fdist.update(word_tokenize(text))
    return fdist.most_common(top_n)

# Usage: (token, count) pairs for the most frequent tokens across all documents.
common_words = preliminary_text_analysis(texts)

In [25]:
common_words

[('the', 17598),
 (',', 11865),
 ('of', 9821),
 ('.', 7900),
 ('to', 7878),
 ('and', 4620),
 ('or', 4415),
 ('employee', 4303),
 ('be', 4205),
 ('in', 3995),
 ('shall', 3985),
 ('a', 3612),
 ('for', 3247),
 (')', 2983),
 ('(', 2839),
 ('City', 2685),
 ('is', 2121),
 ('by', 1919),
 ('$', 1884),
 ('The', 1870),
 ('as', 1603),
 ('any', 1592),
 ('on', 1495),
 ('an', 1457),
 ('not', 1437),
 ('leave', 1364),
 ('that', 1340),
 ('may', 1327),
 ('with', 1322),
 ('will', 1313),
 ('such', 1279),
 ('hours', 1268),
 ('time', 1258),
 ('this', 1176),
 ('employees', 1170),
 ('pay', 1052),
 ('work', 1042),
 ('1', 1033),
 ('’', 936),
 ('from', 931),
 ('June', 918),
 ('s', 918),
 ('30', 901),
 ('Union', 899),
 ('at', 879),
 ('Employee', 848),
 ('are', 816),
 ('period', 779),
 ('–', 771),
 ('which', 752)]

# Basic Tokenization and Analysis

In [29]:
def basic_nlp_analysis(texts, top_n=20, top_n_per_document=5):
    """Tokenize every document and summarise token frequencies.

    Args:
        texts: dict mapping a filename to that document's text.
        top_n: Number of most common tokens to report for the whole corpus.
            Defaults to 20.
        top_n_per_document: Number of most common tokens to report for each
            individual document. Defaults to 5.

    Returns:
        A 2-tuple ``(most_common_tokens, document_specific_stats)``:

        * ``most_common_tokens``: list of ``(token, count)`` tuples over all
          documents combined.
        * ``document_specific_stats``: dict mapping filename to a dict with
          ``'total_tokens'``, ``'unique_tokens'`` and ``'most_common_tokens'``.
    """
    freq_dist = FreqDist()
    document_specific_stats = {}

    for filename, text in texts.items():
        tokens = word_tokenize(text)

        # Corpus-wide counts accumulate document by document.
        freq_dist.update(tokens)

        # Per-document counts; len(counter) is the number of distinct tokens.
        per_document_counts = Counter(tokens)
        document_specific_stats[filename] = {
            'total_tokens': len(tokens),
            'unique_tokens': len(per_document_counts),
            'most_common_tokens': per_document_counts.most_common(top_n_per_document),
        }

    most_common_tokens = freq_dist.most_common(top_n)
    return most_common_tokens, document_specific_stats

# Usage
# Assuming 'texts' is your dictionary with filenames as keys and extracted text as values
most_common_tokens, document_stats = basic_nlp_analysis(texts)

# Output examples
print("Most Common Tokens:", most_common_tokens)
# The loop variable is deliberately not named `stats`: that global already holds
# the list returned by descriptive_statistics(texts), and reusing the name here
# would silently replace it with the last document's dict.
for doc, doc_stats in document_stats.items():
    print(f"Stats for {doc}: Total Tokens - {doc_stats['total_tokens']}, Unique Tokens - {doc_stats['unique_tokens']}, Most Common Tokens - {doc_stats['most_common_tokens']}")

Most Common Tokens: [('the', 17598), (',', 11865), ('of', 9821), ('.', 7900), ('to', 7878), ('and', 4620), ('or', 4415), ('employee', 4303), ('be', 4205), ('in', 3995), ('shall', 3985), ('a', 3612), ('for', 3247), (')', 2983), ('(', 2839), ('City', 2685), ('is', 2121), ('by', 1919), ('$', 1884), ('The', 1870)]
Stats for Article 39 - Exempt Officers and Sergeants Modified Duty Program.pdf: Total Tokens - 3381, Unique Tokens - 681, Most Common Tokens - [('the', 226), (',', 133), ('of', 130), ('.', 117), ('and', 103)]
Stats for Association of Building Mechanical and Electrical Inspectors (ABMEI) MOA.pdf: Total Tokens - 19181, Unique Tokens - 2652, Most Common Tokens - [('the', 1179), (',', 804), ('of', 657), ('to', 480), ('.', 472)]
Stats for Association of Engineers and Architects IFPTE Local 21 Unit 43 MOA.pdf: Total Tokens - 17633, Unique Tokens - 2533, Most Common Tokens - [('the', 1120), (',', 741), ('of', 606), ('to', 474), ('.', 436)]
Stats for Association of Engineers and Architec