<a href="https://colab.research.google.com/github/MK316/Fall2023/blob/main/Pdf_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. PDF to text

In [None]:
!pip install PyPDF2

+ Upload a PDF file from your computer

In [None]:
from google.colab import files

# Open the Colab upload widget; the result is keyed by uploaded file name.
uploaded = files.upload()

# Report each received file together with the length of its content.
for fn, payload in uploaded.items():
  size_in_bytes = len(payload)
  print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=size_in_bytes))

+ Convert pdf => text
+ Save text as 'mytext.txt'



In [None]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages in page order, with a newline between
        consecutive pages. A page for which the reader returns no text
        contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        # Create a PDF reader
        reader = PyPDF2.PdfReader(file)

        # Extract while the file is still open; "or ''" guards against a
        # page whose extraction yields None.
        page_texts = [page.extract_text() or "" for page in reader.pages]

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next (which would corrupt later word counts).
    return "\n".join(page_texts)

# Fail with a clear message (instead of an IndexError) when the upload was
# cancelled or no file was selected.
if not uploaded:
    raise ValueError("No file was uploaded. Run the upload cell again and choose a PDF.")

pdf_filename = next(iter(uploaded))  # name of the first uploaded file
pdf_text = extract_text_from_pdf(pdf_filename)

# Save the extracted text to "mytext.txt" in the Colab folder
with open("mytext.txt", "w") as file:
    file.write(pdf_text)

# Number of extracted characters (displayed as the cell output)
len(pdf_text)


# Word Frequency using **NLTK**

+ Install and import packages

In [None]:
!pip install nltk

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.text import Text
import pandas as pd

nltk.download('stopwords')
nltk.download('punkt')

+ Read text 'mytext.txt'

In [None]:
# Load the saved text file back into memory as a single string.
with open('mytext.txt', 'r') as text_file:
    text = text_file.read()

+ The regular expression `^[a-zA-Z]+$` keeps only tokens made up entirely of one or more ASCII letters, which excludes punctuation, numbers, and other non-words.

In [None]:
import nltk
import re
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Split the raw text into tokens
tokens = nltk.word_tokenize(text)

# Keep a token only when its lowercase form is not an English stop word and
# the token itself consists purely of letters (drops punctuation and numbers).
letters_only = re.compile('^[a-zA-Z]+$')
filtered_tokens = []
for candidate in tokens:
    if candidate.lower() in stop_words:
        continue
    if letters_only.match(candidate):
        filtered_tokens.append(candidate)


+ Frequency list to csv file 'frequency_table.csv'

In [None]:
# Count how often each filtered token occurs
fdist = FreqDist(filtered_tokens)

# most_common() yields (word, count) pairs from most to least frequent, so the
# saved table is sorted by frequency instead of by first occurrence in the text.
freq_df = pd.DataFrame(fdist.most_common(), columns=['Word', 'Frequency'])
freq_df.to_csv('frequency_table.csv', index=False)


+ Concordance with a keyword (user input)

In [None]:
# Strip surrounding whitespace so a stray space typed with the keyword does
# not prevent it from matching any token.
keyword = input("Enter the keyword: ").strip()

# Create a Text object
text_obj = Text(tokens)

# Display up to 20 concordance lines; width=80 is the width of each printed
# line in characters (keyword plus its left and right context).
text_obj.concordance(keyword, width=80, lines=20)


# [2] Frequency list with page number information

## [1] Installation

In [None]:
!pip install nltk PyPDF2

import nltk
import re
import PyPDF2
from nltk.corpus import stopwords
from google.colab import files
import pandas as pd

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')  # for POS tagging
nltk.download('words')  # New addition for checking English words
nltk.download('punkt')
from nltk.corpus import words  # New addition

In [None]:
# Upload PDF
uploaded = files.upload()

# Fail with a clear message (instead of an IndexError) when the upload was
# cancelled or no file was selected.
if not uploaded:
    raise ValueError("No file was uploaded. Run this cell again and choose a PDF.")

pdf_filename = next(iter(uploaded))  # name of the first uploaded file

## [2] Extract Text from PDF and Track Pages

Now we'll extract text from the PDF and track on which page(s) a token appears.

In [None]:
def extract_text_with_page_numbers(pdf_path, page_offset=11):
    """Extract the text of each content page of a PDF with its page number.

    Args:
        pdf_path: Path of the PDF file to read.
        page_offset: Number of leading pages to skip before the content
            begins. The page right after them is numbered 1. Defaults to 11,
            the value that was previously hard-coded.

    Returns:
        A list of (page_number, text) tuples in page order, where
        page_number is the position in the file minus page_offset. Pages
        whose adjusted number is not positive are left out. A page for which
        the reader returns no text is paired with an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        text_per_page = []
        # Count positions in the file from 1 so that, after subtracting the
        # offset, the first content page is numbered 1.
        for physical_page_num, page in enumerate(reader.pages, start=1):
            adjusted_page_num = physical_page_num - page_offset
            if adjusted_page_num > 0:  # only pages after the content begins
                text_per_page.append((adjusted_page_num, page.extract_text() or ""))

    return text_per_page

## [3] Process Text: Remove Stopwords, Punctuation, Short Tokens, and Proper Nouns

In [None]:
english_words = set(words.words())  # Create a set of English words

# process_text also needs stop_words. Defining it here lets this part of the
# notebook run without first executing the word-frequency cells above.
stop_words = set(stopwords.words('english'))

In [None]:
# NOTE: process_text is defined in a later cell of this notebook; on a fresh
# kernel, run that cell before this one.
text_with_pages = extract_text_with_page_numbers(pdf_filename)

all_tokens_with_pages = []
# The loop variable is called page_text so that the notebook-level `text`
# (the full document read from mytext.txt) is not overwritten.
for page_num, page_text in text_with_pages:
    all_tokens_with_pages.extend(process_text(page_text, page_num))

# Count every occurrence of each token and collect the pages it appears on
token_counts = {}
token_to_pages = {}
for token, page_num in all_tokens_with_pages:
    token_counts[token] = token_counts.get(token, 0) + 1
    token_to_pages.setdefault(token, set()).add(page_num)

# Convert to frequency list: (token, number of occurrences, "page,page,...").
# The frequency is the occurrence count, not the number of distinct pages.
freq_list = [(token, token_counts[token], ','.join(map(str, sorted(pages)))) for token, pages in token_to_pages.items()]

freq_list.sort(key=lambda x: x[1], reverse=True)  # sort by frequency


+ Remove proper nouns, nonwords, and words with fewer than 3 letters

In [None]:
def process_text(text, page_num):
    """Tokenize one page of text and keep only its content words.

    A token is kept when it was not tagged as a proper noun (NNP/NNPS) and
    its lowercase form is not in stop_words, consists only of ASCII letters,
    is longer than two characters, and is in english_words.

    Relies on the notebook-level names stop_words and english_words.

    Args:
        text: Text of a single page.
        page_num: Page number to attach to every kept token.

    Returns:
        A list of (lowercased_token, page_num) tuples, one per kept
        occurrence, in the order the tokens appear in the text.
    """
    tokens = nltk.word_tokenize(text)

    # Tag the tokens in their original case: lowercasing first would remove
    # the capitalisation the tagger can use to recognise proper nouns.
    tagged = nltk.pos_tag(tokens)

    filtered_tokens = []
    # pos_tag returns (token, tag) pairs, so unpack them directly; zipping the
    # tokens with the pairs would compare a whole tuple against the tag names.
    for token, tag in tagged:
        if tag in ('NNP', 'NNPS'):
            continue
        token_lower = token.lower()
        if (token_lower not in stop_words
                and re.match('^[a-zA-Z]+$', token_lower)
                and len(token_lower) > 2
                and token_lower in english_words):  # Check if it's an English word
            filtered_tokens.append(token_lower)

    return [(token, page_num) for token in filtered_tokens]


+ Write the list as csv 'freq.csv'

In [None]:
# Put the (token, frequency, pages) rows into a table and save it without the index column.
column_names = ["Token", "Frequency", "Page Numbers"]
df = pd.DataFrame(data=freq_list, columns=column_names)
df.to_csv(path_or_buf='freq.csv', index=False)

+ Download csv file

In [None]:
# Download freq.csv from the Colab folder to your computer
files.download('freq.csv')

# 3. Word list by character length

## [1] Display words with length condition

In [None]:
# Use the filtered tokens (alphabetic only, English stop words removed)
# from the word-frequency section as the word list.
wordlist = filtered_tokens
# Number of tokens in the word list (displayed as the cell output)
len(wordlist)

In [None]:
desired_length = int(input("Enter the desired token length: "))

# Search the filtered word list rather than the raw `tokens`, which also
# contain punctuation, numbers and stop words.
matching_tokens = [token for token in wordlist if len(token) == desired_length]

print(f"Tokens of length {desired_length}:")
print(matching_tokens)


## [2] Longest to shortest: word frequency list

In [None]:
text_with_pages = extract_text_with_page_numbers(pdf_filename)

all_tokens_with_pages = []
# The loop variable is called page_text so that the notebook-level `text`
# (the full document read from mytext.txt) is not overwritten.
for page_num, page_text in text_with_pages:
    all_tokens_with_pages.extend(process_text(page_text, page_num))

# Count every occurrence of each token and collect the pages it appears on
token_counts = {}
token_to_pages = {}
for token, page_num in all_tokens_with_pages:
    token_counts[token] = token_counts.get(token, 0) + 1
    token_to_pages.setdefault(token, set()).add(page_num)

# Convert to frequency list: (token, number of occurrences, sorted page list).
# The frequency is the occurrence count, not the number of distinct pages.
freq_list = [(token, token_counts[token], sorted(pages)) for token, pages in token_to_pages.items()]

# Sort by token length, longest first
freq_list.sort(key=lambda x: len(x[0]), reverse=True)


In [None]:
import csv
from google.colab import files

# Save the freq_list to a CSV file
with open('freq_list.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Write the header
    csv_writer.writerow(["Token", "Number of Letters", "Frequency", "Page Numbers"])

    # Write one row per token; the letter count is derived from the token and
    # the page numbers are joined into a single "1, 2, 3" style cell.
    csv_writer.writerows(
        [token, len(token), freq, ', '.join(map(str, page_nums))]
        for token, freq, page_nums in freq_list
    )

# The download is triggered by the next cell. Calling files.download() here as
# well made "Run all" download the same file twice.


In [None]:
# Download freq_list.csv (written by the previous cell) to your computer
files.download('freq_list.csv')
