In [2]:
!pip install beautifulsoup4 PyPDF2 nltk spacy
!python -m spacy download en_core_web_sm

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 7.9 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 90.1 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
import sys
import math
import bs4 as bs
import urllib.request
import re
import PyPDF2
import nltk
from nltk.stem import WordNetLemmatizer
import spacy

# Fetch the 'wordnet' resource through NLTK's downloader
# (presumably required by the WordNetLemmatizer created below — confirm).
nltk.download('wordnet')

# Shared NLP objects: `nlp` supplies the stop-word list and parses the input
# text; `lemmatizer` normalises words inside frequency_matrix.
nlp = spacy.load('en_core_web_sm')
lemmatizer = WordNetLemmatizer()

def file_text(filepath):
    """Read a UTF-8 text file and return its contents on a single line.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The file contents with every newline character replaced by a space.
    """
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    # Splitting on "\n" and re-joining with spaces swaps each newline for one space.
    return ' '.join(contents.split("\n"))

def pdf_reader(pdf_path):
    """Extract text from a PDF, optionally restricted to a page range.

    Prints the page count, then asks on stdin whether to read the whole
    document. Answering ``n`` (any case) prompts for a zero-based start page
    and an inclusive end page; any other answer selects every page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The extracted text of each selected page, every page followed by a
        single space.

    Raises:
        SystemExit: If a page number lies outside ``0 .. count - 1`` or the
            end page comes before the start page.
        ValueError: If a page number that is typed in is not an integer.
    """
    with open(pdf_path, 'rb') as pdf_file_object:
        # Named `reader` so the local does not shadow this function's own name.
        reader = PyPDF2.PdfReader(pdf_file_object)
        count = len(reader.pages)
        print("\nTotal Pages in pdf = ", count)

        c = input("Do you want to read the entire PDF? [Y]/N: ")
        if c.lower() == 'n':
            start_page = int(input("Enter start page number (Indexing starts from 0): "))
            end_page = int(input(f"Enter end page number (Less than {count}): "))

            if start_page < 0 or start_page >= count:
                print("\nInvalid Start page given")
                sys.exit()

            if end_page < 0 or end_page >= count:
                print("\nInvalid End page given")
                sys.exit()

            # A reversed range would otherwise yield an empty string with no
            # indication that nothing was read.
            if end_page < start_page:
                print("\nEnd page cannot be before Start page")
                sys.exit()
        else:
            start_page = 0
            end_page = count - 1

        # Pages must be read while the file is still open; collect then join
        # rather than growing a string page by page.
        page_texts = [
            reader.pages[i].extract_text()
            for i in range(start_page, end_page + 1)
        ]
        return "".join(page_text + " " for page_text in page_texts)

def url_text(url):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.

    Returns:
        The paragraph texts joined by single spaces, with bracketed numeric
        markers such as ``[12]`` (and empty ``[]``) removed.
    """
    # The context manager closes the HTTP response once the body is read;
    # previously the connection was left open.
    with urllib.request.urlopen(url) as scrap_data:
        article = scrap_data.read()

    # NOTE(review): the 'lxml' parser needs the lxml package to be installed.
    parsed_article = bs.BeautifulSoup(article, 'lxml')
    paragraphs = parsed_article.find_all('p')
    article_text = " ".join(p.text for p in paragraphs)
    return re.sub(r'\[[0-9]*\]', '', article_text)

# Ask how the text will be supplied, then load it into `text`.
try:
    input_text_type = int(input(
        "Select one way of inputting your text: "
        "\n1. Type your Text(or Copy-Paste)\n2. Load from .txt file"
        "\n3. Load from .pdf file\n4. From a URL\n\n"
    ))
except ValueError:
    # A non-numeric answer is simply another wrong choice: route it to the
    # "Wrong Input" branch below instead of failing with a traceback.
    input_text_type = None

if input_text_type == 1:
    text = input("Enter your text: \n\n")

elif input_text_type == 2:
    txt_path = input("Enter file path: ")
    text = file_text(txt_path)

elif input_text_type == 3:
    file_path = input("Enter file path: ")
    text = pdf_reader(file_path)

elif input_text_type == 4:
    wiki_url = input("Enter Wikipedia URL to load Article: ")
    text = url_text(wiki_url)

else:
    print("Sorry! Wrong Input, Try Again.")
    sys.exit()

import re

def parse_prescription(prescription_text):
    """Extract a medication -> dosage mapping from prescription text.

    The parser is line-oriented. A line of the form ``Label: value`` is a
    labelled field; any other line is free text.

    * A ``Medication`` label opens the medication section; text after the
      colon on that same line starts the medication name.
    * Free-text lines inside the section are collected as the name.
    * A ``Dosage`` label stores its value under the collected name and then
      resets the name, so the medications that follow are captured too.
    * Any other label that has a value (for example ``Refills: 2``) is ignored.
    * Any other label with no value (for example ``Instructions:``) closes
      the section.

    Because parsing works per line, text that has been flattened onto a
    single line produces no entries.

    Args:
        prescription_text: Either a plain string or an object exposing the
            text through a ``.text`` attribute.

    Returns:
        dict mapping each medication name to its dosage string, both stripped
        of surrounding whitespace. A dosage with no medication name collected
        for it is skipped rather than stored under an empty key.
    """
    # Accept plain strings as well as objects that carry their text in `.text`.
    raw_text = getattr(prescription_text, "text", prescription_text)

    # Letters/spaces followed by a colon at the start of a line mark a label.
    label_pattern = re.compile(r'^\s*([A-Za-z][A-Za-z ]*?)\s*:\s*(.*)$')

    prescription_dict = {}
    in_medication_section = False
    name_parts = []

    for line in raw_text.splitlines():
        match = label_pattern.match(line)

        if match is None:
            # Free text only counts as a medication name inside the section.
            if in_medication_section and line.strip():
                name_parts.append(line.strip())
            continue

        label = match.group(1).lower()
        value = match.group(2).strip()

        if label.startswith("medication"):
            in_medication_section = True
            name_parts = [value] if value else []
        elif label.startswith("dosage"):
            medication = " ".join(name_parts)
            if medication:
                prescription_dict[medication] = value
            # Stay in the section: the next free-text line names the next drug.
            name_parts = []
        elif not value:
            # A bare "Something:" header starts a different section.
            in_medication_section = False
            name_parts = []
        # Any other labelled field (e.g. "Refills: 2") is not part of a name.

    for med, dose in prescription_dict.items():
        print(f"Medication: {med.strip()} \nDosage: {dose.strip()}")

    return prescription_dict

def frequency_matrix(sentences):
    """Count word occurrences sentence by sentence.

    Every token whose text is alphanumeric is lower-cased, lemmatized with the
    module-level ``lemmatizer`` and counted unless the lemma appears in
    ``nlp.Defaults.stop_words``.

    Args:
        sentences: Iterable of sentence objects that iterate as tokens with a
            ``.text`` attribute and expose their own ``.text``.

    Returns:
        dict keyed by the first 15 characters of each sentence, mapping to a
        ``{lemma: count}`` dict. Sentences that share the same first 15
        characters share a key, so the later one replaces the earlier one.
    """
    ignored = nlp.Defaults.stop_words
    matrix = {}

    for sent in sentences:
        counts = {}
        for token in sent:
            if not token.text.isalnum():
                continue
            lemma = lemmatizer.lemmatize(token.text.lower())
            if lemma in ignored:
                continue
            counts[lemma] = counts.get(lemma, 0) + 1

        matrix[sent.text[:15]] = counts

    return matrix

def tf_matrix(freq_matrix):
    """Convert per-sentence word counts into term frequencies.

    Args:
        freq_matrix: dict mapping a sentence key to a ``{word: count}`` dict.

    Returns:
        dict with the same keys, each mapping to ``{word: count / total}``
        where ``total`` is the sum of the counts in that sentence's table.
    """
    result = {}
    for key, counts in freq_matrix.items():
        denominator = sum(counts.values())
        result[key] = {term: n / denominator for term, n in counts.items()}
    return result

def sentences_per_words(freq_matrix):
    """Count, for every word, how many sentence tables contain it.

    Args:
        freq_matrix: dict mapping a sentence key to a ``{word: count}`` dict.

    Returns:
        dict mapping each word to the number of tables it appears in.
    """
    document_counts = {}
    for table in freq_matrix.values():
        for term in table:
            if term in document_counts:
                document_counts[term] += 1
            else:
                document_counts[term] = 1
    return document_counts

def idf_matrix(freq_matrix, sent_per_words, total_sentences):
    """Compute the inverse document frequency of every word in every sentence.

    Args:
        freq_matrix: dict mapping a sentence key to a ``{word: count}`` dict.
        sent_per_words: dict mapping each word to the number of sentences
            that contain it.
        total_sentences: Total number of sentences in the document.

    Returns:
        dict with the same keys as ``freq_matrix``, each mapping to
        ``{word: log10(total_sentences / sent_per_words[word])}``.
    """
    return {
        key: {
            term: math.log10(total_sentences / float(sent_per_words[term]))
            for term in counts
        }
        for key, counts in freq_matrix.items()
    }

def tf_idf_matrix(tf_matrix, idf_matrix):
    """Multiply term frequencies by inverse document frequencies per sentence.

    The two inputs are matched by sentence key, so the result does not depend
    on the two dicts being in the same order.

    Args:
        tf_matrix: dict mapping a sentence key to ``{word: tf}``.
        idf_matrix: dict mapping a sentence key to ``{word: idf}``.

    Returns:
        dict keyed like ``tf_matrix``, each value mapping
        ``word -> float(tf * idf)``.

    Raises:
        KeyError: If ``idf_matrix`` lacks a sentence key or a word that is
            present in ``tf_matrix``.
    """
    combined = {}

    for sent, tf_table in tf_matrix.items():
        # Look the IDF table up by key instead of relying on both dicts
        # iterating in the same order.
        idf_table = idf_matrix[sent]
        combined[sent] = {
            word: float(tf_value * idf_table[word])
            for word, tf_value in tf_table.items()
        }

    return combined

def score_sentences(tf_idf_matrix):
    """Score each sentence by the mean TF-IDF value of its words.

    Args:
        tf_idf_matrix: dict mapping a sentence key to ``{word: tf_idf}``.

    Returns:
        dict mapping each sentence key to the average of its word scores.
        Sentences whose table is empty are left out.
    """
    scores = {}
    for key, weights in tf_idf_matrix.items():
        n_terms = len(weights)
        if n_terms == 0:
            continue
        scores[key] = sum(weights.values()) / n_terms
    return scores

def average_score(sentence_score):
    """Return the mean of all sentence scores, or 0 when there are none.

    Args:
        sentence_score: dict mapping a sentence key to its score.
    """
    if not sentence_score:
        return 0
    return sum(sentence_score.values()) / len(sentence_score)

def create_summary(sentences, sentence_score, threshold):
    """Concatenate the sentences whose score reaches the threshold.

    Args:
        sentences: Iterable of sentence objects exposing ``.text``.
        sentence_score: dict keyed by the first 15 characters of a sentence.
        threshold: Minimum score a sentence needs to be included.

    Returns:
        The selected sentences in their original order, each preceded by a
        single space; an empty string when nothing qualifies.
    """
    selected = []
    for sentence in sentences:
        key = sentence.text[:15]
        if key not in sentence_score:
            continue
        if sentence_score[key] >= threshold:
            selected.append(" " + sentence.text)
    return "".join(selected)

# Count the whitespace-separated tokens of the raw input that are purely
# alphanumeric.
# NOTE(review): `num_words_in_original_text` is not read anywhere in the
# visible code.
original_words = text.split()
original_words = [w for w in original_words if w.isalnum()]
num_words_in_original_text = len(original_words)

# Parse the input with spaCy. From here on `text` is the object returned by
# `nlp`, no longer the raw string.
text = nlp(text)

sentences = list(text.sents)
total_sentences = len(sentences)

freq_matrix = frequency_matrix(sentences)

# NOTE(review): the next three stages rebind `tf_matrix`, `idf_matrix` and
# `tf_idf_matrix` from the functions to their results, so those functions
# cannot be called again until their definitions are re-executed.
tf_matrix = tf_matrix(freq_matrix)

num_sent_per_words = sentences_per_words(freq_matrix)

idf_matrix = idf_matrix(freq_matrix, num_sent_per_words, total_sentences)

tf_idf_matrix = tf_idf_matrix(tf_matrix, idf_matrix)

sentence_scores = score_sentences(tf_idf_matrix)

# The average sentence score is the baseline for the summary cut-off.
threshold = average_score(sentence_scores)

# Keep only sentences scoring at least 1.3 times the average.
summary = create_summary(sentences, sentence_scores, 1.3 * threshold)

# parse_prescription receives the parsed object (it reads `.text` itself) and
# prints every medication/dosage pair it finds.
result = parse_prescription(text)

print("\n\n")
print("*" * 20, "Summary", "*" * 20)
print("\n")
print(summary)
print("\n\n")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Select one way of inputting your text: 
1. Type your Text(or Copy-Paste)
2. Load from .txt file
3. Load from .pdf file
4. From a URL

1
Enter your text: 

Patient Name: John Doe Date: October 1, 2024 Prescriber: Dr. Jane Smith, MD NPI Number: 1234567890  Medication:  Lisinopril 10 mg  Dosage: Take one tablet by mouth once daily. Refills: 2 Metformin 500 mg  Dosage: Take one tablet by mouth twice daily with meals. Refills: 3 Atorvastatin 20 mg  Dosage: Take one tablet by mouth every evening. Refills: 1 Instructions:  Monitor blood pressure regularly. Check blood sugar levels as instructed. Follow up in 3 months for medication review. Additional Notes:  Avoid high potassium foods while on Lisinopril. Report any unusual side effects or symptoms to the doctor immediately.



******************** Summary ********************


 Check blood sugar levels as instructed. Follow up in 3 months for medication review. Report any unusual side effects or symptoms to the doctor immediately.



