In [3]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(pdf_path):
    pdf = PdfReader(pdf_path)
    text = ''
    for page in pdf.pages:
        text += page.extract_text()
    return text

In [49]:
import re

def process_pdf_text(text, existing_cefr_levels):
    lines = text.split('\n')
    start_index = next((i for i, line in enumerate(lines) if re.match(r'^[A-C][1-2]$', line)), 0)
    for line in lines[start_index:]:
        if re.match(r'^[A-C][1-2]$', line):
            current_level = line
        elif line:
            parts = line.split(' ', 1)
            if len(parts) == 2:
                word, pos = parts
                if word and pos and word.lower() not in existing_cefr_levels:
                    existing_cefr_levels[word.lower()] = (current_level, pos)

    # Convert dictionary to list, delete first 6 items, and convert back to dictionary
    existing_cefr_levels = dict(list(existing_cefr_levels.items())[6:])

    return existing_cefr_levels



In [56]:
pdf_paths = ['American_Oxford_3000_by_CEFR_level.pdf', 'American_Oxford_5000_by_CEFR_level.pdf', 'The_Oxford_3000_by_CEFR_level.pdf', 'The_Oxford_5000_by_CEFR_level.pdf']  # Replace with your actual PDF paths
cefr_levels = {}
for pdf_path in pdf_paths:
    pdf_text = extract_text_from_pdf(pdf_path)
    new_cefr_levels = process_pdf_text(pdf_text,{'A1':1, 'A2':2, 'B1':3, 'B2':4, 'C1':5, 'C2':6})
    cefr_levels.update(new_cefr_levels)  # Merge the new levels into the existing dictionary


In [58]:
for key, value in cefr_levels.items():
    print(f"{key}: {value}")

a,: ('A1', 'an indefinite article')
about: ('A1', 'prep. , adv.')
above: ('A1', 'prep. , adv.')
across: ('A1', 'prep. , adv.')
action: ('A1', 'n.')
activity: ('A1', 'n.')
actor: ('A1', 'n.')
actress: ('A1', 'n.')
add: ('A1', 'v.')
address: ('A1', 'n.')
adult: ('A1', 'n.')
advice: ('A1', 'n.')
afraid: ('A1', 'adj.')
after: ('A1', 'prep.')
afternoon: ('A1', 'n.')
again: ('A1', 'adv.')
age: ('A1', 'n.')
ago: ('A1', 'adv.')
agree: ('A1', 'v.')
air: ('A1', 'n.')
airport: ('A1', 'n.')
all: ('A1', 'det. , pron.')
also: ('A1', 'adv.')
always: ('A1', 'adv.')
amazing: ('A1', 'adj.')
and: ('A1', 'conj.')
angry: ('A1', 'adj.')
animal: ('A1', 'n.')
another: ('A1', 'det./pron.')
answer: ('A1', 'n. , v.')
any: ('A1', 'det. , pron.')
anyone: ('A1', 'pron.')
anything: ('A1', 'pron.')
apartment: ('A1', 'n.')
apple: ('A1', 'n.')
april: ('A1', 'n.')
area: ('A1', 'n.')
arm: ('C1', 'v.')
around: ('A1', 'prep. , adv.')
arrive: ('A1', 'v.')
art: ('A1', 'n.')
article: ('A1', 'n.')
artist: ('A1', 'n.')
as: ('A1

In [65]:
import spacy

# Load English tokenizer, POS tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("In my free time, I enjoy exploring nature and going for long walks in the nearby park. It's a tranquil place, with tall trees casting soothing shadows and a gentle breeze rustling through the leaves. During these walks, I often bring along my camera to capture the beauty of the surroundings. I find photography to be a fascinating hobby, as it allows me to preserve memorable moments and share them with friends and family. Nature has always been a source of inspiration for me, and I am particularly drawn to vibrant sunsets and the mesmerizing patterns formed by the clouds. The park also provides an excellent opportunity to meet new people and strike up interesting conversations. I have met many like-minded individuals who share my passion for nature and photography. These encounters have enriched my experiences and broadened my horizons. Overall, spending time in the park with my camera in hand is something I cherish, as it combines my love for nature, creativity, and social interactions.")
doc = nlp(text)

# Analyze syntax
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Verbs:", verbs)

# Named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

cefr_levels_for_text = [cefr_levels[word.lemma_.lower()] for word in doc if word.lemma_.lower() in cefr_levels]

Noun phrases: ['my free time', 'I', 'nature', 'long walks', 'the nearby park', 'It', 'a tranquil place', 'tall trees', 'soothing shadows', 'a gentle breeze', 'the leaves', 'these walks', 'I', 'my camera', 'the beauty', 'the surroundings', 'I', 'photography', 'a fascinating hobby', 'it', 'me', 'memorable moments', 'them', 'friends', 'family', 'Nature', 'a source', 'inspiration', 'me', 'I', 'vibrant sunsets', 'the mesmerizing patterns', 'the clouds', 'The park', 'an excellent opportunity', 'new people', 'interesting conversations', 'I', 'many like-minded individuals', 'who', 'my passion', 'nature', 'photography', 'These encounters', 'my experiences', 'my horizons', 'spending time', 'the park', 'my camera', 'hand', 'something', 'I', 'it', 'my love', 'nature', 'creativity', 'social interactions']
Verbs: ['explore', 'go', 'cast', 'soothe', 'rustle', 'bring', 'capture', 'find', 'allow', 'preserve', 'share', 'draw', 'mesmerize', 'form', 'provide', 'meet', 'strike', 'meet', 'share', 'enrich', 

In [70]:
cefr_levels_for_text

[('A1', 'prep. , adv.'),
 ('A1', 'det.'),
 ('A1', 'adj.'),
 ('A1', 'n.'),
 ('A1', 'pron.ice n.'),
 ('A1', 'v.'),
 ('B1', 'v.'),
 ('A2', 'n.'),
 ('A1', 'conj.'),
 ('A1', 'v.'),
 ('A1', 'prep.'),
 ('A1', '1 adj. , adv.'),
 ('A1', 'v. , n.'),
 ('A1', 'prep. , adv.'),
 ('A1', 'definite article'),
 ('B2', 'adj. , adv.'),
 ('A1', 'n. , v.'),
 ('A1', 'pron.'),
 ('A1', 'v. , auxiliary v.'),
 ('B1', 'v.'),
 ('A1', 'prep.'),
 ('A1', 'adj.'),
 ('A1', 'n.'),
 ('B2', 'n. , v.'),
 ('B2', 'n.'),
 ('A1', 'conj.'),
 ('B1', 'adj.'),
 ('A1', 'prep. , adv.'),
 ('A1', 'definite article'),
 ('A1', 'v.'),
 ('A1', 'prep.'),
 ('A1', 'v. , n.'),
 ('A1', 'pron.ice n.'),
 ('A1', 'adv.'),
 ('A1', 'v.'),
 ('A2', 'prep. , adv.'),
 ('A1', 'det.'),
 ('A1', 'n.'),
 ('A1', 'prep. , infinitive marker'),
 ('B2', 'v. , n.'),
 ('A1', 'definite article'),
 ('B1', 'n.'),
 ('A1', 'prep.'),
 ('A1', 'definite article'),
 ('B2', 'adj.'),
 ('A1', 'pron.ice n.'),
 ('A1', 'v.'),
 ('B1', 'n.'),
 ('A1', 'prep. , infinitive marker'),
 

In [74]:
from collections import Counter

def get_most_common(cefr_levels):
    # Extract the CEFR levels from the tuples
    cefr_levels_only = [level for level, pos in cefr_levels]

    print(cefr_levels_only)

    # Count the occurrences of each level
    data = Counter(cefr_levels_only)

    # Return the most common level
    return data.most_common(1)[0][0]


sentence_cefr_level = get_most_common(cefr_levels_for_text)

['A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'B1', 'A2', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'B2', 'A1', 'A1', 'A1', 'B1', 'A1', 'A1', 'A1', 'B2', 'B2', 'A1', 'B1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A1', 'B2', 'A1', 'B1', 'A1', 'A1', 'B2', 'A1', 'A1', 'B1', 'A1', 'A1', 'B1', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'B2', 'B2', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A1', 'A2', 'A1', 'C1', 'A1', 'A1', 'A1', 'A1', 'A1', 'B1', 'A1', 'A1', 'C1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A1', 'A2', 'A2', 'A2', 'A1', 'A1', 'A1', 'A1', 'A1', 'B2', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A2', 'A1', 'A1', 'A1', 'B1', 'A1', 'A2', 'A1', 'B1', 'B2', 'A1', 'C1', 'A1', 'A2', 'A1', 'A1', 'C1', 'B2', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'A1', 'B1', 'A1', 'A1', 'A1', 'A2', 'B2', 'A1', 'A2', 'B2']


In [67]:
from rake_nltk import Rake

# Uses stopwords for english from NLTK, and all puntuation characters.
r = Rake()

# Extraction given the text.
r.extract_keywords_from_text(text)

# To get keyword phrases ranked highest to lowest.
keywords = r.get_ranked_phrases()[:5]

In [75]:
sentence_cefr_level

'A1'

In [69]:
keywords

['tall trees casting soothing shadows',
 'preserve memorable moments',
 'often bring along',
 'met many like',
 'mesmerizing patterns formed']