Python function to read a PDF and return its text content:

In [None]:
import fitz  # PyMuPDF library

def extract_text_from_pdf(pdf_path):
    """Read a PDF file and return all text content as a single string."""
    text = ""
    with fitz.open(pdf_path) as doc:               # Open the PDF
        for page in doc:                           # Iterate through pages
            text += page.get_text("text")          # Extract text from each page
    return text

# Example usage:
pdf_path = r"C:\Users\aurakcyber4\job-skills-dashboard\docs\B.A. Degree in Mass Communication with Concentration in Digital Media.pdf"          # Path to the university catalog PDF
catalog_text = extract_text_from_pdf(pdf_path)
print(catalog_text[:500])  # Print first 500 characters for a peek



76 | P a g e  
 
 Catalog 2024-2025 
SCHOOL OF ARTS AND SCIENCES 
Bachelor of Arts in Mass Communication 
The Bachelor of Arts degree in Mass Communication provides students with a high-quality education in line 
with emerging market trends in the media industry within the UAE, the Middle East, and the world. Rapid 
advancements in the field of communication and new media have created demand for qualified 
professionals and leaders who possess the knowledge to address global issues and, by so do


load spaCy and initialize the matcher:

In [20]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")               # load English language model
matcher = PhraseMatcher(nlp.vocab)               # initialize the PhraseMatcher


list of skill terms (phrases)

In [None]:
# Example skill terms (in practice, compile this list from your domain or text analysis)
skill_terms = [
    "communication theory", 
    "critical thinking", 
    "analytical abilities",
    "communication technology",
    "research methodologies",
    "multimedia practices"
]
# Create pattern Doc objects for each term and add to the matcher
patterns = [nlp.make_doc(term) for term in skill_terms]    # create Doc for each phrase
matcher.add("SKILL_TERMS", patterns)


use the matcher on the catalog text:

In [22]:
doc = nlp(catalog_text)                      # process the entire catalog text with spaCy
matches = matcher(doc)                       # find all occurrences of the skill terms
extracted_skills = set()                     # use a set to avoid duplicates
for match_id, start, end in matches:
    span = doc[start:end]                    # the matched span of text
    extracted_skills.add(span.text.lower())  # collect the skill phrase (normalized to lowercase)
print("Skills found:", extracted_skills)


Skills found: {'digital media', 'research methodologies', 'critical thinking'}


RAKE

In [15]:
from rake_nltk import Rake

# 1. Prepare RAKE as usual (stopwords already downloaded)
rake_extractor = Rake()

# 2. Split your catalog text into simple sentences yourself
#    Here we split on periods — you can also split on '\n' or use a regex.
sentences = [s.strip() for s in catalog_text.split('.') if s]

# 3. Give RAKE those sentences directly
rake_extractor.extract_keywords_from_sentences(sentences)

# 4. Pull out the top phrases
key_phrases = rake_extractor.get_ranked_phrases()
print("Top RAKE phrases:", key_phrases[:10])



Top RAKE phrases: ['media communication 3 comm 334 broadcast journalism 3 comm 323 news reporting 3 mass communication elective 3 gen ed natural science course 3 free elective 3 mass communication elective 3 free elective 3 subtotal', 'digital culture 3 comm 224 visual storytelling 3 comm 323 news reporting 3 comm 334 broadcast journalism 3 comm 423 interactive multimedia 3 80', 'behavioral science course 3 gen ed course mathematics course 3 gen ed course natural science course 3 program core course requirements 35 credits course', 'credits comm 213 public relations writing 3 comm 334 broadcast journalism 3 comm 337 public relations cases 3 comm 344 public relations', 'digital media 3 comm 212 media writing 3 comm 215 feature writing 3 comm 222 intercultural mass communication 3 comm 223 globalization', 'description cr com 212 media writing 3 comm 215 feature writing 3 comm 222 intercultural mass communication 3 comm 223 globalization', '3 free elective 3 mass communication elective 3 

In [16]:
import re

def is_valid_phrase(phrase):
    # no digits
    if re.search(r'\d', phrase):
        return False
    # no all-caps codes (like COMM, UAES)
    if re.search(r'\b[A-Z]{2,}\b', phrase):
        return False
    # reasonable length: 1–4 words
    word_count = len(phrase.split())
    if word_count < 1 or word_count > 4:
        return False
    return True

cleaned = [p for p in key_phrases if is_valid_phrase(p)]
print("Filtered RAKE candidates:", cleaned[:20])


Filtered RAKE candidates: ['elective course', 'fifth writing intensive course', 'different mass communication fields', 'summer semester course', 'free electives', 'description cr course', 'description cr course', 'description cr course', 'description cr course', 'writing intensive course', 'mass communication provides students', 'mass communications degree requires', 'mass communication offers concentrations', 'native arabic learners course', 'native arabic learners course', 'mass communication program', 'news', 'program must meet requirements', 'concentration course requirements', 'general education component requirements']


In [17]:
import re

# Your RAKE candidates from the first filter
candidates = [
  'elective course',
  'fifth writing intensive course',
  'different mass communication fields',
  'summer semester course',
  'free electives',
  'description cr course',
  'writing intensive course',
  'mass communication provides students',
  'mass communications degree requires',
  'mass communication offers concentrations',
  'native arabic learners course',
  'mass communication program',
  'news',
  'program must meet requirements',
  'concentration course requirements',
  'general education component requirements'
]

# Define blacklist terms
blacklist = [
    'course', 'program', 'require', 'elective', 'description',
    'component', 'concentration', 'summer', 'free', 'degree', 'offers', 'provides'
]

def is_skill_candidate(phrase):
    # reject if any blacklist term is present
    if any(term in phrase for term in blacklist):
        return False
    # keep very short core words like "news"
    return True

filtered = [p for p in candidates if is_skill_candidate(p)]
print("After blacklist filter:", filtered)


After blacklist filter: ['different mass communication fields', 'news']


In [27]:
skill_terms = [
    "news reporting",
    "broadcast journalism",
    "public relations writing",
    "multimedia storytelling",
    "visual storytelling",
    "digital media",
    "interactive multimedia",
    "media ethics",
    "critical thinking",
    "research methodologies"
]


In [25]:
import re
# replace all whitespace (including newlines) with a single space
catalog_text = re.sub(r'\s+', ' ', catalog_text)


In [30]:
from spacy.matcher import PhraseMatcher
import spacy

nlp = spacy.load("en_core_web_sm")

# Create a matcher that compares the LOWER attribute of tokens
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")


# Build lowercase patterns
patterns = [nlp.make_doc(term) for term in skill_terms]
matcher.add("COURSE_SKILLS", patterns)

# Process cleaned text
doc = nlp(catalog_text)

# Extract matches
found_skills = {doc[start:end].text.lower() for _id, start, end in matcher(doc)}
print("Final extracted skills:", found_skills)


Final extracted skills: {'interactive multimedia', 'critical thinking', 'multimedia storytelling', 'visual storytelling', 'public relations writing', 'broadcast journalism', 'research methodologies', 'news reporting', 'digital media'}


In [29]:
"news reporting" in catalog_text.lower()  # should be True if it’s there

True