In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv("Tidied_University_Data_Science_Programs_Dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38 entries, 0 to 37
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   University Name               38 non-null     object
 1   Program Title                 38 non-null     object
 2   Program URL                   38 non-null     object
 3   Program Overview              38 non-null     object
 4   Total Credit Hours/Units      38 non-null     object
 5   Duration                      38 non-null     object
 6   Delivery Format               38 non-null     object
 7   Core Courses                  38 non-null     object
 8   Specializations/Tracks        38 non-null     object
 9   Elective Courses              38 non-null     object
 10  Capstone/Project Requirement  38 non-null     object
 11  Internship/Practicum          38 non-null     object
 12  Program Learning Outcomes     38 non-null     object
 13  Skills and Tools Empha

In [3]:
unique_delivery = df['Delivery Format'].unique()
print(unique_delivery)

['On-campus' 'Online/Hybrid' 'Not specified.'
 'On-campus in Leuven or online/blended format' 'Online'
 'On-campus, with study opportunities at partner institutions'
 'On-campus with face-to-face lectures, seminars, and tutorials'
 'Blended (face-to-face classes two days a month on Hatfield Campus)'
 'Hybrid']


In [4]:
mapping = {
    "On-campus, with study opportunities at partner institutions": "On-campus",
    "On-campus with face-to-face lectures, seminars, and tutorials": "On-campus",
    "Blended (face-to-face classes two days a month on Hatfield Campus)": "Hybrid",
    "On-campus in Leuven or online/blended format": "On-campus/Hybrid"
}

# Normalize the "Delivery Format" column in one step
df["Delivery Format"] = df["Delivery Format"].replace(mapping)

In [5]:
unique_delivery = df['Delivery Format'].unique()
print(unique_delivery)

['On-campus' 'Online/Hybrid' 'Not specified.' 'On-campus/Hybrid' 'Online'
 'Hybrid']


In [6]:
unique_tracks = df['Specializations/Tracks'].unique()
print(unique_tracks)

['Tailored electives in Data Mining, Machine Learning, Artificial Intelligence'
 'Flexibility through electives, allowing tailored studies in data science'
 'Elective options tailored to interests in computer science, mathematics, and IT'
 'Focus on computational and statistical methods for managing and analyzing large datasets.'
 'Focus on applied data science with real-world industry projects.'
 'Flexible research areas in Data Science and Artificial Intelligence'
 'Specialization in Data Science integrating statistical methods with data applications'
 'Data Analytics, Data Engineering, Data Architecture.'
 'Tracks in Biometrics, Social Sciences, Business, Industry, Theoretical Stats, Interdisciplinary, EMOS'
 'Focuses on business analytics and data-driven solutions.'
 'Specialization through elective courses in data science topics.'
 'Specializations in Data Science and other information studies domains.'
 'Focused on business analytics and data science for real-world business probl

In [8]:
%pip install ace_tools

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Define the updated mapping dictionary
tracks_mapping = {
    "Tailored electives in Data Mining, Machine Learning, Artificial Intelligence": "Data Mining, Machine Learning, AI",
    "Flexibility through electives, allowing tailored studies in data science": "Data Science",
    "Elective options tailored to interests in computer science, mathematics, and IT": "CS, Math, IT Tracks",
    "Focus on computational and statistical methods for managing and analyzing large datasets.": "Computational methods, Statistical Methods",
    "Focus on applied data science with real-world industry projects.": "Data Science",
    "Flexible research areas in Data Science and Artificial Intelligence": "Data Science, AI",
    "Specialization in Data Science integrating statistical methods with data applications": "Statistical Methods, Data Science",
    "Data Analytics, Data Engineering, Data Architecture.": "Data Analytics, Engineering, Architecture",
    "Tracks in Biometrics, Social Sciences, Business, Industry, Theoretical Stats, Interdisciplinary, EMOS": "Biometrics, Social Sciences, Business Analytics",
    "Focuses on business analytics and data-driven solutions.": "Business Analytics",
    "Specialization through elective courses in data science topics.": "Data Science",
    "Specializations in Data Science and other information studies domains.": "Data Science, Information Studies",
    "Focused on business analytics and data science for real-world business problems.": "Business Analytics",
    "Not applicable; no formal tracks.": "No Formal Tracks",
    "Interdisciplinary approach integrating economics, computer science, and data science.": "Economics, Computer Science, Data Science",
    "Specializations in advanced data science topics": "Advanced Data Science",
    "Specialization in Business Analytics, focusing on finance and data integration": "Business Analytics, Finance",
    "Focus on statistical theory and its applications in data science.": "Statistical Theory, Data Science",
    "Focus on applying data science methods to education research and practice.": "Education, Data Science",
    "Focus on quantitative and computational methods in biomedical research.": "Biomedical Research, Computational Methods",
    "Specializations in Natural Language Processing, Computer Vision, Reinforcement Learning": "NLP, Computer Vision, Reinforcement Learning",
    "Flexible tracks in Data Engineering and Analytics, tailored to specific student interests": "Data Engineering, Analytics",
    "Research areas include AI, Data Science, IoT, Robotics, Photonics, Logistics": "AI, IoT, Robotics, Logistics",
    "Flexible through elective courses tailored to specific interests in data science": "Flexible Electives, Data Science",
    "Focuses on industry applications of data science.": "Industry Applications, Data Science",
    "No formal specialization, but coursework and projects can focus on data science topics.": "No Formal Specialization",
    "Tailored towards industry applications.": "Industry Applications",
    "Focus on digital transformation and industry-specific applications.": "Digital Transformation, Industry Applications",
    "Focus on modeling complex systems and statistical methodologies.": "Complex Systems, Statistical Methods",
}

# Apply the mapping to the 'Specializations/Tracks' column
df["Specializations/Tracks"] = df["Specializations/Tracks"].replace(tracks_mapping)

# Display the cleaned DataFrame to the user

# Print unique values to verify
print(df["Specializations/Tracks"].unique())


['Data Mining, Machine Learning, AI' 'Data Science' 'CS, Math, IT Tracks'
 'Computational methods, Statistical Methods' 'Data Science, AI'
 'Statistical Methods, Data Science'
 'Data Analytics, Engineering, Architecture'
 'Biometrics, Social Sciences, Business Analytics' 'Business Analytics'
 'Data Science, Information Studies' 'No Formal Tracks'
 'Economics, Computer Science, Data Science' 'Advanced Data Science'
 'Business Analytics, Finance' 'Statistical Theory, Data Science'
 'Education, Data Science' 'Biomedical Research, Computational Methods'
 'NLP, Computer Vision, Reinforcement Learning'
 'Data Engineering, Analytics' 'AI, IoT, Robotics, Logistics'
 'Flexible Electives, Data Science' 'Industry Applications, Data Science'
 'Covers digital transformation, AI, and data management.'
 'Astronomy, Particle Physics, Bioinformatics, Finance, Economics'
 'Flexibility through elective courses in entrepreneurship, visualization, and data science'
 'Flexible through elective subjects like

In [11]:
tracks_mapping = {
     'Flexibility through elective courses in entrepreneurship, visualization, and data science': 'Entrepreneurship, Data Visualization, Data Science',
 'Flexible through elective subjects like machine learning, data visualization, and data engineering.': 'Machine Learning, Data Visualization, Data Engineering',
 'Flexible through elective subjects in areas like machine learning, data visualization, and data engineering.': 'Machine Learning, Data Visualization, Data Engineering',
 'Specialization clusters, including Public Health Data Science': 'Public Health, Data Science',
 'Option modules like Applied ML, NLP for Social Sciences, Fairness in ML, Social Network Analysis':'Machine Learning, NLP, Social Network Analysis',
 'Multidisciplinary, engaging with Computer Science, Mathematics, Statistics, etc.':'Computer Science, Mathematics, Statistics',
 'Specializations in Computer Science, Mathematical Informatics, Information Physics, Communication Engineering':'Computer Science, Mathematics, Information Physics, Communication Engineering',
}

# Apply the mapping to the 'Specializations/Tracks' column
df["Specializations/Tracks"] = df["Specializations/Tracks"].replace(tracks_mapping)

# Display the cleaned DataFrame to the user

# Print unique values to verify
print(df["Specializations/Tracks"].unique())

['Data Mining, Machine Learning, AI' 'Data Science' 'CS, Math, IT Tracks'
 'Computational methods, Statistical Methods' 'Data Science, AI'
 'Statistical Methods, Data Science'
 'Data Analytics, Engineering, Architecture'
 'Biometrics, Social Sciences, Business Analytics' 'Business Analytics'
 'Data Science, Information Studies' 'No Formal Tracks'
 'Economics, Computer Science, Data Science' 'Advanced Data Science'
 'Business Analytics, Finance' 'Statistical Theory, Data Science'
 'Education, Data Science' 'Biomedical Research, Computational Methods'
 'NLP, Computer Vision, Reinforcement Learning'
 'Data Engineering, Analytics' 'AI, IoT, Robotics, Logistics'
 'Flexible Electives, Data Science' 'Industry Applications, Data Science'
 'Covers digital transformation, AI, and data management.'
 'Astronomy, Particle Physics, Bioinformatics, Finance, Economics'
 'Entrepreneurship, Data Visualization, Data Science'
 'Machine Learning, Data Visualization, Data Engineering'
 'Public Health, Data 

In [12]:
# Simplified mapping dictionary
simplified_tracks_mapping = {
    "Data Mining, Machine Learning, AI": "AI and Machine Learning",
    "Data Science": "Data Science",
    "CS, Math, IT Tracks": "Interdisciplinary Tracks",
    "Computational methods, Statistical Methods": "Data Science",
    "Data Science, AI": "AI and Machine Learning",
    "Statistical Methods, Data Science": "Data Science",
    "Data Analytics, Engineering, Architecture": "Data Engineering",
    "Biometrics, Social Sciences, Business Analytics": "Business Analytics",
    "Business Analytics": "Business Analytics",
    "Data Science, Information Studies": "Data Science",
    "No Formal Tracks": "No Specialization",
    "Economics, Computer Science, Data Science": "Interdisciplinary Tracks",
    "Advanced Data Science": "Data Science",
    "Business Analytics, Finance": "Business Analytics",
    "Statistical Theory, Data Science": "Data Science",
    "Education, Data Science": "Education and Data Science",
    "Biomedical Research, Computational Methods": "Health and Biomedical Research",
    "NLP, Computer Vision, Reinforcement Learning": "AI and Machine Learning",
    "Data Engineering, Analytics": "Data Engineering",
    "AI, IoT, Robotics, Logistics": "AI and Robotics",
    "Flexible Electives, Data Science": "Data Science",
    "Industry Applications, Data Science": "Industry Applications",
    "Covers digital transformation, AI, and data management.": "Digital Transformation",
    "Astronomy, Particle Physics, Bioinformatics, Finance, Economics": "Research Focus",
    "Entrepreneurship, Data Visualization, Data Science": "Data Science",
    "Machine Learning, Data Visualization, Data Engineering": "AI and Machine Learning",
    "Public Health, Data Science": "Health and Biomedical Research",
    "Machine Learning, NLP, Social Network Analysis": "AI and Machine Learning",
    "Computer Science, Mathematics, Statistics": "Interdisciplinary Tracks",
    "Computer Science, Mathematics, Information Physics, Communication Engineering": "Interdisciplinary Tracks",
    "No Formal Specialization": "No Specialization",
    "Industry Applications": "Industry Applications",
    "Digital Transformation, Industry Applications": "Digital Transformation",
    "Complex Systems, Statistical Methods": "Research Focus",
}

# Apply the simplified mapping
df["Specializations/Tracks"] = df["Specializations/Tracks"].replace(simplified_tracks_mapping)

# Display the cleaned DataFrame to the user

# Print unique values to verify the result
print(df["Specializations/Tracks"].unique())


['AI and Machine Learning' 'Data Science' 'Interdisciplinary Tracks'
 'Data Engineering' 'Business Analytics' 'No Specialization'
 'Education and Data Science' 'Health and Biomedical Research'
 'AI and Robotics' 'Industry Applications' 'Digital Transformation'
 'Research Focus']


Core Courses 

In [14]:
unique_courses = df['Core Courses'].unique()
print(unique_courses)

['Algorithms and Complexity Theory (CSCE 525/5221) – 3 credits, Distributed Systems (CSCE 545/5241) – 3 credits, Advanced Artificial Intelligence (CSCE 565/5261) – 3 credits'
 'Algorithms II, Machine Learning, Applied Data Analysis, Statistics for Data Science, Systems for Data Management'
 'Data Analysis, Data Management, Machine Learning, Statistical Data Analysis'
 'Statistical Modeling, Machine Learning, Optimization, Data Management'
 'CSCI 101 Foundations of Data Science and Engineering, Core Data Science Courses, Data Science Electives'
 'Probability and Statistics, Computing, Optimization, AI/ML Core, Linear Algebra'
 'Applied Statistics, Big Data, Computational Statistics, Machine Learning, Deep Learning with TensorFlow'
 'Big Data Analytics, Data Programming, Research Methods in Data Science.'
 'Fundamental Concepts of Statistics, Linear Models, Multivariate Statistics, Statistical Consulting'
 'Quantitative Methods, Technology Automation, Business Application'
 'Theory of Co

In [15]:
import re

# Function to clean course descriptions
def clean_course_description(course):
    # Remove course codes and credits using regex
    cleaned_course = re.sub(r"\(.*?\)", "", course)  # Remove anything in parentheses
    cleaned_course = re.sub(r"– \d+ credits", "", cleaned_course)  # Remove "– X credits"
    return cleaned_course.strip()


# Apply the function to the 'Core Courses' column
df["Core Courses"] = df["Core Courses"].apply(clean_course_description)


# Print unique cleaned values for verification
print(df["Core Courses"].unique())


['Algorithms and Complexity Theory  , Distributed Systems  , Advanced Artificial Intelligence'
 'Algorithms II, Machine Learning, Applied Data Analysis, Statistics for Data Science, Systems for Data Management'
 'Data Analysis, Data Management, Machine Learning, Statistical Data Analysis'
 'Statistical Modeling, Machine Learning, Optimization, Data Management'
 'CSCI 101 Foundations of Data Science and Engineering, Core Data Science Courses, Data Science Electives'
 'Probability and Statistics, Computing, Optimization, AI/ML Core, Linear Algebra'
 'Applied Statistics, Big Data, Computational Statistics, Machine Learning, Deep Learning with TensorFlow'
 'Big Data Analytics, Data Programming, Research Methods in Data Science.'
 'Fundamental Concepts of Statistics, Linear Models, Multivariate Statistics, Statistical Consulting'
 'Quantitative Methods, Technology Automation, Business Application'
 'Theory of Computation, Algorithms, Machine Learning, Data Mining'
 'Data Science Programming

In [None]:
data['Core Courses'] = data['Core Courses'].str.replace('CSCI 101 , ', '')
    data['Core Courses'] = data['Core Courses'].str.replace('2.0 graduate FCEs in computer science then change, satisfying breadth requirements.', 'School-specific course')
    return data


In [19]:
import re

# Function to clean course descriptions
def clean_course_description(course):
    # Remove specific phrases and patterns
    course = re.sub(r"2.0 graduate FCEs in computer science.*?breadth requirements\.", "School Specific Course", course)
    course = re.sub(r"\bCSCI\s\d+\b", "", course)
    course = re.sub(r"\(.*?\)", "", course)  # Remove text within parentheses
    course = re.sub(r"– \d+ credits", "", course)  # Remove "– X credits"
    course = re.sub(r"\s+", " ", course)  # Replace multiple spaces with a single space
    return course.strip()  # Remove leading/trailing whitespace

# Apply the function to the 'Core Courses' column
df["Core Courses"] = df["Core Courses"].apply(clean_course_description)

# Display the cleaned DataFrame to the user

# Print unique cleaned values for verification
print(df["Core Courses"].unique())


['Algorithms and Complexity Theory , Distributed Systems , Advanced Artificial Intelligence'
 'Algorithms II, Machine Learning, Applied Data Analysis, Statistics for Data Science, Systems for Data Management'
 'Data Analysis, Data Management, Machine Learning, Statistical Data Analysis'
 'Statistical Modeling, Machine Learning, Optimization, Data Management'
 'Foundations of Data Science and Engineering, Core Data Science Courses, Data Science Electives'
 'Probability and Statistics, Computing, Optimization, AI/ML Core, Linear Algebra'
 'Applied Statistics, Big Data, Computational Statistics, Machine Learning, Deep Learning with TensorFlow'
 'Big Data Analytics, Data Programming, Research Methods in Data Science.'
 'Fundamental Concepts of Statistics, Linear Models, Multivariate Statistics, Statistical Consulting'
 'Quantitative Methods, Technology Automation, Business Application'
 'Theory of Computation, Algorithms, Machine Learning, Data Mining'
 'Data Science Programming, Database 

In [21]:
from collections import Counter

# Example list of lists
core_courses = df["Core Courses"].unique()

# Step 1: Tokenize and clean the data
# Split each string into individual items and clean up whitespace
tokens = []
for course in core_courses:
    tokens.extend([token.strip() for token in course.split(",")])  # Split by comma and strip whitespace

# Step 2: Count occurrences
token_counts = Counter(tokens)

# Step 3: Find the most common items
most_common_items = token_counts.most_common()

# Output the most common items
for item, count in most_common_items:
    print(f"{item}: {count}")


Machine Learning: 13
Data Mining: 5
Data Analysis: 4
Data Management: 3
Statistics for Data Science: 2
Statistical Data Analysis: 2
Optimization: 2
Statistical Inference: 2
Regression Models: 2
Data Visualization: 2
AI: 2
Applied Machine Learning: 2
Big Data Technologies: 2
Statistical Methods: 2
Digital Transformation: 2
Eco-Innovation: 2
Algorithms and Complexity Theory: 1
Distributed Systems: 1
Advanced Artificial Intelligence: 1
Algorithms II: 1
Applied Data Analysis: 1
Systems for Data Management: 1
Statistical Modeling: 1
Foundations of Data Science and Engineering: 1
Core Data Science Courses: 1
Data Science Electives: 1
Probability and Statistics: 1
Computing: 1
AI/ML Core: 1
Linear Algebra: 1
Applied Statistics: 1
Big Data: 1
Computational Statistics: 1
Deep Learning with TensorFlow: 1
Big Data Analytics: 1
Data Programming: 1
Research Methods in Data Science.: 1
Fundamental Concepts of Statistics: 1
Linear Models: 1
Multivariate Statistics: 1
Statistical Consulting: 1
Quantit

In [22]:
# Count occurrences
token_counts = Counter(tokens)

# Output the unique items and their counts
print(f"Number of Unique Items: {len(token_counts)}")
print("Counts of each unique item:", dict(token_counts))


Number of Unique Items: 120
Counts of each unique item: {'Algorithms and Complexity Theory': 1, 'Distributed Systems': 1, 'Advanced Artificial Intelligence': 1, 'Algorithms II': 1, 'Machine Learning': 13, 'Applied Data Analysis': 1, 'Statistics for Data Science': 2, 'Systems for Data Management': 1, 'Data Analysis': 4, 'Data Management': 3, 'Statistical Data Analysis': 2, 'Statistical Modeling': 1, 'Optimization': 2, 'Foundations of Data Science and Engineering': 1, 'Core Data Science Courses': 1, 'Data Science Electives': 1, 'Probability and Statistics': 1, 'Computing': 1, 'AI/ML Core': 1, 'Linear Algebra': 1, 'Applied Statistics': 1, 'Big Data': 1, 'Computational Statistics': 1, 'Deep Learning with TensorFlow': 1, 'Big Data Analytics': 1, 'Data Programming': 1, 'Research Methods in Data Science.': 1, 'Fundamental Concepts of Statistics': 1, 'Linear Models': 1, 'Multivariate Statistics': 1, 'Statistical Consulting': 1, 'Quantitative Methods': 1, 'Technology Automation': 1, 'Business A

In [23]:
%pip install spacy

Collecting spacy
  Downloading spacy-3.8.2.tar.gz (1.3 MB)
     ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
     -------- ------------------------------- 0.3/1.3 MB ? eta -:--:--
     ---------------- ----------------------- 0.5/1.3 MB 1.7 MB/s eta 0:00:01
     -------------------------------- ------- 1.0/1.3 MB 1.9 MB/s eta 0:00:01
     ---------------------------------------- 1.3/1.3 MB 1.8 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [113 lines of output]
      Ignoring numpy: markers 'python_version < "3.9"' don't match your environment
      Collecting setuptools
        Downloading setuptools-75.6.0-py3-none-any.whl.metadata (6.7 kB)
      Collecting cython<3.0,>=0.25
        Downloading Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Downloading cymem-2.0.8.tar.gz (9.8 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Preparing metadata (pyproject.toml): started
        Preparing metadata (pyproject.toml): finished with status 'done'
      Collecting preshed<3.1.0,>=3.0.2
        Downloading preshed-3.0.9.tar.gz (14 kB)
      

In [None]:
import spacy
from collections import Counter

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Example data
core_courses = df["Core Courses"].unique()

# Process text and extract lemmas
tokens = []
for course in core_courses:
    doc = nlp(course)  # Process text with spaCy
    for token in doc:
        if not token.is_stop and not token.is_punct:  # Remove stopwords and punctuation
            tokens.append(token.lemma_.lower())  # Use lemma (base form)

# Count occurrences
token_counts = Counter(tokens)

# Output results
print("Most Common Lemmatized Terms:")
for term, count in token_counts.most_common():
    print(f"{term}: {count}")
