In [10]:
import numpy as np
import pandas as pd

In [11]:
# Read the NUSMods modules CSV; the path is relative to the notebook's working directory.
nusmods_csv_path = "../data/nusmods_modules_with_levelprefix.csv"
df_read = pd.read_csv(nusmods_csv_path)

# Preview the first rows as the cell output.
df_read.head()

Unnamed: 0,course,title,description,level,prefix
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,5000,ABM
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,5000,ABM
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,5000,ABM
3,ABM5004,Capstone Project,This course encompasses research projects rele...,5000,ABM
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,5000,ABM


In [12]:
# Keep only rows whose description is present and not blank / whitespace-only.
# Both conditions are evaluated on the same frame so the mask and the rows it
# selects always share one index.
description_text = df_read["description"]
has_description = description_text.notna() & (description_text.str.strip() != "")
df_read = df_read.loc[has_description]

In [13]:
import pandas as pd
from collections import defaultdict

def find_cross_listed_courses(df):
    """Find sets of courses that share an identical description.

    Rows whose ``description`` values compare equal are treated as one
    cross-listed set. Matching is exact: no case or whitespace normalisation
    is applied. Rows with a missing description (NaN / None / NA) are skipped,
    because a missing value is not evidence that two courses are the same.
    Blank strings are ordinary values here, so callers should filter them out
    beforehand if they must not be grouped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``course`` and ``description``.

    Returns
    -------
    dict
        Maps each description used by more than one row to the list of
        ``course`` values using it, in row order. Descriptions appear in
        order of first occurrence.

    Side effects
    ------------
    Prints two summary lines: the number of cross-listed sets and the number
    of redundant rows (set size minus one, summed over all sets).
    """
    description_to_courses = defaultdict(list)

    # Walk the two needed columns directly; iterrows() would build a full
    # Series for every row only to read two fields from it.
    for description, course in zip(df["description"], df["course"]):
        if pd.isna(description):
            continue
        description_to_courses[description].append(course)

    # Only descriptions shared by more than one row count as cross-listed.
    cross_listed = {
        description: courses
        for description, courses in description_to_courses.items()
        if len(courses) > 1
    }

    total_cross_listed_sets = len(cross_listed)
    # Every set keeps one representative; the rest are the duplicates.
    count_duplicates = sum(len(courses) - 1 for courses in cross_listed.values())

    print(f"Total number of cross-listed course sets: {total_cross_listed_sets}")
    print(f"Total number of duplicate courses due to cross-listing: {count_duplicates}")

    return cross_listed

def clean_cross_listed_courses(df):
    """Collapse each set of cross-listed courses into a single row.

    For every description shared by several rows, the row with the
    alphabetically first course code is kept and its ``course`` value is
    rewritten to ``"<base> (Cross-listed as <other>, <other>, ...)"``; the
    other rows of that set are removed. Rows that are not part of any set are
    returned unchanged, and the original row order and index labels are kept.

    Rows are tracked by position rather than by course code, so:

    * a course code that also appears on a row with a different description
      is neither relabelled nor dropped by mistake;
    * exact duplicate rows (same code, same description) are collapsed
      without producing a label such as ``"A (Cross-listed as A)"``;
    * the frame is scanned once instead of once per cross-listed set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``course`` and ``description``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per cross-listed set plus all other rows.

    Side effects
    ------------
    Calls ``find_cross_listed_courses``, which prints two summary lines.
    """
    # Find cross-listed courses (also prints the summary counts).
    cross_listed = find_cross_listed_courses(df)

    # Work on a copy so the caller's frame is left untouched.
    cleaned_df = df.copy()
    if not cross_listed:
        return cleaned_df

    course_codes = df["course"].tolist()

    # One pass over the descriptions: remember the row positions of each set.
    positions_by_description = {description: [] for description in cross_listed}
    for position, description in enumerate(df["description"]):
        bucket = positions_by_description.get(description)
        if bucket is not None:
            bucket.append(position)

    course_column = cleaned_df.columns.get_loc("course")
    dropped_positions = set()

    for positions in positions_by_description.values():
        if len(positions) < 2:
            # Defensive: nothing to merge for this description.
            continue

        # Sort alphabetically by course code to choose the base course;
        # the sort is stable, so ties keep their row order.
        ordered = sorted(positions, key=course_codes.__getitem__)
        base_position = ordered[0]
        base_course = course_codes[base_position]

        # Every other row of the set is removed; its code is listed once on
        # the base row unless it merely repeats the base code.
        other_codes = []
        for position in ordered[1:]:
            dropped_positions.add(position)
            code = course_codes[position]
            if code != base_course and code not in other_codes:
                other_codes.append(code)

        if other_codes:
            cleaned_df.iloc[base_position, course_column] = (
                f"{base_course} (Cross-listed as {', '.join(other_codes)})"
            )

    # Remove duplicate rows while preserving the order of the survivors.
    kept_positions = [
        position for position in range(len(cleaned_df))
        if position not in dropped_positions
    ]
    return cleaned_df.iloc[kept_positions]

In [14]:
# Collapse cross-listed duplicates, then renumber the surviving rows from 0
# (the old index labels are discarded rather than kept as a column).
cleaned_df_pickle = clean_cross_listed_courses(df_read).reset_index(drop=True)

Total number of cross-listed course sets: 890
Total number of duplicate courses due to cross-listing: 2158


In [15]:
# A "course" value is either a bare code or "<code> (Cross-listed as ...)";
# the text before the first space is therefore the original course code.
course_labels = cleaned_df_pickle["course"]
cleaned_df_pickle["ori_course_code"] = course_labels.str.split(" ", n=1).str[0]

In [16]:
# Show the cleaned frame as the cell output to inspect the merged "course"
# labels alongside the derived "ori_course_code" column.
cleaned_df_pickle

Unnamed: 0,course,title,description,level,prefix,ori_course_code
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,5000,ABM,ABM5001
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,5000,ABM,ABM5002
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,5000,ABM,ABM5003
3,ABM5004,Capstone Project,This course encompasses research projects rele...,5000,ABM,ABM5004
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,5000,ABM,ABM5101
...,...,...,...,...,...,...
9118,ZB3288,Advanced UROPS in Computational Biology I,This course is intended for students to conduc...,3000,ZB,ZB3288
9119,ZB3289,Advanced UROPS in Computational Biology II,This course is intended for students to conduc...,3000,ZB,ZB3289
9120,ZB3310 (Cross-listed as ZB3311),Undergraduate Professional Internship Programme,In addition to having an academic science foun...,3000,ZB,ZB3310
9121,ZB3312,Enhanced Undergraduate Professional Internship...,In addition to having an academic science foun...,3000,ZB,ZB3312


In [17]:
# Persist the cleaned frame; the relative path resolves against the current
# working directory. Pickle files should only be loaded from trusted sources.
cleaned_pickle_path = "cleaned_nusmods.pkl"
cleaned_df_pickle.to_pickle(cleaned_pickle_path)