In [1]:
import json
import pandas as pd

# Parse the NUSMods module-info JSON file.
with open("../data/nusmods_2025_2026_moduleInfo.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Build the frame, keep the three fields used downstream, and expose
# "moduleCode" under the shorter name "course".
df = (
    pd.DataFrame(data)[["moduleCode", "title", "description"]]
    .rename(columns={"moduleCode": "course"})
)
print(df.columns)
df.head()

Index(['course', 'title', 'description'], dtype='object')


Unnamed: 0,course,title,description
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...
3,ABM5004,Capstone Project,This course encompasses research projects rele...
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...


In [2]:
# Sanity check: (rows, columns) of the frame after column selection.
print(df.shape)

(16531, 3)


In [4]:
import re
import pandas as pd

# --- Step 1: Extract base module code ---
def simplify_module_code(code: str) -> str:
    """
    Return ``code`` with any run of trailing ASCII letters stripped off.

    Examples: "CS1010A" -> "CS1010"; "CS1010" is returned unchanged.
    Any value that is not a ``str`` (e.g. None or NaN) yields None.
    """
    if isinstance(code, str):
        trailing_letters = r"[A-Za-z]+$"
        return re.sub(trailing_letters, "", code)
    return None

# Attach the suffix-free code to every row of df; it is the de-duplication key.
df["baseCode"] = df["course"].apply(simplify_module_code)

# Mask of rows whose course code ends with a digit (i.e. carries no letter suffix).
ends_with_digit = df["course"].str[-1].str.isdigit()

# Keep those rows, retain the first row per baseCode, renumber the index,
# and finally discard the helper column.
df_filtered = (
    df.loc[ends_with_digit]
    .drop_duplicates(subset="baseCode", keep="first")
    .reset_index(drop=True)
    .drop(columns=["baseCode"])
)

# Spot-check a few well-known courses.
df_filtered[df_filtered["course"].isin(["CS1010", "CS2040", "CS2100", "ACC1701"])]


Unnamed: 0,course,title,description
23,ACC1701,Accounting for Decision Makers,The course provides an introduction to account...
1897,CS1010,Programming Methodology,This course introduces the fundamental concept...
1904,CS2040,Data Structures and Algorithms,This course introduces students to the design ...
1905,CS2100,Computer Organisation,The objective of this course is to familiarise...


In [5]:
# (rows, columns) remaining after filtering and de-duplication.
df_filtered.shape

(11627, 3)

In [6]:
# Save the filtered modules to ../data/ (relative to the working directory),
# without writing the DataFrame index as a column.
df_filtered.to_csv("../data/nusmods_modules_no_levelprefix.csv", index=False, encoding="utf-8")

In [7]:
def extract_level(course):
    """
    Derive a course's level from its module code.

    The last four characters of ``course`` are parsed as an integer and
    rounded down to the nearest thousand, e.g. "CS2040" -> 2000.

    Returns None when ``course`` is not a string (e.g. None or NaN) or when
    its last four characters do not parse as an integer (e.g. "DMA1401L01",
    whose tail is "1L01").
    """
    if not isinstance(course, str):
        # Slicing a missing value would raise TypeError, which the handler
        # below does not catch; treat it as "no level" instead.
        return None
    try:
        num = int(course[-4:])
    except ValueError:
        return None
    return (num // 1000) * 1000
    

In [8]:
# Derive the level for every course. extract_level returns None for codes it
# cannot parse; pandas stores those as NaN, so the column is float at this point.
df_filtered['level'] = df_filtered['course'].apply(extract_level)
df_filtered

Unnamed: 0,course,title,description,level
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,5000.0
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,5000.0
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,5000.0
3,ABM5004,Capstone Project,This course encompasses research projects rele...,5000.0
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,5000.0
...,...,...,...,...
11622,ZB3312,Enhanced Undergraduate Professional Internship...,In addition to having an academic science foun...,3000.0
11623,ZB3313,Undergraduate Professional Internship Programm...,In addition to having an academic science foun...,3000.0
11624,ZB4171,Advanced Topics in Bioinformatics,This is a seminar-style course based on the li...,4000.0
11625,ZB4199,Honours Project in Computational Biology,Not Available,4000.0


In [9]:
# Rows whose level could not be parsed (NaN), e.g. codes with an extra
# suffix after the four-digit number.
missing_level = df_filtered[df_filtered['level'].isna()]
missing_level

Unnamed: 0,course,title,description,level
2211,DMA1401L01,Design Your Own Course,Not Available,
2212,DMA1401L02,Design Your Own Course,Not Available,
2213,DMA1402L01,Design Your Own Course,Not Available,


In [10]:
# Keep only rows with a parsed level, store the level as an integer,
# and renumber the index from 0.
df_dropped = (
    df_filtered
    .dropna(subset=['level'])
    .astype({'level': int})
    .reset_index(drop=True)
)
df_dropped

Unnamed: 0,course,title,description,level
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,5000
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,5000
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,5000
3,ABM5004,Capstone Project,This course encompasses research projects rele...,5000
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,5000
...,...,...,...,...
11619,ZB3312,Enhanced Undergraduate Professional Internship...,In addition to having an academic science foun...,3000
11620,ZB3313,Undergraduate Professional Internship Programm...,In addition to having an academic science foun...,3000
11621,ZB4171,Advanced Topics in Bioinformatics,This is a seminar-style course based on the li...,4000
11622,ZB4199,Honours Project in Computational Biology,Not Available,4000


In [11]:
# Distinct level values present after dropping unparsable codes.
df_dropped["level"].unique()

array([5000, 1000, 2000, 3000, 4000, 6000, 8000, 9000])

In [12]:
# Prefix = the leading run of letters in the course code, e.g. "ABM5001" -> "ABM".
df_dropped["prefix"] = df_dropped["course"].str.extract(r"^([A-Za-z]+)")


In [13]:
# Preview the first rows, now including the prefix column.
df_dropped.head()

Unnamed: 0,course,title,description,level,prefix
0,ABM5001,Leadership in Biomedicine,Leadership is fundamental to the success of in...,5000,ABM
1,ABM5002,Advanced Biostatistics for Research,This course is served as a concept-based intro...,5000,ABM
2,ABM5003,Biomedical Innovation & Enterprise,This course will furnish students with a thoro...,5000,ABM
3,ABM5004,Capstone Project,This course encompasses research projects rele...,5000,ABM
4,ABM5101,Applied Immunology,Advanced immunological applications play impor...,5000,ABM


In [14]:
# List the distinct prefixes, then show how many there are.
print(df_dropped["prefix"].unique())
df_dropped["prefix"].nunique()

['ABM' 'AC' 'ACC' 'ACE' 'ADS' 'AH' 'AI' 'AII' 'AIS' 'ALS' 'AN' 'AR' 'ARD'
 'ASP' 'AUD' 'AX' 'AY' 'BAA' 'BBB' 'BBP' 'BDC' 'BHD' 'BI' 'BIH' 'BIS' 'BL'
 'BLD' 'BMA' 'BMC' 'BMD' 'BME' 'BMF' 'BMG' 'BMH' 'BMI' 'BMK' 'BMO' 'BMP'
 'BMS' 'BMT' 'BMU' 'BMX' 'BN' 'BPM' 'BPS' 'BRP' 'BS' 'BSE' 'BSN' 'BSP'
 'BSS' 'BST' 'BT' 'BV' 'BWS' 'BX' 'BZD' 'CAH' 'CAS' 'CCS' 'CD' 'CDE' 'CDM'
 'CE' 'CEG' 'CFA' 'CFG' 'CG' 'CH' 'CHC' 'CIT' 'CK' 'CL' 'CLC' 'CM' 'CMH'
 'CN' 'COS' 'CP' 'CS' 'CSA' 'CSX' 'CZ' 'DAO' 'DBA' 'DE' 'DEP' 'DI' 'DL'
 'DMC' 'DMX' 'DOS' 'DSA' 'DSC' 'DSE' 'DSN' 'DSS' 'DTK' 'DTS' 'DY' 'EBA'
 'EC' 'ECA' 'EE' 'EEK' 'EG' 'EHB' 'EL' 'ELC' 'EM' 'EN' 'ENC' 'ENV' 'ES'
 'ESE' 'ESP' 'ETP' 'EU' 'EX' 'FAS' 'FDP' 'FE' 'FIN' 'FSC' 'FSP' 'FST' 'FT'
 'GE' 'GEA' 'GEC' 'GEH' 'GEI' 'GEK' 'GEM' 'GEN' 'GES' 'GESS' 'GET' 'GEX'
 'GL' 'GMS' 'GPM' 'GS' 'GSA' 'GSN' 'GSS' 'HE' 'HI' 'HLE' 'HM' 'HS' 'HSA'
 'HSH' 'HSI' 'HSS' 'HY' 'IAN' 'ID' 'IDE' 'IDS' 'IDX' 'IE' 'IFS' 'IGL'
 'IJS' 'IND' 'INM' 'INT' 'IPM' 'IPS' 'IS' 'ISC' 'ISD

371

In [15]:
# Save the cleaned modules, including the level and prefix columns, to ../data/
# (relative to the working directory), without writing the DataFrame index.
df_dropped.to_csv("../data/nusmods_modules_with_levelprefix.csv", index=False, encoding="utf-8")

In [16]:
# Inspect the courses whose level is 2000.
df_dropped[df_dropped['level'] == 2000]

Unnamed: 0,course,title,description,level,prefix
24,ACC2706,Managerial Accounting,"This course covers major concepts, tools and t...",2000,ACC
25,ACC2707,Corporate Accounting & Reporting I,The course covers financial accounting at an i...,2000,ACC
26,ACC2708,Corporate Accounting & Reporting II,The course covers financial accounting at an i...,2000,ACC
27,ACC2709,Accounting Information Systems,This course aims to help students understand t...,2000,ACC
28,ACC2711,Sustainability Metrics and Performance Analysis,In this course we examine how companies are us...,2000,ACC
...,...,...,...,...,...
11370,YSS2252,What is the Global South: Africa in the World,What is ‘Africa’ and who is African? Is the Af...,2000,YSS
11371,YSS2253,Divided Cities,This course investigates the social divisions ...,2000,YSS
11372,YSS2254,Statistics for Psychology using SPSS,The course introduces the main descriptive and...,2000,YSS
11612,ZB2101,Introductory Bioinformatics,"Students will be introduced to the concepts, t...",2000,ZB
