In [386]:
# Clear every user-defined name from the kernel namespace; -f skips the
# confirmation prompt so the notebook can run unattended from a clean state.
%reset -f

In [387]:
# Import packages
import os
import pandas as pd

# Root folder for the raw inputs: the workbooks are expected in a folder named
# "Capstone Project Data" inside the current working directory.
base_path = os.path.join(os.getcwd(), "Capstone Project Data")

# Full path of each semester's workbook (four Fall terms, then four Spring terms).
# The targets on the left line up one-to-one with the file names on the right.
(
    file_fall21,
    file_fall22,
    file_fall23,
    file_fall24,
    file_spring22,
    file_spring23,
    file_spring24,
    file_spring25,
) = (
    os.path.join(base_path, workbook_name)
    for workbook_name in (
        "College of InfoSci_Fall 2021 - CLEANED.xlsx",
        "College of InfoSci_Fall 2022 - CLEANED.xlsx",
        "College of InfoSci_Fall 2023 - CLEANED.xlsx",
        "College of InfoSci_Fall 2024 - CLEANED.xlsx",
        "College of InfoSci_Spring 2022 - CLEANED.xlsx",
        "College of InfoSci_Spring 2023 - CLEANED.xlsx",
        "College of InfoSci_Spring 2024 - CLEANED.xlsx",
        "College of InfoSci_Spring 2025 - CLEANED.xlsx",
    )
)


In [388]:
# Read every semester workbook into its own DataFrame.
# map() applies pd.read_excel to the paths in order, so each name on the left
# receives the DataFrame for the path in the same position on the right.

# Fall DataFrames
df_fall21, df_fall22, df_fall23, df_fall24 = map(
    pd.read_excel,
    (file_fall21, file_fall22, file_fall23, file_fall24),
)

# Spring DataFrames
df_spring22, df_spring23, df_spring24, df_spring25 = map(
    pd.read_excel,
    (file_spring22, file_spring23, file_spring24, file_spring25),
)

Create a dictionary (`dfs`) that holds all of our DataFrames. We will loop over this dictionary to clean every semester's data in the same way.

In [389]:
# Registry of every semester DataFrame, keyed by its variable name.
# Insertion order (Fall terms first, then Spring terms) is the order in which
# the cleaning loops below visit the semesters.
dfs = {}

# Fall terms
dfs["df_fall21"] = df_fall21
dfs["df_fall22"] = df_fall22
dfs["df_fall23"] = df_fall23
dfs["df_fall24"] = df_fall24

# Spring terms
dfs["df_spring22"] = df_spring22
dfs["df_spring23"] = df_spring23
dfs["df_spring24"] = df_spring24
dfs["df_spring25"] = df_spring25

In [390]:
# Inspect the column dtypes pandas inferred for the first semester (Fall 2021)
fall21_dtypes = df_fall21.dtypes
print(fall21_dtypes)

Term                            object
Session                         object
Session Code                    object
Campus                          object
Facility                        object
Room Capacity                    int64
Subject Code                    object
Catalog Number                  object
Class Section                   object
Req Desig                       object
Component                       object
Min Units                        int64
Max Units                        int64
Combined Section                object
Enrollment Status               object
Instruction Mode                object
Class Number                     int64
Course Description              object
Class Status Code               object
Start Date              datetime64[ns]
End Date                datetime64[ns]
Class Meeting Number           float64
Meeting Days                    object
Meeting Time Start              object
Meeting Time End                object
Total Enroll             

In [391]:
# Print the dtypes of every semester DataFrame so they can be compared side by side
for semester_name, semester_df in dfs.items():
    # One print call: header line, then the dtype listing on the following lines
    print(f"\nData types for {semester_name}:", semester_df.dtypes, sep="\n")


Data types for df_fall21:
Term                            object
Session                         object
Session Code                    object
Campus                          object
Facility                        object
Room Capacity                    int64
Subject Code                    object
Catalog Number                  object
Class Section                   object
Req Desig                       object
Component                       object
Min Units                        int64
Max Units                        int64
Combined Section                object
Enrollment Status               object
Instruction Mode                object
Class Number                     int64
Course Description              object
Class Status Code               object
Start Date              datetime64[ns]
End Date                datetime64[ns]
Class Meeting Number           float64
Meeting Days                    object
Meeting Time Start              object
Meeting Time End                objec

Each dataframe has the same data types

Convert the following columns to the nullable integer type (Int64): Room Capacity, Min Units, Max Units, Class Number, Class Meeting Number

In [392]:
# Whole-number columns that should use the Int64 dtype
columns_to_change = ["Room Capacity", "Min Units", "Max Units", "Class Number", "Class Meeting Number"]

# Walk the semester registry (dfs) and convert the selected columns of each DataFrame
for semester_name in dfs:
    semester_df = dfs[semester_name]
    converted_columns = semester_df[columns_to_change].astype("Int64")

    # Assigning back into the same DataFrame object updates it in place, so the
    # entry in dfs already reflects the change without being reassigned.
    semester_df[columns_to_change] = converted_columns
    print(f"{semester_name}: Selected columns converted to Int64")

df_fall21: Selected columns converted to Int64
df_fall22: Selected columns converted to Int64
df_fall23: Selected columns converted to Int64
df_fall24: Selected columns converted to Int64
df_spring22: Selected columns converted to Int64
df_spring23: Selected columns converted to Int64
df_spring24: Selected columns converted to Int64
df_spring25: Selected columns converted to Int64


Filter Data to Undergraduate-Only

In [393]:
for semester_name, semester_df in dfs.items():
    # First character of "Catalog Number" as a float; values whose first
    # character is not a digit become NaN and therefore fail the "< 5" test below.
    leading_digit = (
        semester_df["Catalog Number"]
        .astype(str)
        .str[0]
        .str.extract(r"(\d)", expand=False)
        .astype(float)
    )

    # Keep only undergrad courses (catalog numbers whose first digit is below 5)
    undergrad_df = semester_df[leading_digit < 5].copy()

    # Replace the registry entry with the undergrad-only version
    dfs[semester_name] = undergrad_df

    print(f"{semester_name}: {semester_df.shape[0]} original rows --> {undergrad_df.shape[0]} undergrad rows kept")

    # Also rebind the notebook-level variable of the same name, so the
    # individual DataFrames can be inspected directly
    globals()[semester_name] = undergrad_df

print("\n")

# Verification pass: count any rows whose catalog number still starts with 5 or higher
grad_remaining = False

for semester_name, semester_df in dfs.items():
    leading_char = semester_df["Catalog Number"].astype(str).str[0]
    grad_rows = leading_char.astype(int).ge(5).sum()

    if grad_rows == 0:
        continue

    grad_remaining = True
    print(f"{semester_name}: {grad_rows} grad rows still present") # DataFrame name and number of grad rows left

if not grad_remaining:
    print("Success! No grad rows remaining in any DataFrame")


df_fall21: 346 original rows --> 212 undergrad rows kept
df_fall22: 345 original rows --> 196 undergrad rows kept
df_fall23: 415 original rows --> 234 undergrad rows kept
df_fall24: 433 original rows --> 257 undergrad rows kept
df_spring22: 339 original rows --> 198 undergrad rows kept
df_spring23: 378 original rows --> 212 undergrad rows kept
df_spring24: 409 original rows --> 244 undergrad rows kept
df_spring25: 470 original rows --> 273 undergrad rows kept




Success! No grad rows remaining in any DataFrame


Remove rows we do not need, i.e. any row where:
1) Component is "Independent Study"
2) Class Status Code is "T"
3) Req Desig is "HONR"

In [394]:
for semester_name, semester_df in dfs.items():

    # One boolean mask per removal rule; each is reused for its count and for the filter
    is_independent_study = semester_df["Component"] == "Independent Study"
    is_t_status = semester_df["Class Status Code"] == "T"
    is_honors = semester_df["Req Desig"] == "HONR"

    # A row is dropped when it matches ANY of the three rules
    rows_to_drop = is_independent_study | is_t_status | is_honors
    kept_df = semester_df[~rows_to_drop]

    # Store the result in the registry and in the notebook-level variable of
    # the same name, so it can be checked manually
    dfs[semester_name] = kept_df
    globals()[semester_name] = kept_df

    # Report what was removed. A row can match more than one rule, so the three
    # per-rule counts do not have to add up to the total.
    print(
        f"{semester_name}:",
        f"Removed {is_independent_study.sum()} Independent Study rows",
        f"Removed {is_t_status.sum()} T-status rows",
        f"Removed {is_honors.sum()} HONR rows",
        f"Total removed: {len(semester_df) - len(kept_df)} rows\n",
        sep="\n",
    )


df_fall21:
Removed 24 Independent Study rows
Removed 0 T-status rows
Removed 6 HONR rows
Total removed: 24 rows

df_fall22:
Removed 26 Independent Study rows
Removed 0 T-status rows
Removed 5 HONR rows
Total removed: 26 rows

df_fall23:
Removed 39 Independent Study rows
Removed 0 T-status rows
Removed 6 HONR rows
Total removed: 39 rows

df_fall24:
Removed 44 Independent Study rows
Removed 0 T-status rows
Removed 7 HONR rows
Total removed: 44 rows

df_spring22:
Removed 24 Independent Study rows
Removed 0 T-status rows
Removed 6 HONR rows
Total removed: 24 rows

df_spring23:
Removed 29 Independent Study rows
Removed 0 T-status rows
Removed 8 HONR rows
Total removed: 29 rows

df_spring24:
Removed 37 Independent Study rows
Removed 1 T-status rows
Removed 9 HONR rows
Total removed: 38 rows

df_spring25:
Removed 37 Independent Study rows
Removed 6 T-status rows
Removed 8 HONR rows
Total removed: 43 rows



Next, look for In-Person classes that are missing Meeting Days or Start/End Times

In [395]:
# Check In-Person classes that are missing meeting days or start/end times

# Columns in which a "-" placeholder is treated as a missing value.
# NOTE(review): the per-column value counts printed later in this notebook show
# 00:00:00 in the two time columns for as many rows as have "-" in Meeting Days,
# so the time columns may never contain "-" -- confirm whether 00:00:00 should
# also be treated as missing.
days_times_columns = ["Meeting Days", "Meeting Time Start", "Meeting Time End"]

# Columns shown in the final summary, and the list that collects the per-semester pieces
filtered_columns_to_show = ["Component", "Class Number", "Course Description", "Meeting Days", "Meeting Time Start", "Meeting Time End"]
filtered_rows = []

# Semester name -> rows of that semester flagged as missing schedule info
missing_data_report = {}

for semester_name, semester_df in dfs.items():
    # Only In Person classes that are not Independent Study are checked
    is_in_person = semester_df["Instruction Mode"] == "In Person"
    is_not_independent = semester_df["Component"] != "Independent Study"

    # True when at least one of the day/time columns holds the "-" placeholder
    has_placeholder = semester_df[days_times_columns].isin(["-"]).any(axis=1)

    flagged_rows = semester_df[is_in_person & is_not_independent & has_placeholder]
    if flagged_rows.empty:
        continue
    missing_data_report[semester_name] = flagged_rows.copy()

# Print results
if not missing_data_report:
    print("No missing schedule info found!")
else:
    for semester_name, flagged_rows in missing_data_report.items():
        selected_data = flagged_rows[filtered_columns_to_show].copy()   # Only the columns of interest
        selected_data.insert(0, "DataFrame Name", semester_name)        # Label each row with its source DataFrame
        filtered_rows.append(selected_data)

    # Stack the per-semester pieces into one summary DataFrame
    missing_data_summary = pd.concat(filtered_rows, ignore_index=True)

    print("\nIn-Person Classes without Dates or Start/End Times:")
    print(missing_data_summary)


No missing schedule info found!


KDD: Print Each DF's unique values by column & their respective counts

In [396]:
# For every semester DataFrame, list each column's distinct values and how often they occur
for semester_name, semester_df in dfs.items():
    print(f"\n=== DataFrame: {semester_name} ===")
    for column_name in semester_df.columns:
        print(f"\nColumn: {column_name}")
        # dropna=False keeps missing values as their own row in the counts
        print(semester_df[column_name].value_counts(dropna=False))



=== DataFrame: df_fall21 ===

Column: Term
Term
Fall 2021    188
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    130
Seven Week - First           32
Seven Week - Second          26
Name: count, dtype: int64

Column: Session Code
Session Code
1      130
7W1     32
7W2     26
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    126
Arizona Online                   62
Name: count, dtype: int64

Column: Facility
Facility
Online                       142
Live Online                   13
Elec & Comp Engr, Rm 107       6
Haury Anthro Bldg, Rm 216      3
R P Harvill Bldg, Rm 415       2
R P Harvill Bldg, Rm 402       2
R P Harvill Bldg, Rm 401       2
R P Harvill Bldg, Rm 319       2
Modern Languages, Rm 311       2
Aero & Mech Engr, Rm S324      2
R P Harvill Bldg, Rm 204       1
R P Harvill Bldg, Rm 305       1
R P Harvill Bldg, Rm 232       1
R P Harvill Bldg, Rm 101       1
R P Harvill Bldg, Rm 134       1
R P Harvill Bldg, Rm 130

Class Meeting Number
1    170
Name: count, dtype: Int64

Column: Meeting Days
Meeting Days
-     138
TR     23
MW      3
F       3
M       3
Name: count, dtype: int64

Column: Meeting Time Start
Meeting Time Start
00:00:00    138
12:30:00      9
15:30:00      7
11:00:00      7
14:00:00      3
16:00:00      1
13:00:00      1
08:00:00      1
10:00:00      1
09:00:00      1
15:00:00      1
Name: count, dtype: int64

Column: Meeting Time End
Meeting Time End
00:00:00    138
13:45:00      9
16:45:00      7
12:15:00      6
15:15:00      3
17:15:00      1
12:50:00      1
14:50:00      1
09:50:00      1
11:50:00      1
10:50:00      1
16:50:00      1
Name: count, dtype: int64

Column: Total Enroll
Total Enroll
1      13
9      12
20     12
0       8
23      8
3       7
24      7
26      6
28      6
17      6
30      5
14      5
16      5
8       5
2       4
5       4
25      4
12      3
13      3
15      3
32      3
22      3
4       3
21      3
35      3
7       3
10      3
33      2
29      

Now that the data is cleaned, combine it into Fall, Spring, and All DataFrames

In [397]:
# Combined DataFrames: all Fall terms, all Spring terms, and everything together.
# ignore_index=True gives each combined DataFrame a fresh 0..n-1 row index.
df_fall_all = pd.concat(
    [dfs[key] for key in ("df_fall21", "df_fall22", "df_fall23", "df_fall24")],
    ignore_index=True,
)

df_spring_all = pd.concat(
    [dfs[key] for key in ("df_spring22", "df_spring23", "df_spring24", "df_spring25")],
    ignore_index=True,
)

# Fall rows first, then Spring rows
df_all = pd.concat([df_fall_all, df_spring_all], ignore_index=True)

# Registry of the combined DataFrames, keyed by variable name
dfs_cleaned = {"df_fall_all": df_fall_all, "df_spring_all": df_spring_all, "df_all": df_all}

KDD: Print Unique Values by Columns again, now for our combined dataframes

In [398]:
# Same per-column value counts as before, now for the combined DataFrames
for combined_name, combined_df in dfs_cleaned.items():
    print(f"\n=== DataFrame: {combined_name} ===")
    for column_name in combined_df.columns:
        column_counts = combined_df[column_name].value_counts(dropna=False)  # missing values counted too
        print(f"\nColumn: {column_name}", column_counts, sep="\n")


=== DataFrame: df_fall_all ===

Column: Term
Term
Fall 2024    213
Fall 2023    195
Fall 2021    188
Fall 2022    170
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    490
Seven Week - First          142
Seven Week - Second         134
Name: count, dtype: int64

Column: Session Code
Session Code
1      490
7W1    142
7W2    134
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    497
Arizona Online                  269
Name: count, dtype: int64

Column: Facility
Facility
Online                        580
Live Online                    21
R P Harvill Bldg, Rm 402       13
R P Harvill Bldg, Rm 428        8
Elec & Comp Engr, Rm 107        8
                             ... 
C E Chavez Bldg, Rm 303         1
Communication, Rm 214           1
Education, Rm 535               1
Mines & Metallurgy, Rm 213      1
Social Sciences, Rm 206         1
Name: count, Length: 73, dtype: int64

Column: Room Capacity
Room Capacity
1      605
40    

KDD: Explore Co-Convened Classes

In [399]:
def _print_co_convened_summary(frames):
    """Print the co-convened row count and distinct group count for each DataFrame.

    frames maps a display name to a DataFrame with a "Combined Section" column.
    Any "Combined Section" value other than "-" is treated as co-convened.
    """
    for frame_name, frame in frames.items():
        combined_sections = frame.loc[frame["Combined Section"] != "-", "Combined Section"]
        print(f"{frame_name}: {len(combined_sections)} co-convened courses, {combined_sections.nunique()} unique co-convened course groups (Combined Sections)")


# Combined DataFrames first, then each individual semester
_print_co_convened_summary(dfs_cleaned)
print("\n")
_print_co_convened_summary(dfs)


df_fall_all: 555 co-convened courses, 295 unique co-convened course groups (Combined Sections)
df_spring_all: 580 co-convened courses, 291 unique co-convened course groups (Combined Sections)
df_all: 1135 co-convened courses, 537 unique co-convened course groups (Combined Sections)


df_fall21: 136 co-convened courses, 74 unique co-convened course groups (Combined Sections)
df_fall22: 126 co-convened courses, 71 unique co-convened course groups (Combined Sections)
df_fall23: 142 co-convened courses, 82 unique co-convened course groups (Combined Sections)
df_fall24: 151 co-convened courses, 83 unique co-convened course groups (Combined Sections)
df_spring22: 129 co-convened courses, 71 unique co-convened course groups (Combined Sections)
df_spring23: 134 co-convened courses, 69 unique co-convened course groups (Combined Sections)
df_spring24: 155 co-convened courses, 85 unique co-convened course groups (Combined Sections)
df_spring25: 162 co-convened courses, 88 unique co-convened cours

New Data Frames: In-Person vs Online 

(from dfs_cleaned: df_all, df_fall_all, df_spring_all)

In [400]:
# Split each combined DataFrame into an Online part and an In-Person part

# New registry: "<name>_online" / "<name>_inperson" -> DataFrame
dfs_mode_split = {}

for source_name, source_df in dfs_cleaned.items():
    # A row is online when "Facility" is "Online" or "Live Online"; every other
    # row is treated as in-person
    is_online = source_df["Facility"].isin(["Online", "Live Online"])

    dfs_mode_split[f"{source_name}_online"] = source_df[is_online].copy()
    dfs_mode_split[f"{source_name}_inperson"] = source_df[~is_online].copy()

# Expose every split DataFrame as a notebook-level variable named after its key
globals().update(dfs_mode_split)

In [401]:
# Report the size (rows and columns) of each In-Person/Online DataFrame
for split_name, split_df in dfs_mode_split.items():
    row_count, column_count = split_df.shape
    print(f"{split_name}: {row_count} rows, {column_count} columns")


df_fall_all_online: 601 rows, 27 columns
df_fall_all_inperson: 165 rows, 27 columns
df_spring_all_online: 619 rows, 27 columns
df_spring_all_inperson: 174 rows, 27 columns
df_all_online: 1220 rows, 27 columns
df_all_inperson: 339 rows, 27 columns


KDD: Print Unique Values & Counts for New In-Person/Online DFs

In [402]:
# Per-column value counts for each In-Person/Online DataFrame
for split_name, split_df in dfs_mode_split.items():
    print(f"\n=== DataFrame: {split_name} ===")
    for column_name in split_df.columns:
        print(f"\nColumn: {column_name}")
        # Missing values are kept as their own entry (dropna=False)
        print(split_df[column_name].value_counts(dropna=False))


=== DataFrame: df_fall_all_online ===

Column: Term
Term
Fall 2023    157
Fall 2021    155
Fall 2024    151
Fall 2022    138
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    325
Seven Week - First          142
Seven Week - Second         134
Name: count, dtype: int64

Column: Session Code
Session Code
1      325
7W1    142
7W2    134
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    332
Arizona Online                  269
Name: count, dtype: int64

Column: Facility
Facility
Online         580
Live Online     21
Name: count, dtype: int64

Column: Room Capacity
Room Capacity
1    601
Name: count, dtype: Int64

Column: Subject Code
Subject Code
ESOC    238
ISTA    235
GAME     68
LIS      46
INFO     14
Name: count, dtype: int64

Column: Catalog Number
Catalog Number
302      34
116      34
130      32
301      23
300      20
314      20
150B1    20
317      16
251      16
321      16
161      16
322      14
351      14
470    

KDD Online: Total, Fully Online, Live Online

In [403]:
# Registry holding only the Online DataFrames
dfs_online = {
    key: dfs_mode_split[key]
    for key in ("df_fall_all_online", "df_spring_all_online", "df_all_online")
}

# For each Online DataFrame: total rows, and the split between the two facility labels
for online_name, online_df in dfs_online.items():
    facility = online_df["Facility"]
    total_online = len(online_df)
    fully_online_count = facility.eq("Online").sum()       # rows labelled "Online"
    live_online_count = facility.eq("Live Online").sum()   # rows labelled "Live Online"

    print(f"{online_name}: {total_online} online courses: {fully_online_count} Fully Online, {live_online_count} Live Online")

df_fall_all_online: 601 online courses: 580 Fully Online, 21 Live Online
df_spring_all_online: 619 online courses: 606 Fully Online, 13 Live Online
df_all_online: 1220 online courses: 1186 Fully Online, 34 Live Online


KDD: Unique Counts for our 3 _all DFs

In [404]:
# The three all-terms DataFrames: everything, online only, in-person only
dfs_cleaned_all = dict(
    zip(
        ("df_all", "df_all_online", "df_all_inperson"),
        (df_all, df_all_online, df_all_inperson),
    )
)

# Per-column value counts for each of them
for all_name, all_df in dfs_cleaned_all.items():
    print(f"\n=== DataFrame: {all_name} ===")
    for column_name in all_df.columns:
        column_counts = all_df[column_name].value_counts(dropna=False)  # missing values counted too
        print(f"\nColumn: {column_name}")
        print(column_counts)


=== DataFrame: df_all ===

Column: Term
Term
Spring 2025    230
Fall 2024      213
Spring 2024    206
Fall 2023      195
Fall 2021      188
Spring 2023    183
Spring 2022    174
Fall 2022      170
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    974
Seven Week - Second         298
Seven Week - First          287
Name: count, dtype: int64

Column: Session Code
Session Code
1      974
7W2    298
7W1    287
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    1025
Arizona Online                   534
Name: count, dtype: int64

Column: Facility
Facility
Online                      1186
Live Online                   34
R P Harvill Bldg, Rm 402      27
R P Harvill Bldg, Rm 428      19
R P Harvill Bldg, Rm 411      15
                            ... 
Modern Languages, Rm 214       1
R P Harvill Bldg, Rm 240       1
Education, Rm 240              1
Modern Languages, Rm 401       1
The Commons, Rm 105            1
Name: count, Length: 1

Save our DFs

In [405]:
# Output folder for the cleaned workbooks, inside the current working directory
cleaned_data_path = os.path.join(os.getcwd(), "Capstone Project Data_Cleaned")
os.makedirs(cleaned_data_path, exist_ok=True)  # Create folder if it doesn't exist

# Folder name used in the progress messages; derived from the path so the
# message cannot drift from the folder that is actually written to
cleaned_folder_name = os.path.basename(cleaned_data_path)

# Save the combined DFs first, then the In-Person/Online DFs.
# Each DataFrame is written to "<registry key>.xlsx" without its row index.
for df_group in (dfs_cleaned, dfs_mode_split):
    for df_name, df in df_group.items():
        filename = os.path.join(cleaned_data_path, f"{df_name}.xlsx")
        df.to_excel(filename, index=False)
        # Fix: the message for the combined DFs used to read "to to"
        print(f"Saved {df_name}.xlsx to {cleaned_folder_name}")


Saved df_fall_all.xlsx to to Capstone Project Data_Cleaned
Saved df_spring_all.xlsx to to Capstone Project Data_Cleaned
Saved df_all.xlsx to to Capstone Project Data_Cleaned
Saved df_fall_all_online.xlsx to Capstone Project Data_Cleaned
Saved df_fall_all_inperson.xlsx to Capstone Project Data_Cleaned
Saved df_spring_all_online.xlsx to Capstone Project Data_Cleaned
Saved df_spring_all_inperson.xlsx to Capstone Project Data_Cleaned
Saved df_all_online.xlsx to Capstone Project Data_Cleaned
Saved df_all_inperson.xlsx to Capstone Project Data_Cleaned
