In [275]:
# Import packages
import os
import pandas as pd

# Set base path for reproducibility
# Save initial data to a folder "Capstone Project Data" in the current working directory
base_path = os.path.join(os.getcwd(), "Capstone Project Data")

# Set file paths for each semester
file_fall21 = os.path.join(base_path, "College of InfoSci_Fall 2021 - CLEANED.xlsx")
file_fall22 = os.path.join(base_path, "College of InfoSci_Fall 2022 - CLEANED.xlsx")
file_fall23 = os.path.join(base_path, "College of InfoSci_Fall 2023 - CLEANED.xlsx")
file_fall24 = os.path.join(base_path, "College of InfoSci_Fall 2024 - CLEANED.xlsx")
file_spring22 = os.path.join(base_path, "College of InfoSci_Spring 2022 - CLEANED.xlsx")
file_spring23 = os.path.join(base_path, "College of InfoSci_Spring 2023 - CLEANED.xlsx")
file_spring24 = os.path.join(base_path, "College of InfoSci_Spring 2024 - CLEANED.xlsx")
file_spring25 = os.path.join(base_path, "College of InfoSci_Spring 2025 - CLEANED.xlsx")


In [276]:
# Load each file into a separate DataFrame

# Fall DFs
df_fall21 = pd.read_excel(file_fall21)
df_fall22 = pd.read_excel(file_fall22)
df_fall23 = pd.read_excel(file_fall23)
df_fall24 = pd.read_excel(file_fall24)

# Spring DFs
df_spring22 = pd.read_excel(file_spring22)
df_spring23 = pd.read_excel(file_spring23)
df_spring24 = pd.read_excel(file_spring24)
df_spring25 = pd.read_excel(file_spring25)

Create Dictionary (dfs) for all our DataFrames. Will use the Dictionary for data cleansing with for-loops

In [277]:
# Dictionary of all our DataFrames
dfs = {
    "df_fall21": df_fall21,
    "df_fall22": df_fall22,
    "df_fall23": df_fall23,
    "df_fall24": df_fall24,
    "df_spring22": df_spring22,
    "df_spring23": df_spring23,
    "df_spring24": df_spring24,
    "df_spring25": df_spring25
}

In [278]:
# Check data types from first DF
print(df_fall21.dtypes)

Term                            object
Session                         object
Session Code                    object
Campus                          object
Facility                        object
Room Capacity                    int64
Subject Code                    object
Catalog Number                  object
Class Section                   object
Req Desig                       object
Component                       object
Min Units                        int64
Max Units                        int64
Combined Section                object
Enrollment Status               object
Instruction Mode                object
Class Number                     int64
Course Description              object
Class Status Code               object
Start Date              datetime64[ns]
End Date                datetime64[ns]
Class Meeting Number           float64
Meeting Days                    object
Meeting Time Start              object
Meeting Time End                object
Total Enroll             

In [279]:
# Check data types for each DF

for df_semester, df in dfs.items():
    print(f"\nData types for {df_semester}:")
    print(df.dtypes)


Data types for df_fall21:
Term                            object
Session                         object
Session Code                    object
Campus                          object
Facility                        object
Room Capacity                    int64
Subject Code                    object
Catalog Number                  object
Class Section                   object
Req Desig                       object
Component                       object
Min Units                        int64
Max Units                        int64
Combined Section                object
Enrollment Status               object
Instruction Mode                object
Class Number                     int64
Course Description              object
Class Status Code               object
Start Date              datetime64[ns]
End Date                datetime64[ns]
Class Meeting Number           float64
Meeting Days                    object
Meeting Time Start              object
Meeting Time End                objec

Each dataframe has the same data types

Update Data Types into Integer: Room Capacity, Min Units, Max Units, Class Number, Class Meeting Number

In [None]:
# Columns to convert to Int64
columns_to_change = ["Room Capacity", "Min Units", "Max Units", "Class Number", "Class Meeting Number"]

# Iterate through our dictionary (dfs), defined above
for df_semester, df in dfs.items():
    df[columns_to_change] = df[columns_to_change].astype("Int64")    # Convert to Int64
    dfs[df_semester] = df                                            # Update dictionary
    print(f"{df_semester}: Selected columns converted to Int64")

# Columns to convert to datetime64[ns]
columns_to_change_time = ["Meeting Time Start", "Meeting Time End"]
print("\n")

for df_semester, df in dfs.items():
    for col in columns_to_change_time:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce').astype("datetime64[ns]")
    dfs[df_semester] = df  # Update dictionary
    print(f"{df_semester}: Start/End Time columns converted to datetime64[ns]")

# Double check if data updated
# print(df_fall21.dtypes)

df_fall21: Selected columns converted to Int64
df_fall22: Selected columns converted to Int64
df_fall23: Selected columns converted to Int64
df_fall24: Selected columns converted to Int64
df_spring22: Selected columns converted to Int64
df_spring23: Selected columns converted to Int64
df_spring24: Selected columns converted to Int64
df_spring25: Selected columns converted to Int64


df_fall21: Start/End Time columns converted to datetime64[ns]
df_fall22: Start/End Time columns converted to datetime64[ns]
df_fall23: Start/End Time columns converted to datetime64[ns]
df_fall24: Start/End Time columns converted to datetime64[ns]
df_spring22: Start/End Time columns converted to datetime64[ns]
df_spring23: Start/End Time columns converted to datetime64[ns]
df_spring24: Start/End Time columns converted to datetime64[ns]
df_spring25: Start/End Time columns converted to datetime64[ns]
Term                            object
Session                         object
Session Code                    

Remove unnecessary data:
1) Component > Independent Study
2) Class Status Code > T
3) Req Desig > HONR

In [281]:
for df_semester, df in dfs.items():

    # Count how many rows match each condition - to print once removed
    original_rowcount = len(df)              # Original - to subtract from total
    count_independent = (df["Component"] == "Independent Study").sum()
    count_t = (df["Class Status Code"] == "T").sum()
    count_honor = (df["Req Desig"] == "HONR").sum()

    # Remove rows where ANY of the 3 conditions are True
    df_cleaned = df[
        ~((df["Component"] == "Independent Study") |
          (df["Class Status Code"] == "T") |
          (df["Req Desig"] == "HONR"))]

    dfs[df_semester] = df_cleaned
    new_count = len(df_cleaned)
    total_removed = original_rowcount - new_count

    # Print what was removed
    print(f"{df_semester}:")
    print(f"Removed {count_independent} Independent Study rows")
    print(f"Removed {count_t} T-status rows")
    print(f"Removed {count_honor} HONR rows")
    print(f"Total removed: {total_removed} rows\n")


df_fall21:
Removed 63 Independent Study rows
Removed 0 T-status rows
Removed 6 HONR rows
Total removed: 63 rows

df_fall22:
Removed 72 Independent Study rows
Removed 0 T-status rows
Removed 5 HONR rows
Total removed: 72 rows

df_fall23:
Removed 101 Independent Study rows
Removed 0 T-status rows
Removed 6 HONR rows
Total removed: 101 rows

df_fall24:
Removed 95 Independent Study rows
Removed 2 T-status rows
Removed 7 HONR rows
Total removed: 97 rows

df_spring22:
Removed 69 Independent Study rows
Removed 0 T-status rows
Removed 6 HONR rows
Total removed: 69 rows

df_spring23:
Removed 80 Independent Study rows
Removed 0 T-status rows
Removed 8 HONR rows
Total removed: 80 rows

df_spring24:
Removed 85 Independent Study rows
Removed 2 T-status rows
Removed 9 HONR rows
Total removed: 87 rows

df_spring25:
Removed 96 Independent Study rows
Removed 12 T-status rows
Removed 8 HONR rows
Total removed: 108 rows



Check for Missing Data in each DataFrame

In [282]:
# Check for missing values 

# Empty dictionary to store the results
missing_data_check = {}

for df_semester, df in dfs.items():
    missing_total = df.isnull().sum().sum() # Total missing values; 1st sum() sums columns, 2nd sums rows
    if missing_total > 0:                   # If missing values found, add to our empty dictionary (missing_data_check)
        missing_data_check[df_semester] = missing_total  # missing_total by df_semester adding to our missing_data_check dictionary

# Print results
if missing_data_check:
    print("DataFrames with missing values:")
    # Iterate through our missing values dictionary (missing_data_check)
    for df_semester, missing_sum in missing_data_check.items():
        print(f"{df_semester}: {missing_sum} missing values")
else:
    print("No missing data found!")

DataFrames with missing values:
df_fall21: 570 missing values
df_fall22: 550 missing values
df_fall23: 628 missing values
df_fall24: 672 missing values
df_spring22: 544 missing values
df_spring23: 596 missing values
df_spring24: 644 missing values
df_spring25: 724 missing values


Fall21, Fall22, and Spring22 all have missing data

Explore what is missing in them:

In [283]:
#Explore which data is missing in Fall21, Fall22, Spring22; make new dictionary for missing data
dfs_missing = {
    "df_fall21": df_fall21,
    "df_fall22": df_fall22,
    "df_spring22": df_spring22}

for df_semester, df in dfs_missing.items():
    if df.isnull().sum().sum() > 0:                 # If there total sum of missing values found in the entire df > 0
        print(f"\nMissing data in {df_semester}:")
        print(df.isnull().sum())                    # Print each column & whether there is missing values by column


Missing data in df_fall21:
Term                      0
Session                   0
Session Code              0
Campus                    0
Facility                  0
Room Capacity             0
Subject Code              0
Catalog Number            0
Class Section             0
Req Desig                 0
Component                 0
Min Units                 0
Max Units                 0
Combined Section          0
Enrollment Status         0
Instruction Mode          0
Class Number              0
Course Description        0
Class Status Code         0
Start Date                1
End Date                  1
Class Meeting Number      1
Meeting Days              1
Meeting Time Start      346
Meeting Time End        346
Total Enroll              0
Enrollment Capacity       0
dtype: int64

Missing data in df_fall22:
Term                      0
Session                   0
Session Code              0
Campus                    0
Facility                  0
Room Capacity             0
Subject

The Missing Data is the same 6 columns for each of the 3 dataframes (Fall21, Fall22, Spring22):

Start Date              1

End Date                1

Class Meeting Number    1

Meeting Days            1

Meeting Time Start      1

Meeting Time End        1

Replace the Missing Data with Default Values

In [284]:
# Replace the missing data with default values (manual, without For Loop)

# Define what we will insert, by DF
default_values_fall21 = {
    "Start Date": pd.Timestamp("2021-08-23 00:00:00"),  #match datetime64[ns] format
    "End Date": pd.Timestamp("2021-12-08 00:00:00"),
    "Class Meeting Number": 1,
    "Meeting Days": "-",
    "Meeting Time Start": "00:00:00",
    "Meeting Time End": "00:00:00"}

default_values_fall22 = {
    "Start Date": pd.Timestamp("2022-08-22 00:00:00"),
    "End Date": pd.Timestamp("2022-12-07 00:00:00"),
    "Class Meeting Number": 1,
    "Meeting Days": "-",
    "Meeting Time Start": "00:00:00",
    "Meeting Time End": "00:00:00"}

default_values_spring22 = {
    "Start Date": pd.Timestamp("2022-01-12 00:00:00"),
    "End Date": pd.Timestamp("2022-05-04 00:00:00"),
    "Class Meeting Number": 1,
    "Meeting Days": "-",
    "Meeting Time Start": "00:00:00",
    "Meeting Time End": "00:00:00"}

#If the cell is "missing" and the Session is "Regular Academic Session", insert the default values defined above
for col, default in default_values_fall21.items():
    df_fall21.loc[(df["Session"] == "Regular Academic Session") & (df_fall21[col].isnull()), col] = default

for col, default in default_values_fall22.items():
    df_fall22.loc[(df["Session"] == "Regular Academic Session") & (df_fall22[col].isnull()), col] = default

for col, default in default_values_spring22.items():
    df_spring22.loc[(df_spring22["Session"] == "Regular Academic Session") & (df_spring22[col].isnull()), col] = default

In [285]:
# Confirm if the missing data was fixed
# Re-run the same For Loop (with dfs_missing) that originally found the missing data
for df_semester, df in dfs_missing.items():
    if df.isnull().sum().sum() > 0:
        print(f"\nMissing data in {df_semester}:")
        print(df.isnull().sum())
else: 
    print("Missing values fixed!")



Missing data in df_fall21:
Term                     0
Session                  0
Session Code             0
Campus                   0
Facility                 0
Room Capacity            0
Subject Code             0
Catalog Number           0
Class Section            0
Req Desig                0
Component                0
Min Units                0
Max Units                0
Combined Section         0
Enrollment Status        0
Instruction Mode         0
Class Number             0
Course Description       0
Class Status Code        0
Start Date               0
End Date                 0
Class Meeting Number     0
Meeting Days             0
Meeting Time Start      89
Meeting Time End        89
Total Enroll             0
Enrollment Capacity      0
dtype: int64

Missing data in df_fall22:
Term                     0
Session                  0
Session Code             0
Campus                   0
Facility                 0
Room Capacity            0
Subject Code             0
Catalog Numbe

Missing values fixed

Next, look for In-Person Classes without Dates or Start/End Times

In [286]:
#Check In-Person Classes without Dates or Start/End Times

days_times_columns = ["Meeting Days", "Meeting Time Start", "Meeting Time End"]

# List to store missing data results & which columns to show in the final summary
filtered_rows = []
filtered_columns_to_show = ["Component", "Class Number", "Course Description", "Meeting Days", "Meeting Time Start", "Meeting Time End"]

# Empty dictionary to store missing data report
missing_data_report = {}

# Iterate through each DataFrame in our dictionary "dfs"
for df_semester, df in dfs.items():
    condition = (                                   # Conditions: In Person & not Independent Study           
        (df["Instruction Mode"] == "In Person") &  
        (df["Component"] != "Independent Study")) 
    
    # Find missing observations ("-") in our days_times_columns columns
    missing_days_times = df[condition & df[days_times_columns].isin(["-"]).any(axis=1)] # .any() for any column with "-", not just 3 for 3
    
    # If missing data exists ("-"), store in missing_data_report
    if not missing_days_times.empty:
        missing_data_report[df_semester] = missing_days_times.copy()

# Print results
if missing_data_report:
    for df_semester, df_missing in missing_data_report.items():
        if isinstance(df_missing, pd.DataFrame):    # Confirm df_missing is a DataFrame before data manipulation
            #Set up our empty list (filtered_rows) to store the rows with missing data in our selected columns
            selected_data = df_missing[filtered_columns_to_show].copy() # Create new DF with our selected columns
            selected_data.insert(0, "DataFrame Name", df_semester)      # Insert new column for the Dataframe Name
            filtered_rows.append(selected_data)                         # Add to list filtered_rows by appending selected_data

    # Print results
    if filtered_rows:
        missing_data_summary = pd.concat(filtered_rows, ignore_index = True) # Transform our list (filtered_rows) to DF
    
        print("\nIn-Person Classes without Dates or Start/End Times:")
        print(missing_data_summary)
        

else:
    print("No missing schedule info found!")



In-Person Classes without Dates or Start/End Times:
  DataFrame Name   Component  Class Number         Course Description  \
0      df_fall22  Laboratory         67885  Designing an Installation   
1    df_spring22  Laboratory         95553  Designing an Installation   

  Meeting Days Meeting Time Start Meeting Time End  
0            -                NaT              NaT  
1            -                NaT              NaT  


^ In-Person Classes without Dates or Start/End Times

Decided to Delete them

In [287]:
#Delete the Classes without Dates or Start/End Times

for df_semester, df in dfs.items():
    df = dfs[df_semester]
    
    # Find those missing Classes again
    condition = (
        (df["Instruction Mode"] == "In Person") &
        (df["Component"] != "Independent Study") &
        (df[days_times_columns].isin(["-"]).any(axis=1))) # Searching for blanks here now
    
    # Drop the rows that meet the condition
    dropped_count = df[condition].shape[0]                # Count dropped rows before deleting
    df.drop(index=df.loc[condition].index, inplace=True)  # Delete the rows
    dfs[df_semester] = df

    # Print how many rows were deleted
    if dropped_count > 0:
        print(f"{dropped_count} rows removed from {df_semester}.")
    else:
        print(f"No rows removed from {df_semester}.")


No rows removed from df_fall21.
1 rows removed from df_fall22.
No rows removed from df_fall23.
No rows removed from df_fall24.
1 rows removed from df_spring22.
No rows removed from df_spring23.
No rows removed from df_spring24.
No rows removed from df_spring25.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(index=df.loc[condition].index, inplace=True)  # Delete the rows
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(index=df.loc[condition].index, inplace=True)  # Delete the rows
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(index=df.loc[condition].index, inplace=True)  # Delete the rows
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

Rows removed

(?? I wish the SettingWithCopyWarning: warnings weren't happening)

Run it again to check is In-Person Classes without Days/Times was fixed

In [288]:
"""
for df_semester, df in dfs.items():
    df = dfs[df_semester]
    
    # Find those missing Classes again
    condition = (
        (df["Instruction Mode"] == "In Person") &
        (df["Component"] != "Independent Study") &
        (df[days_times_columns].isin(["-"]).any(axis=1))) # Searching for blanks here now
    
    # Drop the rows that meet the condition
    dropped_count = df[condition].shape[0]                # Count dropped rows before deleting
    df.drop(index=df.loc[condition].index, inplace=True)  # Delete the rows
    dfs[df_semester] = df

    # Print how many rows were deleted
    if dropped_count > 0:
        print(f"{dropped_count} rows removed from {df_semester}.")
    else:
        print(f"No rows removed from {df_semester}.")

"""

'\nfor df_semester, df in dfs.items():\n    df = dfs[df_semester]\n    \n    # Find those missing Classes again\n    condition = (\n        (df["Instruction Mode"] == "In Person") &\n        (df["Component"] != "Independent Study") &\n        (df[days_times_columns].isin(["-"]).any(axis=1))) # Searching for blanks here now\n    \n    # Drop the rows that meet the condition\n    dropped_count = df[condition].shape[0]                # Count dropped rows before deleting\n    df.drop(index=df.loc[condition].index, inplace=True)  # Delete the rows\n    dfs[df_semester] = df\n\n    # Print how many rows were deleted\n    if dropped_count > 0:\n        print(f"{dropped_count} rows removed from {df_semester}.")\n    else:\n        print(f"No rows removed from {df_semester}.")\n\n'

In-Person Classes without Dates or Start/End Times Fixed

KDD: Print Each DF's unique values by column & their respective counts

In [289]:
for df_semester, df in dfs.items():
    print(f"\n=== DataFrame: {df_semester} ===")
    for col in df.columns:
        print(f"\nColumn: {col}")
        value_counts = df[col].value_counts(dropna=False)
        print(value_counts)



=== DataFrame: df_fall21 ===

Column: Term
Term
Fall 2021    283
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    214
Seven Week - First           39
Seven Week - Second          30
Name: count, dtype: int64

Column: Session Code
Session Code
1      214
7W1     39
7W2     30
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    189
Arizona Online                   94
Name: count, dtype: int64

Column: Facility
Facility
Online                        211
Live Online                    15
Elec & Comp Engr, Rm 107       12
R P Harvill Bldg, Rm 460        7
Haury Anthro Bldg, Rm 216       4
Modern Languages, Rm 311        4
R P Harvill Bldg, Rm 130        4
R P Harvill Bldg, Rm 402        3
R P Harvill Bldg, Rm 415        2
R P Harvill Bldg, Rm 401        2
R P Harvill Bldg, Rm 319        2
Aero & Mech Engr, Rm S324       2
R P Harvill Bldg, Rm 232        2
R P Harvill Bldg, Rm 134        2
R P Harvill Bldg, Rm 204        1
R P Harvi

Term
Fall 2024    336
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    236
Seven Week - First           52
Seven Week - Second          48
Name: count, dtype: int64

Column: Session Code
Session Code
1      236
7W1     52
7W2     48
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    216
Arizona Online                  120
Name: count, dtype: int64

Column: Facility
Facility
Online                        240
R P Harvill Bldg, Rm 402        7
Live Online                     6
R P Harvill Bldg, Rm 460        6
M Pacheco ILC, Rm 129           4
R P Harvill Bldg, Rm 428        3
R P Harvill Bldg, Rm 413        3
R P Harvill Bldg, Rm 411        3
R P Harvill Bldg, Rm 208        3
Social Sciences, Rm 206         3
Harshbarger Bldg, Rm 206        2
Phys-Atmos Sci, Rm 224          2
C E Chavez Bldg, Rm 405         2
R P Harvill Bldg, Rm 415        2
Chemistry, Rm 205               2
Civil Engineering, Rm 201       2
R P Harvill Bldg, R

KDD: Explore Co-Convened Classes

In [290]:
for df_semester, df in dfs.items():
    # Find for co-convened courses
    co_convened_rows = df[df["Combined Section"] != "-"] # Any value besides "-" is co-convened

    # Count total number of co-convened courses (rows) & how many are unique ("combined section")
    co_convened_total = co_convened_rows.shape[0]
    co_convened_unique = co_convened_rows["Combined Section"].nunique()

    print(f"{df_semester}: {co_convened_total} co-convened courses, {co_convened_unique} unique co-convened course groups (Combined Sections)")


df_fall21: 223 co-convened courses, 95 unique co-convened course groups (Combined Sections)
df_fall22: 222 co-convened courses, 88 unique co-convened course groups (Combined Sections)
df_fall23: 251 co-convened courses, 103 unique co-convened course groups (Combined Sections)
df_fall24: 261 co-convened courses, 104 unique co-convened course groups (Combined Sections)
df_spring22: 219 co-convened courses, 92 unique co-convened course groups (Combined Sections)
df_spring23: 244 co-convened courses, 92 unique co-convened course groups (Combined Sections)
df_spring24: 267 co-convened courses, 108 unique co-convened course groups (Combined Sections)
df_spring25: 281 co-convened courses, 112 unique co-convened course groups (Combined Sections)


Now that data is cleaned, Combine into Fall, Spring, and All DataFrames

In [291]:
#Combined DataFrames: Fall, Spring, and All
df_fall_all = pd.concat(
    [dfs["df_fall21"], dfs["df_fall22"], dfs["df_fall23"], dfs["df_fall24"]],
    ignore_index=True)

df_spring_all = pd.concat(
    [dfs["df_spring22"], dfs["df_spring23"], dfs["df_spring24"], dfs["df_spring25"]],
    ignore_index=True)

df_all = pd.concat([df_fall_all, df_spring_all], ignore_index=True)

dfs_cleaned = {
    "df_fall_all": df_fall_all,
    "df_spring_all": df_spring_all,
    "df_all": df_all
    }

In [292]:
#Save DataFrames to Excel (.xlsx)

for df_semester, df in dfs_cleaned.items():
    filename = f"{df_semester}.xlsx"        # Save file name
    df.to_excel(filename, index=False)      # Save to Excel
    print(f"Saved {df_semester} to {filename}")


Saved df_fall_all to df_fall_all.xlsx
Saved df_spring_all to df_spring_all.xlsx
Saved df_all to df_all.xlsx


KDD: Print Unique Values by Columns again, now for our combined dataframes

In [293]:
for df_semester, df in dfs_cleaned.items():
    print(f"\n=== DataFrame: {df_semester} ===")
    for col in df.columns:
        print(f"\nColumn: {col}")
        value_counts = df[col].value_counts(dropna=False)
        print(value_counts)


=== DataFrame: df_fall_all ===

Column: Term
Term
Fall 2024    336
Fall 2023    314
Fall 2021    283
Fall 2022    272
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    850
Seven Week - First          183
Seven Week - Second         172
Name: count, dtype: int64

Column: Session Code
Session Code
1      850
7W1    183
7W2    172
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    770
Arizona Online                  435
Name: count, dtype: int64

Column: Facility
Facility
Online                      922
Live Online                  27
R P Harvill Bldg, Rm 402     23
R P Harvill Bldg, Rm 460     22
Elec & Comp Engr, Rm 107     14
                           ... 
Modern Languages, Rm 413      1
Social Sciences, Rm 222       1
R P Harvill Bldg, Rm 102      1
R P Harvill Bldg, Rm 105      1
C E Chavez Bldg, Rm 109       1
Name: count, Length: 90, dtype: int64

Column: Room Capacity
Room Capacity
1      953
40      34
31      24
44    

Make more DataFrames: 1) Undergrad vs Graduate, 2) Online vs In-Person

In [294]:
#New DFs: Undergrad vs Grad

# empty dictionary for ugrad vs grad
dfs_split = {}

# Iterate through our cleaned DataFrames' dictionary (dfs_cleaned)
for name, df in dfs_cleaned.items():
    # Extract First Digit from "Catalog Number" column, convert to float
    first_digit = df["Catalog Number"].astype(str).str[0].str.extract(r"(\d)").astype(float)

    # Add a temporary column to store the extracted digit
    df = df.copy()
    df["Catalog_First_Digit"] = first_digit

    # Split Undergrad vs Grad based on the float first digit <5 (ugrad) or >= 5 (grad)
    df_ugrad = df[df["Catalog_First_Digit"] < 5].copy()
    df_grad = df[df["Catalog_First_Digit"] >= 5].copy()

    # Store Ugrad & Grad DFs in dictionary dfs_split
    dfs_split[f"{name}_ugrad"] = df_ugrad
    dfs_split[f"{name}_grad"] = df_grad


In [295]:
# Print name & numbers of rows/columns by each Ugrad & Grad DF
for name, df in dfs_split.items():
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} columns")

df_fall_all_ugrad: 766 rows, 28 columns
df_fall_all_grad: 439 rows, 28 columns
df_spring_all_ugrad: 793 rows, 28 columns
df_spring_all_grad: 458 rows, 28 columns
df_all_ugrad: 1559 rows, 28 columns
df_all_grad: 897 rows, 28 columns


In [296]:
# Make Jupyter Variables for our 6 new Ugrad/Grad DFs
for name, df in dfs_split.items():
    globals()[name] = df


In [297]:
#Save Ugrad/Grad DataFrames to Excel (.xlsx) (dictionary dfs_split)

for df_semester, df in dfs_split.items():
    filename = f"{df_semester}.xlsx"        # Save file name
    df.to_excel(filename, index=False)      # Save to Excel
    print(f"Saved {df_semester} to {filename}")

Saved df_fall_all_ugrad to df_fall_all_ugrad.xlsx
Saved df_fall_all_grad to df_fall_all_grad.xlsx
Saved df_spring_all_ugrad to df_spring_all_ugrad.xlsx
Saved df_spring_all_grad to df_spring_all_grad.xlsx
Saved df_all_ugrad to df_all_ugrad.xlsx
Saved df_all_grad to df_all_grad.xlsx


KDD: Print Unique Values by Columns again, now for our Ugrad/Grad combined dataframes

In [298]:
for df_semester, df in dfs_split.items():
    print(f"\n=== DataFrame: {df_semester} ===")
    for col in df.columns:
        print(f"\nColumn: {col}")
        value_counts = df[col].value_counts(dropna=False)
        print(value_counts)


=== DataFrame: df_fall_all_ugrad ===

Column: Term
Term
Fall 2024    213
Fall 2023    195
Fall 2021    188
Fall 2022    170
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    490
Seven Week - First          142
Seven Week - Second         134
Name: count, dtype: int64

Column: Session Code
Session Code
1      490
7W1    142
7W2    134
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    497
Arizona Online                  269
Name: count, dtype: int64

Column: Facility
Facility
Online                        580
Live Online                    21
R P Harvill Bldg, Rm 402       13
R P Harvill Bldg, Rm 428        8
Elec & Comp Engr, Rm 107        8
                             ... 
C E Chavez Bldg, Rm 303         1
Communication, Rm 214           1
Education, Rm 535               1
Mines & Metallurgy, Rm 213      1
Social Sciences, Rm 206         1
Name: count, Length: 73, dtype: int64

Column: Room Capacity
Room Capacity
1      605


Course Description
Database Dev And Mgmt             28
Introduction To Archives          24
Organization/Information          22
Ethical Issues in Information     20
Social Justice in Info Service    18
Intro Info Technology             16
Intro Digital Curation/Preserv    16
Intro Applied Technology          16
Data Analysis and Visualizatio    15
Data Mining/Discovery             15
Science Information               14
Found Libr+Info Services          12
Rsrch Mth/Libr+Info Prof          12
Intellectual Property/Copyrigh    12
Intro: Human Computer Interact    10
Intro to Machine Learning         10
Digital Info Mgmt Capstn           8
Data Warehousing in the Cloud      8
Mgmt for Info Professionals        8
Managing the Information Org       8
Game Development                   8
Government Information             8
Early Chlhd+Public Librs           8
Cataloging+Metadata Mgmt           8
Documnt Divrs Cult+Comms           8
Mktng Library+Info Srvcs           8
Neural Networks    

New Data Frames: In-Person vs Online 

(from dfs_cleaned: df_all, df_fall_all, df_spring_all; can revisit splitting grad/ugrad by in-person vs online if necessary)

In [299]:
# In-Person vs Online DataFrames

# Empty dictionary to store the new split (In-Person/Online) DFs
dfs_mode_split = {}

# Loop through the cleaned DataFrames in dict dfs_cleaned
for name, df in dfs_cleaned.items():
    df = df.copy()

    # Online Vs In-Person: Defined by values in "Facility" (have more options if this)
    df_online = df[df["Facility"] == "Online"].copy()
    df_inperson = df[df["Facility"] != "Online"].copy()

    # Split our original DFs into new DFs: In-Person & Online
    dfs_mode_split[f"{name}_online"] = df_online
    dfs_mode_split[f"{name}_inperson"] = df_inperson


In [300]:
# Print name & numbers of rows/columns by each In-Person/Online DF
for name, df in dfs_mode_split.items():
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} columns")

#Save In-Person/Online DataFrames to Excel (.xlsx)
for df_semester, df in dfs_mode_split.items():
    filename = f"{df_semester}.xlsx"        # Save file name
    df.to_excel(filename, index=False)      # Save to Excel
    print(f"Saved {df_semester} to {filename}")

df_fall_all_online: 922 rows, 27 columns
df_fall_all_inperson: 283 rows, 27 columns
df_spring_all_online: 987 rows, 27 columns
df_spring_all_inperson: 264 rows, 27 columns
df_all_online: 1909 rows, 27 columns
df_all_inperson: 547 rows, 27 columns
Saved df_fall_all_online to df_fall_all_online.xlsx
Saved df_fall_all_inperson to df_fall_all_inperson.xlsx
Saved df_spring_all_online to df_spring_all_online.xlsx
Saved df_spring_all_inperson to df_spring_all_inperson.xlsx
Saved df_all_online to df_all_online.xlsx
Saved df_all_inperson to df_all_inperson.xlsx


In [301]:
# Make Jupyter Variables for our 6 new In-Person/Online DataFrames
for name, df in dfs_mode_split.items():
    globals()[name] = df


In [302]:
#Save In-Person/Online DataFrames to Excel (.xlsx)

for df_semester, df in dfs_mode_split.items():
    filename = f"{df_semester}.xlsx"        # Save file name
    df.to_excel(filename, index=False)      # Save to Excel
    print(f"Saved {df_semester} to {filename}")

Saved df_fall_all_online to df_fall_all_online.xlsx
Saved df_fall_all_inperson to df_fall_all_inperson.xlsx
Saved df_spring_all_online to df_spring_all_online.xlsx
Saved df_spring_all_inperson to df_spring_all_inperson.xlsx
Saved df_all_online to df_all_online.xlsx
Saved df_all_inperson to df_all_inperson.xlsx


KDD: Print Unique Values & Counts for New In-Person/Online DFs

In [303]:
for df_semester, df in dfs_mode_split.items():
    print(f"\n=== DataFrame: {df_semester} ===")
    for col in df.columns:
        print(f"\nColumn: {col}")
        value_counts = df[col].value_counts(dropna=False)
        print(value_counts)


=== DataFrame: df_fall_all_online ===

Column: Term
Term
Fall 2023    249
Fall 2024    240
Fall 2022    222
Fall 2021    211
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    567
Seven Week - First          183
Seven Week - Second         172
Name: count, dtype: int64

Column: Session Code
Session Code
1      567
7W1    183
7W2    172
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    488
Arizona Online                  434
Name: count, dtype: int64

Column: Facility
Facility
Online    922
Name: count, dtype: int64

Column: Room Capacity
Room Capacity
1    922
Name: count, dtype: Int64

Column: Subject Code
Subject Code
ESOC    236
LIS     224
ISTA    222
INFO    178
GAME     62
Name: count, dtype: int64

Column: Catalog Number
Catalog Number
302    34
130    32
570    28
116    28
301    23
       ..
588     2
421     2
511     2
581     2
535     1
Name: count, Length: 93, dtype: int64

Column: Class Section
Class Section
10

More DFs: Grad/Ugrad by Online/In Person

In [304]:
# New dictionary to store the expanded splits
dfs_split_by_gradandmode = {}

# Loop through each ugrad/grad DataFrame
for name, df in dfs_split.items():

    # Online rows
    df_online = df[df["Facility"] == "Online"].copy()
    dfs_split_by_gradandmode[f"{name}_online"] = df_online

    # In-person rows
    df_inperson = df[df["Facility"] != "Online"].copy()
    dfs_split_by_gradandmode[f"{name}_inperson"] = df_inperson

# Print name & numbers of rows/columns by each In-Person/Online DF
for name, df in dfs_split_by_gradandmode.items():
    print(f"{name}: {df.shape[0]} rows, {df.shape[1]} columns")

# Make Jupyter Variables for our new Grad/Ugrad + In-Person/Online DFs
for name, df in dfs_split_by_gradandmode.items():
    globals()[name] = df


df_fall_all_ugrad_online: 580 rows, 28 columns
df_fall_all_ugrad_inperson: 186 rows, 28 columns
df_fall_all_grad_online: 342 rows, 28 columns
df_fall_all_grad_inperson: 97 rows, 28 columns
df_spring_all_ugrad_online: 606 rows, 28 columns
df_spring_all_ugrad_inperson: 187 rows, 28 columns
df_spring_all_grad_online: 381 rows, 28 columns
df_spring_all_grad_inperson: 77 rows, 28 columns
df_all_ugrad_online: 1186 rows, 28 columns
df_all_ugrad_inperson: 373 rows, 28 columns
df_all_grad_online: 723 rows, 28 columns
df_all_grad_inperson: 174 rows, 28 columns


In [305]:
#Save In-Person/Online DataFrames to Excel (.xlsx)

for df_semester, df in dfs_split_by_gradandmode.items():
    filename = f"{df_semester}.xlsx"        # Save file name
    df.to_excel(filename, index=False)      # Save to Excel
    print(f"Saved {df_semester} to {filename}")

Saved df_fall_all_ugrad_online to df_fall_all_ugrad_online.xlsx
Saved df_fall_all_ugrad_inperson to df_fall_all_ugrad_inperson.xlsx
Saved df_fall_all_grad_online to df_fall_all_grad_online.xlsx
Saved df_fall_all_grad_inperson to df_fall_all_grad_inperson.xlsx
Saved df_spring_all_ugrad_online to df_spring_all_ugrad_online.xlsx
Saved df_spring_all_ugrad_inperson to df_spring_all_ugrad_inperson.xlsx
Saved df_spring_all_grad_online to df_spring_all_grad_online.xlsx
Saved df_spring_all_grad_inperson to df_spring_all_grad_inperson.xlsx
Saved df_all_ugrad_online to df_all_ugrad_online.xlsx
Saved df_all_ugrad_inperson to df_all_ugrad_inperson.xlsx
Saved df_all_grad_online to df_all_grad_online.xlsx
Saved df_all_grad_inperson to df_all_grad_inperson.xlsx


KDD: Counts for new Grad/Ugrad + In-Person/Online DFs

In [306]:
for df_semester, df in dfs_split_by_gradandmode.items():
    print(f"\n=== DataFrame: {df_semester} ===")
    for col in df.columns:
        print(f"\nColumn: {col}")
        value_counts = df[col].value_counts(dropna=False)
        print(value_counts)


=== DataFrame: df_fall_all_ugrad_online ===

Column: Term
Term
Fall 2023    153
Fall 2024    147
Fall 2021    142
Fall 2022    138
Name: count, dtype: int64

Column: Session
Session
Regular Academic Session    304
Seven Week - First          142
Seven Week - Second         134
Name: count, dtype: int64

Column: Session Code
Session Code
1      304
7W1    142
7W2    134
Name: count, dtype: int64

Column: Campus
Campus
University of Arizona - Main    311
Arizona Online                  269
Name: count, dtype: int64

Column: Facility


Facility
Online    580
Name: count, dtype: int64

Column: Room Capacity
Room Capacity
1    580
Name: count, dtype: Int64

Column: Subject Code
Subject Code
ESOC    236
ISTA    222
GAME     62
LIS      46
INFO     14
Name: count, dtype: int64

Column: Catalog Number
Catalog Number
302      34
130      32
116      28
301      23
300      20
314      20
150B1    19
321      16
161      16
317      16
470      14
322      14
351      14
315      12
311      12
263      12
251      12
316      12
313      12
213      12
420      10
308      10
210      10
131      10
480      10
212      10
340      10
431       8
318       8
472       8
330       8
310       8
320       8
432       8
471       8
211       8
100       7
478       7
230       6
424       6
477       6
484       6
331       5
350       5
307       4
303       4
214       4
452       4
416       4
319       4
329       4
402       4
457       2
475       2
309       2
421       2
Name: count, dtype: int64

Column: Class Sectio