In [1]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score 
from glob import glob   


In [2]:
# Create a dataframe with the grading information
data = {
    "Letter Grade": ["A+", "A", "A-", "B+", "B", "C+", "C", "U"],
    "4-Point Equivalence": [4.30, 4.00, 3.67, 3.33, 3.00, 2.33, 2.00, 0.00],
    "Grading Range": ["97 - 100", "94 - 96.99", "90 - 93.99", "87 - 89.99", "80 - 86.99", "77 - 79.99", "70 - 76.99", "<70"]
}

df = pd.DataFrame(data)

# Export the dataframe to a CSV file
csv_file_path = "grading_scale.csv"
df.to_csv(csv_file_path, index=False)

csv_file_path

'grading_scale.csv'

In [3]:
def process_csv_files(base_path, output_path):
    os.makedirs(output_path, exist_ok=True)
    subdirs = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]
    
    for subdir in subdirs:
        # Get all CSV files in the current subdirectory
        csv_files = glob(os.path.join(base_path, subdir, '*.csv'))
        
        if not csv_files:
            print(f"No CSV files found in {os.path.join(base_path, subdir)}")
            continue

        # Initialize an empty list to store DataFrames for this subdirectory
        subdir_dataframes = []

        # Read the first CSV file to get the correct column headers
        first_file = csv_files[0]
        initial_df = pd.read_csv(first_file)
        correct_columns = initial_df.columns
        subdir_dataframes.append(initial_df)
        print(f"Read {first_file} successfully with {len(initial_df)} rows.")

        # Process the rest of the files
        for file in csv_files[1:]:
            try:
                df = pd.read_csv(file, header=None, names=correct_columns) # Read the file with the correct columns
                df.columns = correct_columns
                
                # Check if the columns match
                if list(df.columns) != list(correct_columns):
                    print(f"Warning: Columns in {file} do not match the expected columns. Skipping this file.")
                    continue
                
                subdir_dataframes.append(df)
                print(f"Read {file} successfully with {len(df)} rows.")
            except Exception as e:
                print(f"Error reading {file}: {str(e)}")

        # Combine all DataFrames for this subdirectory
        combined_df = pd.concat(subdir_dataframes, ignore_index=True)
        
        # Save the combined DataFrame for this subdirectory directly in the output_path
        output_file = os.path.join(output_path, f"{subdir}_combined.csv")
        combined_df.to_csv(output_file, index=False)
        print(f"Processed all CSV files in {subdir}. Total rows: {len(combined_df)}. Saved to {output_file}")

    print("Finished processing all subdirectories.")

# Usage
base_path = 'data'
output_path = 'combined_data'
process_csv_files(base_path, output_path)

Read data\C100\C100_SEC01.csv successfully with 59 rows.
Read data\C100\C100_SEC02.csv successfully with 60 rows.
Read data\C100\C100_SEC04.csv successfully with 60 rows.
Read data\C100\C100_SEC05.csv successfully with 60 rows.
Read data\C100\C100_SEC06.csv successfully with 61 rows.
Read data\C100\C100_SEC07.csv successfully with 59 rows.
Read data\C100\C100_SEC08.csv successfully with 60 rows.
Read data\C100\C100_SEC09.csv successfully with 60 rows.
Read data\C100\C100_SEC10.csv successfully with 59 rows.
Read data\C100\C100_SEC11.csv successfully with 60 rows.
Read data\C100\C100_SEC12.csv successfully with 59 rows.
Read data\C100\C100_SEC13.csv successfully with 60 rows.
Read data\C100\C100_SEC14.csv successfully with 60 rows.
Read data\C100\C100_SEC15.csv successfully with 61 rows.
Read data\C100\C100_SEC16.csv successfully with 60 rows.
Read data\C100\C100_SEC17.csv successfully with 60 rows.
Read data\C100\C100_SEC18.csv successfully with 60 rows.
Read data\C100\C100_SEC19.csv s

In [4]:
def load_grading_scale(file_path):
    grading_df = pd.read_csv(file_path)
    grade_map = dict(zip(grading_df['Letter Grade'], grading_df['4-Point Equivalence']))
    return grade_map

def convert_grade(value, grade_map):
    if isinstance(value, str):
        return grade_map.get(value.upper(), value)
    return value

def clean_and_process_data(input_path, output_path, grading_scale_path):
    os.makedirs(output_path, exist_ok=True)
    grade_map = load_grading_scale(grading_scale_path)
    
    csv_files = glob(os.path.join(input_path, '*_combined.csv')) # Using glob function directly

    for file in csv_files:
        print(f"Processing file: {file}")
        
        df = pd.read_csv(file)
        df.dropna(how='all', inplace=True)

        if 'learner_id' not in df.columns:
            print(f"Error: 'learner_id' column not found in {file}.")
            print("Available columns:", df.columns.tolist())
            continue

        df['team'] = pd.to_numeric(df['learner_id'].str[:2], errors='coerce').astype('Int64')
        df['section'] = df['learner_id'].str[-1]

        if 'last_name' in df.columns and 'first_name' in df.columns:
            df['base_id'] = (df['last_name'].str[0].fillna('') + 
                             df['first_name'].str[0].fillna('') + 
                             df['learner_id'].fillna(''))
        else:
            print(f"Warning: 'last_name' or 'first_name' columns not found in {file}. Using only 'learner_id' for base_id.")
            df['base_id'] = df['learner_id'].fillna('')

        def create_unique_id(group):
            if len(group) == 1:
                return group['base_id']
            else:
                return group['base_id'] + '_' + (group.groupby('base_id').cumcount() + 1).astype(str)

        df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)

        columns_to_remove = ['availability', 'username', 'Username', 'learner_id', 'base_id']
        for col in columns_to_remove:
            if col in df.columns:
                df.drop(col, axis=1, inplace=True)
            else:
                print(f"Warning: Column '{col}' not found in {file}. Skipping removal.")

        # Round numeric columns to 2 decimal places
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        df[numeric_columns] = df[numeric_columns].round(2)

        # Convert 'pass' to 1 and 'fail' to 0
        df = df.replace({'Pass': 1, 'Fail': 0})

        # Apply grading scale conversion
        for column in df.columns:
            df[column] = df[column].apply(lambda x: convert_grade(x, grade_map))

        columns = df.columns.tolist()
        columns.remove('unique_id')
        columns = ['unique_id'] + columns
        df = df[columns]

        output_file = os.path.join(output_path, f"cleaned_{os.path.basename(file)}")
        df.to_csv(output_file, index=False)
        print(f"Cleaned and processed data saved to: {output_file}")

    print("Finished processing all files.")

# Usage
input_path = 'combined_data'
output_path = 'cleaned_data'
grading_scale_path = 'grading_scale.csv'
clean_and_process_data(input_path, output_path, grading_scale_path)

Processing file: combined_data\C100_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_C100_combined.csv
Processing file: combined_data\C200_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_C200_combined.csv
Processing file: combined_data\C300_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_C300_combined.csv
Processing file: combined_data\C400_combined.csv


  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)


Cleaned and processed data saved to: cleaned_data\cleaned_C400_combined.csv
Processing file: combined_data\C500_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_C500_combined.csv
Processing file: combined_data\F100_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_F100_combined.csv
Processing file: combined_data\H100_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_H100_combined.csv
Processing file: combined_data\H400_combined.csv


  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)


Cleaned and processed data saved to: cleaned_data\cleaned_H400_combined.csv
Processing file: combined_data\L100_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_L100_combined.csv
Processing file: combined_data\L400_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_L400_combined.csv
Processing file: combined_data\M000_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_M000_combined.csv
Processing file: combined_data\M100_combined.csv


  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)


Cleaned and processed data saved to: cleaned_data\cleaned_M100_combined.csv
Processing file: combined_data\M200_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_M200_combined.csv
Processing file: combined_data\M300_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_M300_combined.csv
Processing file: combined_data\M400_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_M400_combined.csv
Processing file: combined_data\S100_combined.csv
Cleaned and processed data saved to: cleaned_data\cleaned_S100_combined.csv
Processing file: combined_data\X100_combined.csv
Error: 'learner_id' column not found in combined_data\X100_combined.csv.
Available columns: ['Last Name', 'First Name', 'Learner ID', 'Username', 'Last Access', 'Availability', 'X102 Board Learner Acknowledgement of Recording Privacy Act Satement [Total Pts: 100 Complete/Incomplete] |402665', 'X100A Oral Board [Total Pts: 100 Score] |402662', 'X100B Online Exam [Total Pts:

  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
  df['unique_id'] = df.groupby('base_id', group_keys=False).apply(create_unique_id)
