In [17]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score 
from glob import glob   


In [18]:
# Create a dataframe with the grading information
data = {
    "Letter Grade": ["A+", "A", "A-", "B+", "B", "C+", "C", "U"],
    "4-Point Equivalence": [4.30, 4.00, 3.67, 3.33, 3.00, 2.33, 2.00, 0.00],
    "Grading Range": ["97 - 100", "94 - 96.99", "90 - 93.99", "87 - 89.99", "80 - 86.99", "77 - 79.99", "70 - 76.99", "<70"]
}

df = pd.DataFrame(data)

# Export the dataframe to a CSV file
csv_file_path = "grading_scale.csv"
df.to_csv(csv_file_path, index=False)

csv_file_path

def load_grading_scale(file_path):
    grading_df = pd.read_csv(file_path)
    grade_map = dict(zip(grading_df['Letter Grade'], grading_df['4-Point Equivalence']))
    return grade_map

def convert_grade(value, grade_map, column_name):
    columns_to_ignore = ['section', 'team', 'unique_id']
    if column_name.lower() in columns_to_ignore:
        return value
    
    if isinstance(value, str):
        return grade_map.get(value.upper(), value)
    return value

In [19]:
import os
import pandas as pd
import numpy as np
from glob import glob

def load_grading_scale(file_path):
    grading_df = pd.read_csv(file_path)
    grade_map = dict(zip(grading_df['Letter Grade'], grading_df['4-Point Equivalence']))
    return grade_map

def convert_grade(value, grade_map, column_name):
    columns_to_ignore = ['section', 'team', 'unique_id']
    if column_name.lower() in columns_to_ignore:
        return value
    
    if isinstance(value, str):
        return grade_map.get(value.upper(), value)
    return value

def process_csv_files(base_path, output_path, grading_scale_path):
    os.makedirs(output_path, exist_ok=True)
    grade_map = load_grading_scale(grading_scale_path)
    
    subdirs = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]
    
    for subdir in subdirs:
        print(f"Processing subdirectory: {subdir}")
        csv_files = sorted(glob(os.path.join(base_path, subdir, '*.csv')))
        
        if not csv_files:
            print(f"No CSV files found in {os.path.join(base_path, subdir)}")
            continue

        # Read the first file to get the correct column names
        first_file = csv_files[0]
        first_df = pd.read_csv(first_file)
        correct_columns = first_df.columns.tolist()
        num_columns = len(correct_columns)

        subdir_dataframes = [first_df]  # Include the first dataframe with headers

        for file in csv_files[1:]:  # Start from the second file
            print(f"Processing file: {file}")
            
            # Read the CSV file without headers, skipping the first row
            df = pd.read_csv(file, header=None, skiprows=[0])
            
            # Check if the number of columns matches
            if len(df.columns) != num_columns:
                print(f"Warning: Column count mismatch in {file}. Adjusting...")
                if len(df.columns) > num_columns:
                    # If there are extra columns, drop them
                    df = df.iloc[:, :num_columns]
                else:
                    # If there are missing columns, add them with NaN values
                    for i in range(len(df.columns), num_columns):
                        df[i] = np.nan
            
            # Assign the correct column names
            df.columns = correct_columns

            subdir_dataframes.append(df)

        # Combine all dataframes for this subdirectory
        combined_df = pd.concat(subdir_dataframes, ignore_index=True)

        combined_df.dropna(how='all', inplace=True)

        if 'learner_id' not in combined_df.columns:
            print(f"Error: 'learner_id' column not found in {subdir}.")
            print("Available columns:", combined_df.columns.tolist())
            continue

        combined_df['team'] = pd.to_numeric(combined_df['learner_id'].str[:2], errors='coerce').astype('Int64')
        combined_df['section'] = combined_df['learner_id'].str[-1]

        if 'last_name' in combined_df.columns and 'first_name' in combined_df.columns:
            combined_df['base_id'] = (combined_df['last_name'].str[0].fillna('') + 
                                      combined_df['first_name'].str[0].fillna('') + 
                                      combined_df['learner_id'].fillna(''))
        else:
            print(f"Warning: 'last_name' or 'first_name' columns not found in {subdir}. Using only 'learner_id' for base_id.")
            combined_df['base_id'] = combined_df['learner_id'].fillna('')

        # Create unique_id without using apply
        combined_df['unique_id'] = combined_df.groupby('base_id').cumcount()
        combined_df['unique_id'] = combined_df.apply(lambda row: f"{row['base_id']}_{row['unique_id'] + 1}" if row['unique_id'] > 0 else row['base_id'], axis=1)

        # Case-insensitive column removal
        columns_to_remove = ['availability', 'username', 'learner_id', 'base_id', 'last_access']
        columns_to_remove_lower = [col.lower() for col in columns_to_remove]

        columns_removed = []
        for col in combined_df.columns:
            if col.lower() in columns_to_remove_lower:
                combined_df.drop(col, axis=1, inplace=True)
                columns_removed.append(col)

        if columns_removed:
            print(f"Removed columns: {', '.join(columns_removed)}")
        else:
            print("No columns were removed.")

        # Round numeric columns to 2 decimal places
        numeric_columns = combined_df.select_dtypes(include=[np.number]).columns
        combined_df[numeric_columns] = combined_df[numeric_columns].round(2)

        # Convert 'pass' to 1 and 'fail' to 0
        combined_df = combined_df.replace({'Pass': 1, 'Fail': 0})

        # Apply grading scale conversion, ignoring specified columns
        for column in combined_df.columns:
            combined_df[column] = combined_df[column].apply(lambda x: convert_grade(x, grade_map, column))

        columns = combined_df.columns.tolist()
        columns.remove('unique_id')
        columns = ['unique_id'] + columns
        combined_df = combined_df[columns]

        # Save the combined dataframe for this subdirectory
        output_file = os.path.join(output_path, f"{subdir}_combined.csv")
        combined_df.to_csv(output_file, index=False)
        print(f"Combined and processed data for {subdir} saved to: {output_file}")

    print("Finished processing all subdirectories.")

# Usage
base_path = 'data'
output_path = 'combined_data'
grading_scale_path = 'grading_scale.csv'  # Make sure this file exists
process_csv_files(base_path, output_path, grading_scale_path)

Processing subdirectory: C100
Processing file: data\C100\C100_SEC02.csv
Processing file: data\C100\C100_SEC04.csv
Processing file: data\C100\C100_SEC05.csv
Processing file: data\C100\C100_SEC06.csv
Processing file: data\C100\C100_SEC07.csv
Processing file: data\C100\C100_SEC08.csv
Processing file: data\C100\C100_SEC09.csv
Processing file: data\C100\C100_SEC10.csv
Processing file: data\C100\C100_SEC11.csv
Processing file: data\C100\C100_SEC12.csv
Processing file: data\C100\C100_SEC13.csv
Processing file: data\C100\C100_SEC14.csv
Processing file: data\C100\C100_SEC15.csv
Processing file: data\C100\C100_SEC16.csv
Processing file: data\C100\C100_SEC17.csv
Processing file: data\C100\C100_SEC18.csv
Processing file: data\C100\C100_SEC19.csv
Removed columns: Username, learner_id, last_access, base_id
Combined and processed data for C100 saved to: combined_data\C100_combined.csv
Processing subdirectory: C200
Processing file: data\C200\C200_SEC02.csv
Processing file: data\C200\C200_SEC04.csv
Pro

  combined_df = combined_df.replace({'Pass': 1, 'Fail': 0})


Combined and processed data for C400 saved to: combined_data\C400_combined.csv
Processing subdirectory: C500
Processing file: data\C500\C500_SEC02.csv
Processing file: data\C500\C500_SEC04.csv
Processing file: data\C500\C500_SEC05.csv
Processing file: data\C500\C500_SEC06.csv
Processing file: data\C500\C500_SEC07.csv
Processing file: data\C500\C500_SEC08.csv
Processing file: data\C500\C500_SEC09.csv
Processing file: data\C500\C500_SEC10.csv
Processing file: data\C500\C500_SEC11.csv
Processing file: data\C500\C500_SEC12.csv
Processing file: data\C500\C500_SEC13.csv
Processing file: data\C500\C500_SEC14.csv
Processing file: data\C500\C500_SEC15.csv
Processing file: data\C500\C500_SEC16.csv
Processing file: data\C500\C500_SEC17.csv
Processing file: data\C500\C500_SEC18.csv
Processing file: data\C500\C500_SEC19.csv
Removed columns: username, learner_id, last_access, availability, base_id
Combined and processed data for C500 saved to: combined_data\C500_combined.csv
Processing subdirectory:

  combined_df = combined_df.replace({'Pass': 1, 'Fail': 0})
  combined_df = combined_df.replace({'Pass': 1, 'Fail': 0})


Processing file: data\M000\M000_SEC08.csv
Processing file: data\M000\M000_SEC09.csv
Processing file: data\M000\M000_SEC10.csv
Processing file: data\M000\M000_SEC11.csv
Processing file: data\M000\M000_SEC13.csv
Processing file: data\M000\M000_SEC14.csv
Processing file: data\M000\M000_SEC15.csv
Processing file: data\M000\M000_SEC16.csv
Processing file: data\M000\M000_SEC17.csv
Processing file: data\M000\M000_SEC18.csv
Processing file: data\M000\M000_SEC19.csv
Removed columns: learner_id, username, last_access, availability, base_id
Combined and processed data for M000 saved to: combined_data\M000_combined.csv
Processing subdirectory: M100
Processing file: data\M100\M100_SEC02.csv
Processing file: data\M100\M100_SEC04.csv
Processing file: data\M100\M100_SEC05.csv
Processing file: data\M100\M100_SEC06.csv
Processing file: data\M100\M100_SEC07.csv
Processing file: data\M100\M100_SEC08.csv
Processing file: data\M100\M100_SEC09.csv
Processing file: data\M100\M100_SEC10.csv
Processing file: da