In [21]:
import pandas as pd
import numpy as np
import os
import janitor

In [22]:
def _standardize_frame(df):
    """Drop junk columns, normalise blanks and dtypes, and snake_case the headers."""
    # pandas labels header-less columns "Unnamed: N" (capital U), so the match
    # has to be case-insensitive to actually remove them.
    column_names = df.columns.astype(str)
    is_unnamed = column_names.str.contains('^unnamed', case=False, regex=True)
    df = df.loc[:, ~is_unnamed]

    # Blank -> NaN must run *before* the empty-column check; otherwise a column
    # holding only whitespace is not recognised as empty and survives.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df.dropna(axis=1, how='all')

    # Infer nullable dtypes once the blanks are gone.
    df = df.convert_dtypes()

    # Lowercase the headers and replace spaces with underscores.
    df.columns = df.columns.astype(str).str.lower().str.replace(' ', '_', regex=False)
    return df


def _split_learner_id(df):
    """Add 'team' (zero-padded digits) and 'section' (capital letter) from 'learner_id'."""
    if 'learner_id' not in df.columns:
        return df
    # Cast to the nullable string dtype so `.str` also works if the ids were
    # inferred as numbers; rows that do not match the pattern stay missing
    # instead of becoming literal text.
    parts = df['learner_id'].astype('string').str.extract(r'(\d+)([A-Z])')
    # assign() returns a new frame, avoiding writes into a possible slice.
    return df.assign(team=parts[0].str.zfill(2), section=parts[1])


def _add_identifier(df):
    """Build the 'identifier' column and move identifier/team/section to the front."""
    required = {'last_name', 'first_name', 'team', 'section'}
    if not required.issubset(df.columns):
        return df

    # Fill missing section values with a placeholder
    df = df.assign(section=df['section'].fillna('0'))

    # A missing team uses the placeholder '00' inside the identifier only; the
    # 'team' column itself keeps the missing value.
    identifier = (
        df['last_name'].str[0]
        + df['first_name'].str[0]
        + df['team'].fillna('00')
        + df['section']
    )

    # Handle potential conflicts: every member of a duplicated identifier gets
    # a 1-based running number appended, in row order.
    clash = identifier.notna() & identifier.duplicated(keep=False)
    if clash.any():
        clashing = identifier[clash]
        position = clashing.groupby(clashing).cumcount() + 1
        identifier = identifier.copy()
        identifier.loc[clash] = clashing + position.astype('string')

    df = df.assign(identifier=identifier)

    # Move the identifier, team, and section to the leftmost columns
    leading = ['identifier', 'team', 'section']
    return df[leading + [col for col in df.columns if col not in leading]]


def process_files(file_paths, output_dir='cleaned_data', verbose=True):
    """Clean a set of CSV files and write the cleaned copies to ``output_dir``.

    For every path in ``file_paths`` that exists, the file is read with
    ``pd.read_csv`` and then:

    1. columns whose name starts with "Unnamed" (any case) are removed;
    2. blank / whitespace-only cells become NaN and all-empty columns are dropped;
    3. dtypes are inferred with ``DataFrame.convert_dtypes``;
    4. column names are lowercased and spaces replaced with underscores;
    5. ``learner_id`` (if present) is split into ``team`` and ``section``;
    6. the ``username`` column (if present) is dropped;
    7. if ``last_name``, ``first_name``, ``team`` and ``section`` all exist, an
       ``identifier`` column is built from the two initials + team + section,
       duplicates are numbered, and identifier/team/section are moved to the front.

    Each cleaned frame is saved under the *same basename as the file it was
    read from*. Two inputs sharing a basename therefore overwrite each other.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the CSV files to process. Missing files are reported with a
        "File not found" message and skipped.
    output_dir : str, default 'cleaned_data'
        Directory that receives the cleaned CSVs; created if necessary.
    verbose : bool, default True
        When True, print the first rows of the extracted team/section columns
        for each file that has a ``learner_id`` column.

    Returns
    -------
    list of pandas.DataFrame
        The cleaned frames, in input order, for the files that were found.
    """
    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    dataframes = []

    # Each file is handled start-to-finish while its own path is in hand, so the
    # debug message and the output filename can never drift to a different file
    # when an earlier path is missing.
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        df = _standardize_frame(pd.read_csv(file_path))

        df = _split_learner_id(df)
        if verbose and 'learner_id' in df.columns:
            print(f"Team and Section for file {file_path}:\n", df[['team', 'section']].head())

        # Remove the username column if it exists
        df = df.drop(columns=['username'], errors='ignore')

        df = _add_identifier(df)

        # Save the cleaned dataframe to the output directory
        output_path = os.path.join(output_dir, os.path.basename(file_path))
        df.to_csv(output_path, index=False)

        dataframes.append(df)

    return dataframes


In [23]:
# CSV inputs for process_files: every file sits in data/ and is named <stem>.csv.
file_paths = [
    f'data/{stem}.csv'
    for stem in (
        'C100', 'C200', 'C300', 'C400', 'C500',
        'F100',
        'H100',
        'L100',
        'M000', 'M100', 'M200', 'M300', 'M400',
        'S100',
    )
]


In [24]:
# Clean every CSV listed in file_paths; cleaned copies are written to the
# function's default output directory ('cleaned_data') and the cleaned frames
# are kept in memory for inspection.
processed_dataframes = process_files(file_paths)


Team and Section for file data/C100.csv:
   team section
0   01       A
1   01       A
2   01       A
3   01       A
4   01       A
Team and Section for file data/C200.csv:
   team section
0   01       A
1   01       A
2   01       A
3   01       A
4   01       A
Team and Section for file data/C300.csv:
   team section
0   01       A
1   01       A
2   01       A
3   01       A
4   01       A
Team and Section for file data/C400.csv:
   team section
0   01       A
1   01       A
2   01       A
3   01       A
4   01       A
Team and Section for file data/C500.csv:
   team section
0   01       A
1   01       A
2   01       A
3   01       A
4   01       A
Team and Section for file data/F100.csv:
   team section
0   01       A
1   01       A
2   01       A
3   01       A
4   01       A
Team and Section for file data/H100.csv:
   team section
0   01       A
1   01       A
2   01       A
3   01       A
4   01       A
Team and Section for file data/L100.csv:
   team section
0   01       A
1   

In [25]:
# Display the first few rows of the first processed DataFrame to check the results.
# Index 0 is the first file that was actually found, since missing paths are skipped.
processed_dataframes[0].head()

Unnamed: 0,identifier,team,section,last_name,first_name,learner_id,last_access,availability,total,c100f_-_final_grade_(pass/fail),c100a1_idp,c171a1_argumentative_essay,c172a1_info_brief_,c400b1_diagnostic_exam_,f100b1_pretest_(u.s.)_,f100b1_pretest_(ims),s100b1_pretest_
0,AJ01A,1,A,ALLEN,JUSTIN,01A,4/28/2024 18:40,Yes,463.84315,Pass,Pass,Pass,Pass,80,Fail,,Fail
1,AK01A,1,A,ALLISON,KEVIN,01A,1/11/2024 13:42,Yes,453.84315,Pass,Pass,Pass,Pass,58,Fail,,Fail
2,BA01A,1,A,BARRETT,ADAM,01A,4/5/2024 9:42,Yes,443.69936,Pass,Pass,Pass,Pass,62,Fail,,Fail
3,CD01A,1,A,CAVERLY,DAVID,01A,5/7/2024 20:50,Yes,461.50694,Pass,Pass,Pass,Pass,76,Fail,,Fail
4,DS01A,1,A,DEARBORN,STEVEN,01A,5/9/2024 8:54,Yes,504.26799,Pass,Pass,Pass,Pass,84,Fail,,Fail
