In [1]:
import pandas as pd
import numpy as np
import os
import janitor
import matplotlib.pyplot as plt
import seaborn as sns
import logging

In [2]:
# Read in multiple CSV files
csv_files_c300 = [
    'bb_data/C300_Analytics/C300_SEC01.csv',
    'bb_data/C300_Analytics/C300_SEC02.csv',
    'bb_data/C300_Analytics/C300_SEC04.csv',
    'bb_data/C300_Analytics/C300_SEC05.csv',
    'bb_data/C300_Analytics/C300_SEC06.csv',
    'bb_data/C300_Analytics/C300_SEC07.csv',
    'bb_data/C300_Analytics/C300_SEC08.csv',
    'bb_data/C300_Analytics/C300_SEC09.csv',
    'bb_data/C300_Analytics/C300_SEC10.csv',
    'bb_data/C300_Analytics/C300_SEC11.csv',
    'bb_data/C300_Analytics/C300_SEC12.csv',
    'bb_data/C300_Analytics/C300_SEC13.csv',
    'bb_data/C300_Analytics/C300_SEC14.csv',
    'bb_data/C300_Analytics/C300_SEC15.csv',
    'bb_data/C300_Analytics/C300_SEC16.csv',
    'bb_data/C300_Analytics/C300_SEC17.csv',
    'bb_data/C300_Analytics/C300_SEC18.csv',
    'bb_data/C300_Analytics/C300_SEC19.csv'
]

csv_files_c500 = [
    'bb_data/C500_Analytics/C500_SEC01.csv',
    'bb_data/C500_Analytics/C500_SEC02.csv',
    'bb_data/C500_Analytics/C500_SEC04.csv',
    'bb_data/C500_Analytics/C500_SEC05.csv',
    'bb_data/C500_Analytics/C500_SEC06.csv',
    'bb_data/C500_Analytics/C500_SEC07.csv',
    'bb_data/C500_Analytics/C500_SEC08.csv',
    'bb_data/C500_Analytics/C500_SEC09.csv',
    'bb_data/C500_Analytics/C500_SEC10.csv',
    'bb_data/C500_Analytics/C500_SEC11.csv',
    'bb_data/C500_Analytics/C500_SEC12.csv',
    'bb_data/C500_Analytics/C500_SEC13.csv',
    'bb_data/C500_Analytics/C500_SEC14.csv',
    'bb_data/C500_Analytics/C500_SEC15.csv',
    'bb_data/C500_Analytics/C500_SEC16.csv',
    'bb_data/C500_Analytics/C500_SEC17.csv',
    'bb_data/C500_Analytics/C500_SEC18.csv',
    'bb_data/C500_Analytics/C500_SEC19.csv'
]

def read_and_concat_csv_files(csv_files, output_file):
    # Initialize an empty list to store DataFrames
    dataframes = []

    # Read the first CSV file to get the column headers
    initial_df = pd.read_csv(csv_files[0])
    initial_columns = initial_df.columns

    # Loop through the list of files
    for file in csv_files:
        # Read the current CSV file into a DataFrame, ensuring it matches the initial columns
        df = pd.read_csv(file, usecols=lambda column: column in initial_columns).reindex(columns=initial_columns)

        # Drop rows where all cells are blank
        df.dropna(how='all', inplace=True)

        # Append the DataFrame to the list
        dataframes.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    compiled_df = pd.concat(dataframes, ignore_index=True)

    # Save the compiled DataFrame
    compiled_df.to_csv(output_file, index=False)

# Read and concatenate C300 and C500 CSV files
read_and_concat_csv_files(csv_files_c300, 'data/C300.csv')
read_and_concat_csv_files(csv_files_c500, 'data/C500.csv')


In [3]:
# List of file paths
file_paths = [
    "data/C100.csv",
    "data/C200.csv",
    "data/C300.csv",
    "data/C400.csv",
    "data/C500.csv",
    "data/F100.csv",
    "data/H100.csv",
    "data/S100.csv",
    "data/M000.csv",
    "data/M100.csv",
    "data/M200.csv",
    "data/M300.csv",
    "data/M400.csv",
    'data/X102.csv'
]

In [4]:
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

### Define the Processing Function - run this once
#### Cleaning the data from multiple csv files

In [5]:
# %%
def process_files(file_paths, output_dir='cleaned_data', key_columns=None, drop_columns=None):
    if key_columns is None:
        key_columns = ['last_name', 'first_name', 'username']
    if drop_columns is None:
        drop_columns = ['username', 'availability']  # Including 'availability'

    dataframes = []

    # Read the CSV files
    for file_path in file_paths:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            # Remove columns with names starting with "Unnamed"
            df = df.loc[:, ~df.columns.str.contains('^unnamed', case=False)]
            
            # Strip whitespace from column names
            df.columns = df.columns.str.strip()

            dataframes.append((os.path.basename(file_path), df))
        else:
            print(f"File not found: {file_path}")

    # Drop duplicates based on key identifying columns, keep first occurrence
    dataframes = [(name, df.drop_duplicates(subset=key_columns, keep='first')) for name, df in dataframes]

    # Remove empty columns
    dataframes = [(name, df.dropna(axis=1, how='all')) for name, df in dataframes]

    # Correct data types
    dataframes = [(name, df.convert_dtypes()) for name, df in dataframes]

    # Replace blank cells with NaN
    dataframes = [(name, df.replace(r'^\s*$', np.nan, regex=True)) for name, df in dataframes]

    # Rename columns to lowercase and replace spaces with underscores
    for i, (name, df) in enumerate(dataframes):
        df.columns = df.columns.str.lower().str.replace(' ', '_')
        dataframes[i] = (name, df)

    # Split learner_id into team and section
    for i, (name, df) in enumerate(dataframes):
        if 'learner_id' in df.columns:
            # Extract team and section using the correct pattern
            extracted = df['learner_id'].str.extract(r'(\d+)([A-Z])')
            df.loc[:, 'team'] = extracted[0].astype(str).str.zfill(2)
            df.loc[:, 'section'] = extracted[1]
        dataframes[i] = (name, df)

    # Remove the username column if it exists
    for i, (name, df) in enumerate(dataframes):
        if 'username' in df.columns:
            df = df.drop(columns=['username'])
        dataframes[i] = (name, df)

    # Generate a unique identifier
    for i, (name, df) in enumerate(dataframes):
        if 'last_name' in df.columns and 'first_name' in df.columns and 'team' in df.columns and 'section' in df.columns:
            # Fill missing section values with a placeholder
            df['section'] = df['section'].fillna('0')
            df['identifier'] = df['last_name'].str[0] + df['first_name'].str[0] + df['team'] + df['section']

            # Handle potential conflicts
            counts = df['identifier'].value_counts()
            conflicts = counts[counts > 1].index

            for conflict in conflicts:
                conflict_indices = df[df['identifier'] == conflict].index
                for j, index in enumerate(conflict_indices):
                    df.loc[index, 'identifier'] = f"{conflict}{j+1}"

            # Move the identifier, team, and section to the leftmost columns
            cols = df.columns.tolist()
            new_order = ['identifier', 'team', 'section'] + [col for col in cols if col not in ['identifier', 'team', 'section']]
            df = df[new_order]

        dataframes[i] = (name, df)

    # Save the cleaned dataframes to the output directory
    for name, df in dataframes:
        output_path = os.path.join(output_dir, name)
        df.to_csv(output_path, index=False)

    return dataframes


In [6]:
# Execute the process_files function with the file_paths argument
processed_dataframes = process_files(file_paths)

# Optional: Print out the processed dataframes or perform further analysis
for name, df in processed_dataframes:
    print(f"Processed dataframe: {name}")
    print(df.head())  # Example: Display the first few rows of each processed dataframe
    

KeyError: Index(['first_name', 'username', 'last_name'], dtype='object')

In [None]:
# List of file paths
files = [
    "cleaned_data/C100.csv",
    "cleaned_data/C200.csv",
    "cleaned_data/C300.csv",
    "cleaned_data/C400.csv",
    "cleaned_data/C500.csv",
    "cleaned_data/F100.csv",
    "cleaned_data/H100.csv",
    "cleaned_data/S100.csv",
    "cleaned_data/M000.csv",
    "cleaned_data/M100.csv",
    "cleaned_data/M200.csv",
    "cleaned_data/M300.csv",
    "cleaned_data/M400.csv"
]

# Create 'compiled_reports' directory if it doesn't exist
if not os.path.exists('compiled_reports'):
    os.makedirs('compiled_reports')

# Dictionaries to hold DataFrames and column headers
dataframes = {}
column_headers = {}

# Load each file into a DataFrame, store it, and extract column headers
for file_path in files:
    key = file_path.split('/')[-1].replace('.csv', '')
    df = pd.read_csv(file_path)
    dataframes[key] = df
    column_headers[key] = df.columns.tolist()

# Print column headers for each file
for key, headers in column_headers.items():
    print(f"Column headers for {key}:")
    print(headers)
    print("\n")


#### Student report ( use the identifier)

### Step 2: Define the list of unique identifiers you want to extract

In [None]:
# Replace these example identifiers with your actual ones
unique_identifiers_to_extract = ['KR09A']

# Initialize an empty list to hold dataframes
dfs = []


### Step 3: Read each CSV file and extract rows with specified unique identifiers

In [None]:
for file in csv_files:
    df = pd.read_csv(file)
    for identifier in unique_identifiers_to_extract:
        # Extract the row(s) with the current identifier if it exists
        if identifier in df['identifier'].values:
            row = df[df['identifier'] == identifier]
            dfs.append(row)


### Step 4: Compile the rows into a single DataFrame

In [None]:
# Compile the rows into a single DataFrame
compiled_df = pd.concat(dfs, ignore_index=True)

### Step 5: Save the compiled report to a new CSV file

In [None]:
compiled_df.to_csv('compiled_reports/KR09A_compiled_report.csv', index=False)

# Display the compiled DataFrame
compiled_df.head()
