In [None]:
import pandas as pd
import numpy as np
import os
import janitor
import matplotlib.pyplot as plt
import seaborn as sns

### Cell 2: Define the Processing Function (Run this once)
#### Cleaning the data from multiple CSV files

In [None]:
def _read_and_clean(file_path):
    """Read one CSV file and apply the column-level cleaning steps."""
    df = pd.read_csv(file_path)

    # pandas labels header-less columns "Unnamed: N" (capital U), so the
    # check must be case-insensitive to actually remove them.
    unnamed = df.columns.astype(str).str.lower().str.startswith('unnamed')
    df = df.loc[:, ~unnamed]

    # Blank / whitespace-only cells become NaN *before* empty columns are
    # dropped, so columns that contain nothing but blanks are removed too.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df.dropna(axis=1, how='all')

    # Infer the best nullable dtypes once the missing values are in place.
    df = df.convert_dtypes()

    # Normalize headers: lowercase, spaces -> underscores.
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    return df


def _add_team_and_section(df):
    """Split ``learner_id`` (digits followed by a capital letter) into team/section."""
    if 'learner_id' not in df.columns:
        return df

    # Cast to string so `.str` works even if the column was read as numeric.
    learner_ids = df['learner_id'].astype('string')
    parts = learner_ids.str.extract(r'(\d+)([A-Z])')

    # zfill directly on the string column keeps missing values missing instead
    # of turning them into the literal text "<NA>" / "nan".
    df['team'] = parts[0].str.zfill(2)
    df['section'] = parts[1]
    return df


def _add_identifier(df):
    """Build a unique ``identifier`` column and move it, team and section to the front."""
    required = ('last_name', 'first_name', 'team', 'section')
    if any(col not in df.columns for col in required):
        return df

    # Placeholders keep the concatenation below from producing a missing
    # identifier when team or section could not be extracted.
    df['team'] = df['team'].astype('string').fillna('00')
    df['section'] = df['section'].astype('string').fillna('0')

    initials = (
        df['last_name'].astype('string').str[0]
        + df['first_name'].astype('string').str[0]
    )
    df['identifier'] = initials + df['team'] + df['section']

    # Resolve conflicts: every member of a duplicated identifier gets a
    # 1-based running number appended, in order of appearance.
    dup_mask = df['identifier'].notna() & df['identifier'].duplicated(keep=False)
    if dup_mask.any():
        dups = df.loc[dup_mask, 'identifier']
        suffix = (dups.groupby(dups).cumcount() + 1).astype(str)
        df.loc[dup_mask, 'identifier'] = dups + suffix

    lead = ['identifier', 'team', 'section']
    return df[lead + [col for col in df.columns if col not in lead]]


def process_files(file_paths, output_dir='cleaned_data'):
    """Clean a set of CSV files and save the results.

    For every existing path in ``file_paths`` the file is read and cleaned:
    "Unnamed" columns are removed, blank cells become missing values, empty
    columns are dropped, dtypes are inferred and headers are normalized to
    lowercase with underscores. If a ``learner_id`` column is present it is
    split into ``team`` (zero-padded to two characters) and ``section``. A
    ``username`` column is dropped. When ``last_name``, ``first_name``,
    ``team`` and ``section`` are all available, an ``identifier`` column
    (initials + team + section, de-duplicated with a numeric suffix) is added
    and placed first.

    Args:
        file_paths: Iterable of CSV paths. Paths that do not exist are
            reported with a printed message and skipped.
        output_dir: Directory the cleaned files are written to (created if
            needed). Each file keeps its base name, so an input that already
            lives in ``output_dir`` is overwritten.

    Returns:
        A list of ``(file_name, dataframe)`` tuples, one per file processed,
        in input order.
    """
    os.makedirs(output_dir, exist_ok=True)

    dataframes = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue

        df = _read_and_clean(file_path)
        df = _add_team_and_section(df)
        if 'username' in df.columns:
            df = df.drop(columns=['username'])
        df = _add_identifier(df)
        dataframes.append((os.path.basename(file_path), df))

    # Write only after every file has been read and cleaned, so a failure
    # while reading leaves the output directory untouched.
    for name, df in dataframes:
        df.to_csv(os.path.join(output_dir, name), index=False)

    return dataframes


### Cell 3: Define the Descriptive Analytics and Visualization Function (Run this once)

In [None]:
# One entry per chart: (column, plot kind, title, x label, y label, file name).
_PLOT_SPECS = (
    ('total', 'hist',
     'Distribution of Total Scores', 'Total Scores', 'Frequency',
     'distribution_of_total_scores.png'),
    ('c100f_-_final_grade_(pass/fail)', 'count',
     'Final Grade Distribution (Pass/Fail)', 'Final Grade (Pass/Fail)', 'Count',
     'final_grade_distribution.png'),
    ('c171a1_argumentative_essay', 'hist',
     'Distribution of Argumentative Essay Scores', 'Argumentative Essay Scores', 'Frequency',
     'argumentative_essay_scores_distribution.png'),
    ('c172a1_info_brief_', 'hist',
     'Distribution of Info Brief Scores', 'Info Brief Scores', 'Frequency',
     'info_brief_scores_distribution.png'),
)


def _save_plot(df, column, kind, title, xlabel, ylabel, path):
    """Draw one chart for ``column`` on its own figure and save it to ``path``."""
    # Sized like one cell of a 15x10 figure split 2x2, so the chart fills the
    # saved image instead of occupying a single corner of it.
    fig, ax = plt.subplots(figsize=(7.5, 5))
    try:
        if kind == 'count':
            sns.countplot(x=column, data=df, ax=ax)
        else:
            sns.histplot(df[column], kde=True, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        fig.savefig(path)
    finally:
        # Always release the figure; otherwise one figure per call stays open.
        plt.close(fig)


def descriptive_analytics_and_visualization(dataframes, columns_dict, output_dir='images'):
    """Print summary statistics and save distribution plots for selected columns.

    Args:
        dataframes: Iterable of ``(file_name, dataframe)`` tuples.
        columns_dict: Maps a file name to the list of columns to analyze.
            Files whose name is not a key are skipped entirely.
        output_dir: Root directory for images; each file gets a sub-directory
            named after the file name without its extension.

    Requested columns that are not present in the dataframe are reported with
    a printed message and skipped rather than raising ``KeyError``. A chart is
    saved only for the known score/grade columns that are both requested and
    present.

    Returns:
        None. Output is printed text and PNG files on disk.
    """
    for name, df in dataframes:
        if name not in columns_dict:
            continue

        requested = columns_dict[name]
        present = [col for col in requested if col in df.columns]
        missing = [col for col in requested if col not in df.columns]

        print(f"Descriptive Analytics for {name}:\n")
        if missing:
            print(f"Columns not found in {name} (skipped): {missing}")
        if present:
            # describe() cannot run on a frame without columns.
            print(df[present].describe(include='all'))
        print("\n")

        # Create the output directory if it does not exist
        image_dir = os.path.join(output_dir, os.path.splitext(name)[0])
        os.makedirs(image_dir, exist_ok=True)

        for column, kind, title, xlabel, ylabel, file_name in _PLOT_SPECS:
            if column in present:
                _save_plot(df, column, kind, title, xlabel, ylabel,
                           os.path.join(image_dir, file_name))


### Cell 4: Process and Analyze Each File

#### C100

In [None]:
# Columns of C100.csv to summarize and plot.
columns_dict = {
    'C100.csv': [
        'total',
        'c100f_-_final_grade_(pass/fail)',
        'c100a1_idp',
        'c171a1_argumentative_essay',
        'c172a1_info_brief_',
        'c400b1_diagnostic_exam_',
        'f100b1_pretest_(u.s.)_',
        'f100b1_pretest_(ims)',
        's100b1_pretest_',
    ],
}

# NOTE: the input already sits in 'cleaned_data', which is also the default
# output_dir of process_files, so the file is overwritten in place.
file_paths = ['cleaned_data/C100.csv']
processed_dataframes = process_files(file_paths)

descriptive_analytics_and_visualization(
    processed_dataframes,
    columns_dict,
)


#### C200

In [None]:
# Columns of C200.csv to summarize and plot.
# NOTE(review): this list is identical to the one used for C100.csv,
# including the C100-prefixed columns - TODO confirm they exist in C200.csv.
columns_dict = {
    'C200.csv': [
        'total',
        'c100f_-_final_grade_(pass/fail)',
        'c100a1_idp',
        'c171a1_argumentative_essay',
        'c172a1_info_brief_',
        'c400b1_diagnostic_exam_',
        'f100b1_pretest_(u.s.)_',
        'f100b1_pretest_(ims)',
        's100b1_pretest_',
    ],
}

# NOTE: the input already sits in 'cleaned_data', which is also the default
# output_dir of process_files, so the file is overwritten in place.
file_paths = ['cleaned_data/C200.csv']
processed_dataframes = process_files(file_paths)

descriptive_analytics_and_visualization(
    processed_dataframes,
    columns_dict,
)
