In [17]:
import pandas as pd
import numpy as np
import os
import janitor
import matplotlib.pyplot as plt
import seaborn as sns
import logging

In [18]:
# List of file paths
file_paths = [
    "data/C100.csv",
    "data/C200.csv",
    "data/C300.csv",
    "data/C400.csv",
    "data/C500.csv",
    "data/F100.csv",
    "data/H100.csv",
    "data/S100.csv",
    "data/M000.csv",
    "data/M100.csv",
    "data/M200.csv",
    "data/M300.csv",
    "data/M400.csv",
    'data/X102.csv'
]

In [19]:
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

### Define the Processing Function - run this once
#### Cleaning the data from multiple csv files

In [20]:
def process_files(file_paths, output_dir='cleaned_data', key_columns=None, drop_columns=None):
    if key_columns is None:
        key_columns = ['last_name', 'first_name', 'username']
    if drop_columns is None:
        drop_columns = ['username', 'availability']  # Including 'availability'

    
    dataframes = []
    
    # Read the CSV files
    for file_path in file_paths:
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            # Remove columns with names starting with "Unnamed"
            df = df.loc[:, ~df.columns.str.contains('^unnamed','^re-test')]
            dataframes.append((os.path.basename(file_path), df))
        else:
            print(f"File not found: {file_path}")

        # Drop duplicates based on key identifying columns, keep first occurrence
            df = df.drop_duplicates(subset=key_columns, keep='first')

    # Remove empty columns
    dataframes = [(name, df.dropna(axis=1, how='all')) for name, df in dataframes]

    # Correct data types
    dataframes = [(name, df.convert_dtypes()) for name, df in dataframes]

    # Replace blank cells with NaN
    dataframes = [(name, df.replace(r'^\s*$', np.nan, regex=True)) for name, df in dataframes]

    # Rename columns to lowercase and replace spaces with underscores
    for i, (name, df) in enumerate(dataframes):
        df.columns = df.columns.str.lower().str.replace(' ', '_')
        dataframes[i] = (name, df)

    # Split learner_id into team and section
    for i, (name, df) in enumerate(dataframes):
        if 'learner_id' in df.columns:
            # Extract team and section using the correct pattern
            extracted = df['learner_id'].str.extract(r'(\d+)([A-Z])')
            df.loc[:, 'team'] = extracted[0].astype(str).str.zfill(2)
            df.loc[:, 'section'] = extracted[1]
        dataframes[i] = (name, df)

    # Remove the username column if it exists
    for i, (name, df) in enumerate(dataframes):
        if 'username' in df.columns:
            df = df.drop(columns=['username'])
        dataframes[i] = (name, df)

    # Generate a unique identifier
    for i, (name, df) in enumerate(dataframes):
        if 'last_name' in df.columns and 'first_name' in df.columns and 'team' in df.columns and 'section' in df.columns:
            # Fill missing section values with a placeholder
            df['section'] = df['section'].fillna('0')
            df['identifier'] = df['last_name'].str[0] + df['first_name'].str[0] + df['team'] + df['section']
            
            # Handle potential conflicts
            counts = df['identifier'].value_counts()
            conflicts = counts[counts > 1].index

            for conflict in conflicts:
                conflict_indices = df[df['identifier'] == conflict].index
                for j, index in enumerate(conflict_indices):
                    df.loc[index, 'identifier'] = f"{conflict}{j+1}"

            # Move the identifier, team, and section to the leftmost columns
            cols = df.columns.tolist()
            new_order = ['identifier', 'team', 'section'] + [col for col in cols if col not in ['identifier', 'team', 'section']]
            df = df[new_order]

        dataframes[i] = (name, df)

    # Save the cleaned dataframes to the output directory
    for name, df in dataframes:
        output_path = os.path.join(output_dir, name)
        df.to_csv(output_path, index=False)

    return dataframes

In [21]:
# Execute the process_files function with the file_paths argument
processed_dataframes = process_files(file_paths)

# Optional: Print out the processed dataframes or perform further analysis
for name, df in processed_dataframes:
    print(f"Processed dataframe: {name}")
    print(df.head())  # Example: Display the first few rows of each processed dataframe
    print("\n")

Processed dataframe: C100.csv
  identifier team section last_name first_name learner_id      last_access  \
0      AJ01A   01       A     ALLEN     JUSTIN        01A  4/28/2024 18:40   
1      AK01A   01       A   ALLISON      KEVIN        01A  1/11/2024 13:42   
2      BA01A   01       A   BARRETT       ADAM        01A    4/5/2024 9:42   
3      CD01A   01       A   CAVERLY      DAVID        01A   5/7/2024 20:50   
4      DS01A   01       A  DEARBORN     STEVEN        01A    5/9/2024 8:54   

  availability      total final_grade   idp argumentative_essay info_brief_  \
0          Yes  463.84315        Pass  Pass                Pass        Pass   
1          Yes  453.84315        Pass  Pass                Pass        Pass   
2          Yes  443.69936        Pass  Pass                Pass        Pass   
3          Yes  461.50694        Pass  Pass                Pass        Pass   
4          Yes  504.26799        Pass  Pass                Pass        Pass   

   diagnostic_exam_ pretes

In [22]:
# List of file paths
files = [
    "cleaned_data/C100.csv",
    "cleaned_data/C200.csv",
    "cleaned_data/C300.csv",
    "cleaned_data/C400.csv",
    "cleaned_data/C500.csv",
    "cleaned_data/F100.csv",
    "cleaned_data/H100.csv",
    "cleaned_data/S100.csv",
    "cleaned_data/M000.csv",
    "cleaned_data/M100.csv",
    "cleaned_data/M200.csv",
    "cleaned_data/M300.csv",
    "cleaned_data/M400.csv"
]

# Dictionaries to hold DataFrames and column headers
dataframes = {}
column_headers = {}

# Load each file into a DataFrame, store it, and extract column headers
for file_path in files:
    key = file_path.split('/')[-1].replace('.csv', '')
    df = pd.read_csv(file_path)
    dataframes[key] = df
    column_headers[key] = df.columns.tolist()

# Print column headers for each file
for key, headers in column_headers.items():
    print(f"Column headers for {key}:")
    print(headers)
    print("\n")


Column headers for C100:
['identifier', 'team', 'section', 'last_name', 'first_name', 'learner_id', 'last_access', 'availability', 'total', 'final_grade', 'idp', 'argumentative_essay', 'info_brief_', 'diagnostic_exam_', 'pretest_(u.s.)_', 'pretest_(ims)', 'pretest_']


Column headers for C200:
['identifier', 'team', 'section', 'last_name', 'first_name', 'learner_id', 'last_access', 'availability', 'weighted_total', 'total_', 'c200p1_(c299)_contribution_to_group_learning_', 'c200p2_(c201-c206)_contribution_to_group_learning_', 'c200a1_test_2_essay_', 'c200b1_test_1_us_students_', 'c200b1_test_1_ims_students', 'unnamed:_13']


Column headers for C300:
['identifier', 'team', 'section', 'last_name', 'first_name', 'learner_id', 'last_access', 'availability', 'weighted_total_', 'total', 'c200p1_(c299)_contribution_to_group_learning_', 'c200p2_(c201-c206)_contribution_to_group_learning_', 'c200a1_test_2_essay_[total_pts:_100_score]_|410224', 'c200b1_test_1_us_students_', 'c200b1_test_1_ims_st

### Cell 3: Define the Descriptive Analytics and Visualization Function (Run this once)

In [23]:
def descriptive_analytics_and_visualization(dataframes, columns_dict, output_dir='images'):
    # Ensure necessary directories exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    for name, df in dataframes:
        if name in columns_dict:
            columns = columns_dict[name]
            logging.info(f"Processing descriptive analytics for {name}")

            # Create the output directory for this specific DataFrame's visualizations
            image_dir = os.path.join(output_dir, name.split('.')[0])
            if not os.path.exists(image_dir):
                os.makedirs(image_dir, exist_ok=True)
                logging.info(f"Created image directory: {image_dir}")
            else:
                logging.info(f"Image directory already exists: {image_dir}")

            plt.figure(figsize=(15, 10))


### Cell 4: Process and Analyze Each File

#### C100

In [24]:
file_paths = ['cleaned_data/C100.csv']
processed_dataframes = process_files(file_paths)

columns_dict = {
    'C100.csv': ['total', 
                'c100f_-_final_grade_(pass/fail)',
                'c100a1_idp', 'c171a1_argumentative_essay',
                'c172a1_info_brief_', 'c400b1_diagnostic_exam_',
                'f100b1_pretest_(u.s.)_', 'f100b1_pretest_(ims)', 's100b1_pretest_']
}

descriptive_analytics_and_visualization(processed_dataframes, columns_dict)

TypeError: can only concatenate str (not "int") to str

#### C200

### C300

### C400