In [7]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score 
import traceback

In [8]:
def normalize_column_name(col):
    # Convert to lowercase
    col = col.lower()
    
    # Remove any content within square brackets
    col = re.sub(r'\[.*?\]', '', col)
    
    # Replace spaces and hyphens with underscores
    col = re.sub(r'[\s-]', '_', col)
    
    # Remove any non-alphanumeric characters (except underscores)
    col = re.sub(r'[^a-z0-9_]', '', col)
    
    # Remove duplicate underscores
    col = re.sub(r'_+', '_', col)
    
    # Remove leading/trailing underscores
    col = col.strip('_')
    
    # Correct common misspellings
    col = col.replace('availibility', 'availability')
    
    return col

def process_folder(folder_path):
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    if not csv_files:
        print(f"No CSV files found in {folder_path}")
        return None

    dataframes = []
    initial_df = pd.read_csv(os.path.join(folder_path, csv_files[0]))
    correct_columns = normalize_column_names(initial_df.columns)
    initial_df.columns = correct_columns

    dataframes.append(initial_df)
    print(f"Read {csv_files[0]} successfully with {len(initial_df)} rows.")

    for file in csv_files[1:]:
        try:
            df = pd.read_csv(os.path.join(folder_path, file))
            df.columns = normalize_column_names(df.columns)
            
            if set(df.columns) != set(correct_columns):
                print(f"Warning: Columns in {file} do not match the initial file. Attempting to align columns.")
                for col in correct_columns:
                    if col not in df.columns:
                        df[col] = None
                df = df[correct_columns]
            
            print(f"Read {file} successfully with {len(df)} rows.")
            df.dropna(how='all', inplace=True)
            dataframes.append(df)
            
        except Exception as e:
            print(f"Failed to read {file}: {e}")

    if dataframes:
        folder_df = pd.concat(dataframes, ignore_index=True)
        print(f"Columns in the concatenated DataFrame: {folder_df.columns.tolist()}")
        
        learner_id_column = next((col for col in folder_df.columns if 'learner_id' in col.lower()), None)
        if learner_id_column:
            folder_df['learner_id'] = folder_df[learner_id_column]
        else:
            print("Error: No 'learner_id' column found. Using index as learner_id.")
            folder_df['learner_id'] = folder_df.index.astype(str)

        folder_df['team'] = pd.to_numeric(folder_df['learner_id'].str[:2], errors='coerce').astype('Int64')
        folder_df['section'] = folder_df['learner_id'].str[-1]
        
        # Create base_id using first letter of last name, first letter of first name, and learner_id
        folder_df['base_id'] = (folder_df['last_name'].str[0].fillna('') + 
                                folder_df['first_name'].str[0].fillna('') + 
                                folder_df['learner_id'])
        
        def create_unique_id(group):
            if len(group) == 1:
                return group['base_id']
            else:
                return group['base_id'] + '_' + (group.groupby('base_id').cumcount() + 1).astype(str)
        
        folder_df['unique_id'] = folder_df.groupby('base_id', group_keys=False).apply(create_unique_id)
        
        columns = folder_df.columns.tolist()
        columns.remove('unique_id')
        columns = ['unique_id'] + columns
        folder_df = folder_df[columns]
        
        return folder_df
    else:
        print(f"No dataframes to concatenate in {folder_path}.")
        return None

In [9]:
# Main script
data_root = os.path.normpath(r'data/data')  # This should work for both Windows and Unix-like systems
output_dir = 'cleaned_data'
os.makedirs(output_dir, exist_ok=True)

expected_folders = [
    'C100', 'C200', 'C300', 'C400', 'C500', 
    'F100', 'S100', 'H100', 'H400', 'L100', 'L400', 
    'M000', 'M100', 'M200', 'M300', 'M400'
]

data_folders = [f.path for f in os.scandir(data_root) if f.is_dir()]
print(f"Total folders found: {len(data_folders)}")
print("Folders to be processed:")
for folder in data_folders:
    print(f"- {folder}")

processed_folders = 0
processed_folder_names = []

for folder in data_folders:
    folder_name = os.path.basename(folder)
    if folder_name not in expected_folders:
        print(f"\nSkipping unexpected folder: {folder}")
        continue
    
    print(f"\nProcessing folder: {folder}")
    try:
        result_df = process_folder(folder)
        
        if result_df is not None:
            output_file = os.path.join(output_dir, f'{folder_name}_compiled_dataframe.csv')
            result_df.to_csv(output_file, index=False)
            print(f"Data compiled and saved successfully to {output_file}")
            print(f"Total rows in compiled dataframe: {len(result_df)}")
            
            print("\nSummary of rows per section:")
            print(result_df['section'].value_counts().sort_index())
            
            print("\nSample data with reordered columns:")
            print(result_df.head(10))
            
            processed_folders += 1
            processed_folder_names.append(folder_name)
        else:
            print(f"No data processed for {folder}")
    except Exception as e:
        print(f"Error processing folder {folder}:")
        print(traceback.format_exc())

print(f"\nTotal folders processed: {processed_folders}")
print(f"Expected folders: {len(expected_folders)}")
print(f"Missing folders: {len(expected_folders) - processed_folders}")

print("\nProcessed folders:")
for folder in processed_folder_names:
    print(f"- {folder}")

print("\nMissing folders:")
for folder in expected_folders:
    if folder not in processed_folder_names:
        print(f"- {folder}")

print("\nAll available folders processed.")

# Check the contents of the cleaned_data folder
print("\nContents of the cleaned_data folder:")
for file in os.listdir(output_dir):
    print(f"- {file}")

Total folders found: 17
Folders to be processed:
- data/data\C100
- data/data\C200
- data/data\C300
- data/data\C400
- data/data\C500
- data/data\F100
- data/data\H100
- data/data\H400
- data/data\L100
- data/data\L400
- data/data\M000
- data/data\M100
- data/data\M200
- data/data\M300
- data/data\M400
- data/data\S100
- data/data\X100

Processing folder: data/data\C100
Error processing folder data/data\C100:
Traceback (most recent call last):
  File "C:\Users\balla\AppData\Local\Temp\ipykernel_7692\16842997.py", line 29, in <module>
    result_df = process_folder(folder)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\balla\AppData\Local\Temp\ipykernel_7692\4107296010.py", line 34, in process_folder
    correct_columns = normalize_column_names(initial_df.columns)
                      ^^^^^^^^^^^^^^^^^^^^^^
NameError: name 'normalize_column_names' is not defined. Did you mean: 'normalize_column_name'?


Processing folder: data/data\C200
Error processing folder data/data\C200:


In [10]:
print("\n\nReport on files in the cleaned_data folder:")
print("=" * 50)

for file in os.listdir(output_dir):
    if file.endswith('.csv'):
        file_path = os.path.join(output_dir, file)
        try:
            df = pd.read_csv(file_path)
            print(f"\nFile: {file}")
            print(f"Number of rows: {len(df)}")
            print(f"Number of columns: {len(df.columns)}")
            print("Columns:")
            for col in df.columns:
                print(f"  - {col}")
        except Exception as e:
            print(f"\nError reading file {file}:")
            print(traceback.format_exc())

print("\nEnd of report")
print("=" * 50)



Report on files in the cleaned_data folder:

File: c100_compiled_dataframe.csv
Number of rows: 1061
Number of columns: 19
Columns:
  - unique_id
  - last_name
  - first_name
  - username
  - learner_id
  - last_access
  - availibility
  - total
  - c100_final
  - c100__idp
  - c170_essay
  - c172_brief
  - c400_diag_exam
  - f100_pretest_us
  - f100_pretest_ims
  - s100_pretest
  - team
  - section
  - base_id

File: c200_compiled_dataframe.csv
Number of rows: 1061
Number of columns: 17
Columns:
  - unique_id
  - last_name
  - first_name
  - username
  - learner_id
  - last_access
  - availability
  - c200_wt_ttl
  - c200_ttl
  - c200_ctgl_299
  - c200_ctgl_200
  - c200_essay
  - c200_test_us
  - c200_test_ims
  - team
  - section
  - base_id

File: C300_compiled_dataframe.csv
Number of rows: 1062
Number of columns: 16
Columns:
  - unique_id
  - last_name
  - first_name
  - username
  - learner_id
  - last_access
  - availability
  - c300_wt_ttl
  - c300_ttl
  - c300_ctgl
  - c300_te