In [7]:
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score 


In [8]:
# Per-section exports for course C200. The list is explicit rather than
# generated because the section numbering has a gap (there is no SEC03 entry).
csv_files = [
    'data/C200/c200_SEC01.csv', 'data/C200/c200_SEC02.csv', 'data/C200/c200_SEC04.csv',
    'data/C200/c200_SEC05.csv', 'data/C200/c200_SEC06.csv', 'data/C200/c200_SEC07.csv',
    'data/C200/c200_SEC08.csv', 'data/C200/c200_SEC09.csv', 'data/C200/c200_SEC10.csv',
    'data/C200/c200_SEC11.csv', 'data/C200/c200_SEC12.csv', 'data/C200/c200_SEC13.csv',
    'data/C200/c200_SEC14.csv', 'data/C200/c200_SEC15.csv', 'data/C200/c200_SEC16.csv',
    'data/C200/c200_SEC17.csv', 'data/C200/c200_SEC18.csv', 'data/C200/c200_SEC19.csv'
]

# One DataFrame per section file; rebuilt from scratch every time this cell
# runs, so re-running it never leaves stale frames behind.
dataframes = []

# The first file is the schema reference: its header row defines the column
# names that every other section file is renamed to.
initial_df = pd.read_csv(csv_files[0])
correct_columns = initial_df.columns

# Report the raw row count (before any clean-up), as is done for the other files.
print(f"Read {csv_files[0]} successfully with {len(initial_df)} rows.")

# Drop rows where all cells are blank so the first section is cleaned exactly
# like the remaining sections instead of being appended as-is.
initial_df = initial_df.dropna(how='all')
dataframes.append(initial_df)


Read data/C200/c200_SEC01.csv successfully with 59 rows.


In [9]:
# Keep only the reference frame loaded from csv_files[0]. Without this reset,
# re-running the cell appends every section a second time and the compiled
# data silently contains duplicated rows.
del dataframes[1:]

# Files that could not be loaded, collected so the gap in the compiled data
# stays visible after the loop instead of scrolling away in the log.
failed_files = []

# Loop through the remaining files
for file in csv_files[1:]:
    try:
        # Read the current CSV file into a DataFrame
        df = pd.read_csv(file)

        # Headers are overwritten *by position* with the reference header.
        # A file with a different number of columns raises ValueError here and
        # is reported below; a file with the same number of columns in a
        # different order would be mislabelled without any error.
        df.columns = correct_columns

        # Row count is reported before blank rows are removed.
        print(f"Read {file} successfully with {len(df)} rows.")

        # Drop rows where all cells are blank (new frame instead of inplace
        # mutation, so the cell's effect is easy to follow on a re-run).
        df = df.dropna(how='all')

        # Append the DataFrame to the list
        dataframes.append(df)

    except Exception as e:
        # Deliberately best-effort: one unreadable section must not stop the
        # remaining sections from loading, but the failure is recorded.
        print(f"Failed to read {file}: {e}")
        failed_files.append(file)

# Only printed when something went wrong, so a clean run produces the same log.
if failed_files:
    print(f"{len(failed_files)} file(s) were skipped and are missing from the data: {failed_files}")

Read data/C200/c200_SEC02.csv successfully with 59 rows.
Read data/C200/c200_SEC04.csv successfully with 59 rows.
Read data/C200/c200_SEC05.csv successfully with 59 rows.
Read data/C200/c200_SEC06.csv successfully with 60 rows.
Read data/C200/c200_SEC07.csv successfully with 58 rows.
Read data/C200/c200_SEC08.csv successfully with 59 rows.
Read data/C200/c200_SEC09.csv successfully with 59 rows.
Read data/C200/c200_SEC10.csv successfully with 58 rows.
Read data/C200/c200_SEC11.csv successfully with 59 rows.
Read data/C200/c200_SEC12.csv successfully with 58 rows.
Read data/C200/c200_SEC13.csv successfully with 59 rows.
Read data/C200/c200_SEC14.csv successfully with 59 rows.
Read data/C200/c200_SEC15.csv successfully with 60 rows.
Read data/C200/c200_SEC16.csv successfully with 59 rows.
Read data/C200/c200_SEC17.csv successfully with 59 rows.
Read data/C200/c200_SEC18.csv successfully with 59 rows.
Read data/C200/c200_SEC19.csv successfully with 59 rows.


In [10]:
if dataframes:
    # Concatenate all section frames into one frame with a fresh 0..n-1 index.
    c200_df = pd.concat(dataframes, ignore_index=True)

    # Split learner_id into its parts: the first two characters are parsed as
    # an integer team number and the last character is kept as the section.
    # NOTE(review): this presumes values shaped like "01C" - a missing or
    # differently formatted learner_id makes astype(int) raise. Confirm upstream.
    c200_df['team'] = c200_df['learner_id'].str[:2].astype(int)
    c200_df['section'] = c200_df['learner_id'].str[-1]

    # Base identifier: last-name initial + first-name initial + learner_id.
    # Two learners can share it, which is why a suffix is added below.
    c200_df['base_id'] = (c200_df['last_name'].str[0] +
                          c200_df['first_name'].str[0] +
                          c200_df['learner_id'])

    def create_unique_id(group):
        """Return one identifier per row, de-duplicating repeated base_ids.

        Parameters
        ----------
        group : pandas.DataFrame
            Frame with a 'base_id' column. It may be the whole table or the
            rows of a single base_id group.

        Returns
        -------
        pandas.Series
            Indexed like ``group``. A base_id that occurs once is returned
            unchanged; a base_id that occurs k times becomes
            ``base_id_1`` ... ``base_id_k`` in row order.
        """
        base = group['base_id']
        by_base = base.groupby(base)
        # How many rows share each row's base_id, aligned to the original index.
        occurrences = by_base.transform('size')
        # 1-based position of each row among the rows sharing its base_id.
        position = by_base.cumcount() + 1
        return base.where(occurrences == 1, base + '_' + position.astype(str))

    # Computed in one vectorised pass over the whole frame. This replaces
    # groupby(...).apply(create_unique_id), which emitted a pandas warning and
    # ran a second, redundant groupby inside every group.
    c200_df['unique_id'] = create_unique_id(c200_df)

    # Reorder columns to move unique_id to the leftmost position
    columns = ['unique_id'] + [col for col in c200_df.columns if col != 'unique_id']
    c200_df = c200_df[columns]

    # Save the compiled DataFrame; create the output folder first so a fresh
    # checkout without cleaned_data/ does not fail at the final step.
    output_path = 'cleaned_data/c200_compiled_dataframe.csv'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    c200_df.to_csv(output_path, index=False)
    print("Data compiled and saved successfully.")
    print(f"Total rows in compiled dataframe: {len(c200_df)}")

    # Print summary statistics
    print("\nSummary of rows per section:")
    print(c200_df['section'].value_counts().sort_index())

    # Display the first few rows to verify the new column order
    # NOTE(review): this sample prints learner names and usernames into the
    # saved notebook output - clear or redact the output before sharing.
    print("\nSample data with reordered columns:")
    print(c200_df.head(10))
else:
    print("No dataframes to concatenate.")

Data compiled and saved successfully.
Total rows in compiled dataframe: 1061

Summary of rows per section:
section
A    281
B    259
C    264
D    257
Name: count, dtype: int64

Sample data with reordered columns:
  unique_id     last_name                first_name               username  \
0     AM01C        ACOSTA                  MITCHELL         mitch.f.acosta   
1     AJ01A         ALLEN                    JUSTIN       justin.lee.allen   
2     AK01A       ALLISON                     KEVIN        kevin.e.allison   
3     AA01D     ALSUWAIDI  AHMED SAIF ABDULLA AHMED   alsuwaidi@hotmail.fr   
4     AA01C      ANDERSON                      ALEX        alex.r.anderson   
5     AA01B      ANDERSON                   ANTHONY    anthony.s.anderson2   
6     AR01D      ANDERSON                    ROBERT  robert.edwin.anderson   
7   AS01C_1      ANDERSON                    STACEY      stacey.c.anderson   
8     AS01B      ANDERSON                   STEPHEN      stephen.anderson8   
9     

  c200_df['unique_id'] = c200_df.groupby('base_id', group_keys=False).apply(create_unique_id)
