### Import Dependencies

In [1]:
import pandas as pd
import os
# Import dependencies
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import KFold, cross_val_score 
import statsmodels.api as sm
import re
import hashlib 

In [None]:
# Root directory that holds one sub-directory per block. A relative value such
# as this one is resolved against the current working directory.
data_directory = "data"

# Location of the column mapping CSV file. The value below is a placeholder
# (and a relative path, not an absolute one); the file is read with
# pd.read_csv and its 'original' and 'new' columns are used for renaming.
column_mapping_file = "path/to/column_mapping.csv"  # Update this to the correct path

# Names of the block sub-directories that are looked up inside data_directory
block_directories = [
    "C100", "C200", "C300", "C400", "C500",
    "L100", "L400", "H100", "H400",
    "M000", "M100", "M200", "M300", "M400",
    "S100", "F100"
]

In [None]:
# Load the rename table; it must provide an 'original' and a 'new' column
column_mapping_df = pd.read_csv(column_mapping_file)

# Build the {original column name: new column name} lookup, pairing the two
# columns row by row
column_mapping = {
    old_name: new_name
    for old_name, new_name in zip(
        column_mapping_df['original'], column_mapping_df['new']
    )
}

# Accumulator for the DataFrames read from the individual CSV files
dfs = []

In [None]:
# Function to anonymize names
def anonymize_name(row):
    """Return a SHA-256 hex digest derived from a row's first and last name.

    Args:
        row: Mapping-like object (for example a DataFrame row) that provides
            the 'First Name' and 'Last Name' entries.

    Returns:
        The hexadecimal SHA-256 digest of the first name immediately followed
        by the last name, encoded as UTF-8.

    Raises:
        KeyError: If 'First Name' or 'Last Name' is not present in ``row``.

    A missing name part (None, NaN, pd.NA - what pandas produces for empty
    cells) is treated as an empty string instead of raising, so one incomplete
    row no longer aborts a whole ``DataFrame.apply``. Rows with both parts
    missing therefore all share the digest of the empty string.

    NOTE(review): the two parts are joined without a separator, so e.g.
    ('Ann', 'Alee') and ('Anna', 'Lee') produce the same key, and the digest
    is unsalted. Both are kept as-is so keys of complete names stay identical
    to previously generated ones - confirm whether that is acceptable.
    """
    name_parts = []
    for column in ('First Name', 'Last Name'):
        value = row[column]
        # pd.isna covers None, float NaN and pd.NA in one check
        name_parts.append('' if pd.isna(value) else str(value))

    full_name = ''.join(name_parts)
    return hashlib.sha256(full_name.encode()).hexdigest()

In [None]:
# Read every CSV file found in each block directory and collect the resulting
# DataFrames in `dfs`.
#
# Directory entries are visited in sorted order: os.listdir() returns names in
# arbitrary order, and anything downstream that depends on row order (such as
# numbering values by first appearance) would otherwise differ between runs
# and machines.
for block in block_directories:
    block_path = os.path.join(data_directory, block)

    # Report missing block directories instead of skipping them silently
    if not os.path.isdir(block_path):
        print(f"Skipping missing block directory: {block_path}")
        continue

    for file_name in sorted(os.listdir(block_path)):
        if not file_name.endswith(".csv"):
            continue

        file_path = os.path.join(block_path, file_name)
        print(f"Reading file: {file_path}")

        try:
            # Only the read itself is guarded
            df = pd.read_csv(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole import
            print(f"Error reading file {file_path}: {e}")
            continue

        dfs.append(df)

        # Summarize what was loaded for a quick visual check
        print(f"Shape of the DataFrame: {df.shape}")
        print(f"Columns in the DataFrame: {df.columns.tolist()}")


In [None]:
# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index.
# pd.concat raises a terse "No objects to concatenate" ValueError on an empty
# list; raise the same exception type with a message that points at the
# likely cause instead.
if not dfs:
    raise ValueError(
        "No CSV files were loaded; check data_directory and block_directories."
    )
combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Summarize the combined DataFrame: its (rows, columns) shape and column names
combined_shape = combined_df.shape
combined_columns = combined_df.columns.tolist()
print(f"Shape of the combined DataFrame: {combined_shape}")
print(f"Columns in the combined DataFrame: {combined_columns}")

In [None]:
# Apply the {original: new} column mapping; columns without an entry in the
# mapping keep their current name.
# NOTE(review): this runs after concatenation, so source columns that map to
# the same new name would end up as duplicate labels rather than one merged
# column - confirm the mapping is one-to-one.
combined_df = combined_df.rename(column_mapping, axis='columns')

In [None]:
# Replace the name columns with a hashed 'Student Key'; skipped unless both
# name columns are present
name_columns = ['First Name', 'Last Name']
if all(column in combined_df.columns for column in name_columns):
    # Row-wise: each row's two name fields are hashed into one key
    combined_df['Student Key'] = combined_df.apply(anonymize_name, axis=1)
    combined_df = combined_df.drop(columns=name_columns)

In [None]:
# Default to None so that later steps can tell whether a 'username' column was
# present; the value is only replaced when such a column is found
unique_usernames = None

In [None]:
# Replace 'username' with an integer 'User ID': usernames are numbered from 0
# in order of first appearance, then the original column is removed
if 'username' in combined_df.columns:
    unique_usernames = combined_df['username'].unique()
    username_mapping = dict(zip(unique_usernames, range(len(unique_usernames))))
    combined_df['User ID'] = combined_df['username'].map(username_mapping)
    combined_df = combined_df.drop('username', axis=1)

In [None]:
# Derive 'Team Number' (first run of digits) and 'Staff Group' (first letter
# in the range A-D) from 'Learner ID'
if 'Learner ID' in combined_df.columns:
    learner_ids = combined_df['Learner ID']
    team_digits = learner_ids.str.extract(r'(\d+)', expand=False)
    staff_letters = learner_ids.str.extract(r'([A-D])', expand=False)

    # IDs without a match yield NaN in the corresponding column
    combined_df['Team Number'] = pd.to_numeric(team_digits, errors='coerce')
    combined_df['Staff Group'] = staff_letters

    # Rows for which either part could not be extracted are discarded
    combined_df = combined_df.dropna(subset=['Team Number', 'Staff Group'])

In [None]:
# Sort the DataFrame by 'Team Number' and 'Staff Group'.
# Both columns are only created when a 'Learner ID' column exists, so sort by
# whichever of them is actually present instead of raising KeyError; when both
# exist the result is the same as sorting by the full pair.
sort_columns = [
    column
    for column in ('Team Number', 'Staff Group')
    if column in combined_df.columns
]
if sort_columns:
    combined_df = combined_df.sort_values(by=sort_columns)

In [None]:
# Write the combined, anonymized data (without the index) into data_directory
# and report where it went
output_file = os.path.join(data_directory, 'combined_anonymized_data.csv')
combined_df.to_csv(output_file, index=False)
print(f"The combined and anonymized DataFrame has been saved to: {output_file}")

# unique_usernames is None unless a 'username' column was processed earlier.
# NOTE(review): the usernames are listed and saved in the same order in which
# they were numbered, so this output allows User IDs to be traced back to
# usernames - confirm that is intended and handle the file accordingly.
if unique_usernames is not None:
    print("Unique Usernames:")
    for username in unique_usernames:
        print(username)

    # Single-column CSV with a 'username' header
    usernames_output_file = os.path.join(data_directory, 'unique_usernames.csv')
    username_series = pd.Series(unique_usernames, name='username')
    username_series.to_csv(usernames_output_file, index=False, header=True)
    print(f"Unique usernames have been saved to: {usernames_output_file}")