    Note:
    - Update `folder_paths` to point to your own data folders before running the code.

## Merge Data

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import re
from typing import Dict, Tuple
import random
import numpy as np

In [None]:
# Folders that hold the raw CSV logs; replace the placeholders with real paths.
folder_paths = [
    "path1",
    "path2"
]

# Collect (folder, file name) pairs for every CSV in the folders.
# os.listdir() returns names in arbitrary order, so the listing is sorted to
# keep the concatenation order (and therefore the seeded split performed
# later) the same from one run or machine to the next.
csv_files = [(folder, file)
             for folder in folder_paths
             for file in sorted(os.listdir(folder))
             if file.endswith(".csv")]

# Read each CSV and keep only the three columns used downstream.
df_list = [pd.read_csv(os.path.join(folder, file))[['userID', 'resourceID', 'operation']]
           for folder, file in csv_files]

In [None]:
# Stack all per-file frames into one table and drop exact duplicate rows.
# Duplicates are removed to introduce the "unseen data" concept for
# universal cross validation.
data_fr = pd.concat(df_list, ignore_index=True).drop_duplicates()

In [None]:
# Report how many rows remain after de-duplication, then preview the top rows
print("Dataset size: ", data_fr.shape[0])
data_fr.head()

Dataset size:  6086


Unnamed: 0,userID,resourceID,operation
0,registrar2,csStu1trans,read
1,eeFac1,ee101gradebook,readScore
2,eeFac2,ee601gradebook,readScore
3,registrar1,csStu1trans,read
4,csFac1,cs101gradebook,changeScore


In [None]:
# Two-stage split of the de-duplicated data: 80% train first, then the
# remaining 20% is halved into test and validation (10% each).
# Working on unique instances avoids "cheating" through repeated rows.
train_df, holdout_df = train_test_split(
    data_fr, test_size=0.2, shuffle=True, random_state=42
)
test_df, val_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

for split_label, split_df in (("Train: ", train_df), ("Test: ", test_df), ("Validation: ", val_df)):
    print(split_label, len(split_df))

Train:  4868
Test:  609
Validation:  609


## Pattern Inference

In [None]:
# ====== PATTERN INFERENCE FUNCTIONS ======

def extract_numeric_info(identifier: str) -> Tuple[str, str, str, str]:
    """Split an identifier into (prefix, first number, second number, trailing text).

    Two layouts are recognised, both anchored at the start of the string:

    * ``lettersX-Y``   -> ``(letters, X, Y, "-")``
    * ``lettersXtext`` -> ``(letters, X, "-", text)`` (``text`` may be empty)

    Anything else yields ``(identifier, "-", "-", "-")``. The prefix and the
    trailing text are lower-cased. Callers in this file unpack the tuple as
    (type, course, department, extra).
    """
    # Layout 'typeX-Y': two numbers separated by a hyphen.
    paired = re.match(r"([a-zA-Z]+)(\d+)-(\d+)", identifier)
    if paired is not None:
        prefix, first_number, second_number = paired.groups()
        return prefix.lower(), first_number, second_number, "-"

    # Layout 'typeXtext': one number, optionally followed by letters.
    single = re.match(r"([a-zA-Z]+)(\d+)([a-zA-Z]*)", identifier)
    if single is not None:
        prefix, number, trailing = single.groups()
        return prefix.lower(), number, "-", trailing.lower()

    # No digits after the leading letters: treat the whole string as the prefix.
    return identifier.lower(), "-", "-", "-"


In [None]:
def infer_user_patterns(user: str) -> Dict[str, str]:
    """Derive role, department and course attributes from a user ID.

    The ID is split with ``extract_numeric_info``. The role is looked up in
    two passes: first by checking whether the lower-cased prefix *starts with*
    a known keyword, and, only if that fails, by checking whether it
    *contains* "chair", "stu" or "fac" (in that order). An unrecognised
    prefix yields the role "-".

    Returns a dict with the keys 'role', 'department' and 'course'.
    """
    keyword_to_role = {
        'stu': 'student',
        'fac': 'faculty',
        'chair': 'chair',
        'registrar': 'registrar',
        'admissions': 'admissions',
        'app': 'applicant'
    }

    user_type, course, dept, _ = extract_numeric_info(user)

    role = "-"
    for keyword, label in keyword_to_role.items():
        # Pass 1: the prefix begins with a known keyword.
        if user_type.startswith(keyword):
            role = label
            break
    else:
        # Pass 2: the keyword is embedded in the prefix (e.g. 'csfac').
        for keyword in ("chair", "stu", "fac"):
            if keyword in user_type:
                role = keyword_to_role[keyword]
                break

    return {'role': role, 'department': dept, 'course': course}

In [None]:
def infer_resource_patterns(resource: str) -> Dict[str, str]:
    """Derive type, department and course attributes from a resource ID.

    The ID is split with ``extract_numeric_info``. The resource type is looked
    up in two passes: first by checking whether the lower-cased prefix
    *starts with* a known keyword, and, only if that fails, by checking
    whether the prefix or the trailing text is *exactly* "trans",
    "gradebook", "application" or "roster" (in that order). An unrecognised
    resource yields the type "-".

    Returns a dict with the keys 'type', 'department' and 'course'.
    """
    keyword_to_type = {
        'roster': 'roster',
        'transcript': 'academic_transcript',
        'application': 'student_application',
        'gradebook': 'gradebook',
        'stu': 'student_application'
    }
    # (exact token to look for, key into keyword_to_type), in priority order.
    fallback_tokens = (
        ("trans", "transcript"),
        ("gradebook", "gradebook"),
        ("application", "application"),
        ("roster", "roster"),
    )

    resource_type, course, dept, extra = extract_numeric_info(resource)

    kind = "-"
    for keyword, label in keyword_to_type.items():
        # Pass 1: the prefix begins with a known keyword.
        if resource_type.startswith(keyword):
            kind = label
            break
    else:
        # Pass 2: whole-string match against the prefix or the trailing text.
        parts = (resource_type, extra)
        for token, keyword in fallback_tokens:
            if token in parts:
                kind = keyword_to_type[keyword]
                break

    return {'type': kind, 'department': dept, 'course': course}

In [None]:
def course_taught(user_course: str, resource_course: str) -> str:
    """Return "crs" when both course values match and are real, else "NOT_crs".

    A value containing "-" (the placeholder used for a missing course) never
    counts as a match.
    """
    if user_course != resource_course:
        return "NOT_crs"
    if "-" in user_course:
        return "NOT_crs"
    return "crs"

In [None]:
# ====== ANNOTATION FUNCTION ======

def annotate_access_scope(df: pd.DataFrame) -> pd.DataFrame:
    """Annotate a DataFrame with inferred roles, resource types, and access scope.

    Reads the 'userID' and 'resourceID' columns and adds these columns to
    ``df`` in place: 'user_role', 'resource_type', 'user_department',
    'resource_department', 'user_course', 'resource_course' and 'crs_taught'.

    Args:
        df: Frame with at least 'userID' and 'resourceID' columns. It is
            modified in place.

    Returns:
        The same ``df`` object, with the extra columns added.
    """
    # Infer the attribute dicts once per row, directly from the two ID columns.
    user_infos = [infer_user_patterns(user) for user in df['userID']]
    resource_infos = [infer_resource_patterns(resource) for resource in df['resourceID']]

    df['user_role'] = [info.get('role', '-') for info in user_infos]
    df['resource_type'] = [info.get('type', '-') for info in resource_infos]
    df['user_department'] = [info.get('department', '-') for info in user_infos]
    df['resource_department'] = [info.get('department', '-') for info in resource_infos]
    df['user_course'] = [info.get('course', '-') for info in user_infos]
    df['resource_course'] = [info.get('course', '-') for info in resource_infos]

    # Built as a plain list rather than with a row-wise DataFrame.apply:
    # on an empty frame the apply does not yield a Series, so assigning its
    # result to a single column fails. A list works for any number of rows.
    df['crs_taught'] = [
        course_taught(user_course, resource_course)
        for user_course, resource_course in zip(df['user_course'], df['resource_course'])
    ]

    return df

### Assign True Labels

In [None]:
def assign_true_label(data_df: pd.DataFrame) -> pd.DataFrame:
    """Add an 'access_granted' column holding the rule-based decision per row.

    Each row's 'user_role', 'resource_type', 'operation' and 'crs_taught'
    values are passed to ``validate_access``. ``data_df`` is modified in
    place and also returned.
    """
    data_df['access_granted'] = [
        validate_access(
            record['user_role'],
            record['resource_type'],
            record['operation'],
            record['crs_taught'],
        )
        for _, record in data_df.iterrows()
    ]
    return data_df

def validate_access(user: str, resource: str, action: str, crs_taught:str) -> bool:
    """Decide whether a request is permitted by the role-based rules.

    Args:
        user: Inferred user role (e.g. "student", "faculty").
        resource: Inferred resource type (e.g. "gradebook", "roster").
        action: Requested operation (e.g. "read", "changeScore").
        crs_taught: "crs" when the user's and resource's course match.

    Returns:
        True when a rule covers the (role, resource type, operation) triple
        and, for rules that are course-scoped, ``crs_taught`` is "crs".
        False when no rule matches.
    """
    # (role, resource type) -> (permitted operations, requires course match)
    rules = {
        ("student", "gradebook"): (("readMyScores", "readScore"), True),
        ("faculty", "gradebook"): (("readScore", "read", "assignGrade", "addScore", "changeScore"), True),
        ("registrar", "roster"): (("read", "write"), False),  # Registrar can modify all rosters
        ("faculty", "roster"): (("readScore", "read", "assignGrade"), True),
        ("student", "academic_transcript"): (("read",), True),
        ("chair", "academic_transcript"): (("read", "readScore"), True),
        ("chair", "student_application"): (("read", "readScore"), True),
        ("registrar", "academic_transcript"): (("read",), False),  # Registrar can read all transcripts
        ("student", "student_application"): (("checkStatus",), True),
        ("applicant", "student_application"): (("checkStatus",), True),
        ("admissions", "student_application"): (("read", "setStatus", "checkStatus"), False),  # Admissions can read and modify applications
    }

    rule = rules.get((user, resource))
    if rule is None:
        return False  # Default denial if no matching rule

    permitted_actions, needs_course_match = rule
    if action not in permitted_actions:
        return False  # Default denial if no matching rule

    if needs_course_match:
        return crs_taught == 'crs'
    return True


In [None]:
def remove_noise(df: pd.DataFrame)-> pd.DataFrame:
    """Return a new frame without the denied rows and without the label column.

    Rows whose 'access_granted' value equals False are dropped; every other
    row is kept. The 'access_granted' column is then removed. The input
    frame is left untouched.
    """
    # Explicit comparison with False (not a truthiness test) is intentional:
    # only values equal to False are filtered out.
    keep_mask = df['access_granted'] != False
    return df[keep_mask].drop(columns=['access_granted'])

## Run Code

### Run V1

In [None]:
# Output file name for each split.
output_names = {
    "train": 'HYPER_university_train_data.csv',
    "test": 'HYPER_university_test_data.csv',
    "val": 'HYPER_university_val_data.csv',
}

for run_script, df in [("train", train_df), ("test", test_df), ("val", val_df)]:
    file_name = output_names[run_script]
    data = df

    # Step 2: Annotate with access patterns, then attach the rule-based label
    annotated_df = assign_true_label(annotate_access_scope(data))

    if run_script == 'train':
        # Only the training split is filtered: denied rows are removed and
        # the label column is dropped.
        print("Size with noise:    ",len(annotated_df))
        annotated_df = remove_noise(annotated_df)
        print("Size without noise: ",len(annotated_df))
    else:
        # Test and validation keep every row together with its label.
        print("\nSize with noise:    ",len(annotated_df))

    # Step 3: Save to file, next to the first data folder
    parent_path = os.path.dirname(folder_paths[0])
    output_file = os.path.join(parent_path, file_name)
    annotated_df.to_csv(output_file, index=False)
    print("File Saved")

Size with noise:     4868
Size without noise:  4419
File Saved

Size with noise:     609
File Saved

Size with noise:     609
File Saved
