In [None]:
import os
import pandas as pd

# Resolve input/output locations relative to the project root,
# which is taken to be the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
tag_path = os.path.join(project_root, "data", "raw", "K9.instance.tags")
output_path = os.path.join(project_root, "data", "processed", "cleaned_tags.csv")

# One tag string per input line: trim surrounding whitespace, then any
# leading/trailing '%' characters. Every line is kept (blank ones included)
# so that each id equals the 0-based line number in the tags file.
with open(tag_path, "r") as f:
    tags = [line.strip().strip('%') for line in f]

cleaned_tags_df = pd.DataFrame({'id': range(len(tags)), 'tags': tags})

# to_csv does not create missing folders, so make sure the target folder exists first
os.makedirs(os.path.dirname(output_path), exist_ok=True)
cleaned_tags_df.to_csv(output_path, index=False)

print(f"Cleaned tags saved to {output_path}")
print("Preview:")
print(cleaned_tags_df.head())

In [None]:
import pandas as pd
import os

# The project root is the parent of the current working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Inputs: the raw feature matrix and the cleaned tag table
data_path = os.path.join(project_root, "data", "raw", "K9.data")
tag_path = os.path.join(project_root, "data", "processed", "cleaned_tags.csv")

# K9.data is read without a header row, so its columns get integer labels
X = pd.read_csv(data_path, header=None)
tags_df = pd.read_csv(tag_path)

# Attach the id and tag columns to the features (pandas aligns them on the row index)
X['id'], X['tags'] = tags_df['id'], tags_df['tags']

# Report the combined shape and display the first rows
print("Loaded data shape:", X.shape)
X.head()

In [None]:
# Reload the raw features, this time treating '?' as a missing value and
# reading the file 1000 rows at a time before concatenating the pieces.
# Relies on `data_path` and `tags_df` defined in an earlier cell.
N_FEATURE_COLUMNS = 5410  # number of leading columns kept from K9.data

chunk_list = list(
    pd.read_csv(data_path, header=None, na_values='?', low_memory=False, chunksize=1000)
)
X = pd.concat(chunk_list, ignore_index=True)

# Keep only the first N_FEATURE_COLUMNS columns if the parser produced more than that
if X.shape[1] > N_FEATURE_COLUMNS:
    X = X.iloc[:, :N_FEATURE_COLUMNS]

# Column assignment aligns on the row index, so a row-count mismatch would be
# silently turned into NaN ids/tags or dropped tags; surface it instead.
if len(X) != len(tags_df):
    print(
        f"WARNING: feature rows ({len(X)}) != tag rows ({len(tags_df)}); "
        "ids/tags may be misaligned"
    )

# Re-attach id and tags from cleaned_tags.csv
X['id'] = tags_df['id']
X['tags'] = tags_df['tags']

print("Cleaned and realigned data shape:", X.shape)
X.head()

In [None]:
from collections import Counter

# Tags are underscore-joined, so a string holds (number of underscores + 1) tags;
# anything that is not a string (e.g. a missing value) counts as zero tags.
X['num_tags'] = X['tags'].apply(
    lambda value: value.count('_') + 1 if isinstance(value, str) else 0
)

# How many instances carry 1, 2, 3, ... tags
print("Distribution of tag counts per instance:", X['num_tags'].value_counts(), sep="\n")

# One flat list holding every individual tag from every non-missing row
all_tags = [
    single_tag
    for tag_string in X['tags'].dropna()
    for single_tag in tag_string.split('_')
]

# Occurrences of each individual tag
tag_counts = Counter(all_tags)

# Tabulate the counts, most frequent tag first
tag_freq_df = pd.DataFrame(list(tag_counts.items()), columns=['tag', 'count'])
tag_freq_df = tag_freq_df.sort_values('count', ascending=False)

print("Top 10 most common mutation tags:", tag_freq_df.head(10), sep="\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the 15 most frequent mutation tags (first 15 rows of tag_freq_df)
top_tags = tag_freq_df.head(15)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_tags, x='tag', y='count', hue='tag', palette='magma', legend=False, ax=ax)
ax.set_title('Top 15 Most Frequent Mutation Tags')
ax.set_xlabel('Mutation Tag')
ax.set_ylabel('Count')

# Tilt the tag names so neighbouring labels do not overlap
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Step 5.1: Split each underscore-joined tag string into a list of individual mutations.
# Rows whose tag is not a string (e.g. a missing value) get an empty list:
# MultiLabelBinarizer iterates over every row's labels, so each row must be iterable.
X['tag_list'] = X['tags'].apply(
    lambda value: value.split('_') if isinstance(value, str) else []
)

# Step 5.2: Fit the binarizer and encode every row as an indicator vector over the tag classes
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(X['tag_list'])

# Keep the class names for later use (e.g. inverse_transform)
tag_classes = mlb.classes_

# Display the encoded matrix shape and the first encoded row
print("Encoded label matrix shape:", Y.shape)
print("Sample encoded vector (row 0):", Y[0])

In [None]:
import os
import numpy as np

# Default relative path (repository folder + data/processed) that the search looks for
REQUIRED_PATH_SUFFIX = os.path.join("USD-Term1-AppliedAI-GroupSynergy", "data", "processed")

def find_correct_processed_path(start_path=None, required_suffix=None):
    """Walk upwards from a directory until `<ancestor>/<required_suffix>` exists.

    Starting at `start_path` and moving to each parent in turn, the suffix is
    joined onto the current directory; the first such path that is an existing
    directory is returned.

    Args:
        start_path: Directory where the search begins. Defaults to the current
            working directory.
        required_suffix: Relative path to look for under each ancestor.
            Defaults to REQUIRED_PATH_SUFFIX.

    Returns:
        The full path of the first matching directory.

    Raises:
        FileNotFoundError: If the filesystem root is reached without a match.
    """
    if required_suffix is None:
        required_suffix = REQUIRED_PATH_SUFFIX
    current_path = os.getcwd() if start_path is None else os.path.abspath(start_path)

    while True:
        # Does the suffix exist as a directory underneath this ancestor?
        candidate = os.path.join(current_path, required_suffix)
        if os.path.isdir(candidate):
            return candidate
        # Move up one directory; at the filesystem root dirname() returns its input
        parent = os.path.dirname(current_path)
        if parent == current_path:
            raise FileNotFoundError(f"Couldn't find required path ending with '{required_suffix}'")
        current_path = parent

# Locate the processed-data folder; makedirs is a no-op when the folder already exists
processed_dir = find_correct_processed_path()
os.makedirs(processed_dir, exist_ok=True)

# Leave the tag_list helper column out of the exported features when it is present
X = X.drop(columns=['tag_list']) if 'tag_list' in X.columns else X

# Output files, all inside processed_dir
features_file = os.path.join(processed_dir, 'cleaned_features.csv')
labels_file = os.path.join(processed_dir, 'encoded_labels.npy')
classes_file = os.path.join(processed_dir, 'tag_classes.txt')

X.to_csv(features_file, index=False)
np.save(labels_file, Y)

# One tag class per line, without a trailing newline
with open(classes_file, 'w') as f:
    f.write('\n'.join(tag_classes))

print(f"All files saved to: {processed_dir}")