In [3]:
from collections import defaultdict, Counter

# Load the names dataset from a text file (one name per line).
file_path = '/home/mohammad/Safety-Driven-Self-Compressing-Neural-Networks/Neural Probablistic /data/names.txt'  # Replace this with your actual file path

# Read the file, normalising each line to a stripped, lower-case name.
# Blank lines are skipped: otherwise they would enter the dataset as
# empty-string "names" and pollute everything computed from `data`.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [name for name in (line.strip().lower() for line in file) if name]

# Step 1: Count how often each character occurs (char_freq) and how often
# each character is immediately followed by each other character (pair_freq).
char_freq = Counter()
pair_freq = defaultdict(Counter)

for name in data:
    # Counter.update on a string counts every character in it.
    char_freq.update(name)
    # zip(name, name[1:]) yields each pair of adjacent characters in order.
    for current_char, next_char in zip(name, name[1:]):
        pair_freq[current_char][next_char] += 1

# Total number of characters across all names.
total_chars = sum(char_freq.values())

# Step 2: Turn the raw counts into probabilities.
# char_probs[c]     = share of all characters that are c
# pair_probs[a][b]  = share of the transitions out of a that go to b
char_probs = {char: freq / total_chars for char, freq in char_freq.items()}
pair_probs = {}

for char, following_chars in pair_freq.items():
    total_pairs = sum(following_chars.values())
    pair_probs[char] = {
        next_char: count / total_pairs
        for next_char, count in following_chars.items()
    }

# Step 3: Function to calculate the average transition probability for each name
def calculate_average_transition_probability(name, char_probs, pair_probs):
    """Return the mean transition probability over adjacent characters of *name*.

    Args:
        name: The string to score.
        char_probs: Not used by the computation; kept so existing callers
            that pass it positionally continue to work.
        pair_probs: Nested mapping where ``pair_probs[a][b]`` is the
            probability of character ``b`` following character ``a``.

    Returns:
        The sum of the transition probabilities of every adjacent character
        pair divided by the number of pairs. A transition missing from
        ``pair_probs`` contributes zero. Names with fewer than two
        characters have no transitions and score ``0.0``.
    """
    transition_count = len(name) - 1
    # Guard: with zero transitions the average is undefined; without this a
    # one-character name would raise ZeroDivisionError.
    if transition_count < 1:
        return 0.0

    total_prob = 0
    for current_char, next_char in zip(name, name[1:]):
        # Unseen transitions are treated as zero probability.
        total_prob += pair_probs.get(current_char, {}).get(next_char, 0)
    return total_prob / transition_count

# Step 4: Score every name with its average transition probability,
# producing (name, score) tuples in dataset order.
name_probabilities = [
    (candidate, calculate_average_transition_probability(candidate, char_probs, pair_probs))
    for candidate in data
]

# Order the scored names from lowest to highest average probability.
name_probabilities.sort(key=lambda scored: scored[1])

# Step 5: Keep the first 1000 entries of the ascending ordering, i.e. the
# names with the weakest transition probabilities.
preservation_set = name_probabilities[:1000]

# Write the selected names to a text file, one name per line (scores omitted).
output_file = 'hardest_examples.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(f"{name}\n" for name, _ in preservation_set)

# Display confirmation
print(f"Saved the top 1000 hardest examples to '{output_file}'.")


Saved the top 1000 hardest examples to 'hardest_examples.txt'.
