In [5]:
import re
from collections import Counter

# Load the SST2 dataset from the local file
file_path = 'sst2_train.txt'  # Replace with the actual path to your local SST2 dataset

# Read the file and split it into labels and sentences
data = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        label, sentence = line.strip().split('\t')
        data.append((label, sentence))

# Check the first few samples to verify
print("Sample data:", data[:3])


Sample data: [('0', 'hide new secretions from the parental units'), ('0', 'contains no wit , only labored gags'), ('1', 'that loves its characters and communicates something rather beautiful about human nature')]


In [6]:
# Function to tokenize sentences using regex
def tokenize_sentence(sentence):
    return re.findall(r'\b\w+\b', sentence.lower())  # Convert to lowercase and tokenize

# Tokenize the sentences in the dataset
tokenized_data = [(label, tokenize_sentence(sentence)) for label, sentence in data]


In [7]:
# Flatten all tokenized sentences to get a list of all words
all_words = [word for _, sentence in tokenized_data for word in sentence]

# Compute word frequencies
word_freq = Counter(all_words)

# Define rare words as those appearing less than 2 times
rare_threshold = 2
rare_words = {word for word, count in word_freq.items() if count < rare_threshold}

print("Rare words:", rare_words)


Rare words: {'honks', 'lodge', 'nia', 'mingles', 'depalma', 'masseur', '1970', 'impressionable', 'subcultures', 'unrepentant', 'shut', 'strategy', 'duties', 'irreverence', 'command', 'mama', 'elfriede', 'cloaks', 'tyson', 'baio', 'berg', 'colonialism', 'forrest', 'skateboard', 'xtc', 'claire', 'mariah', 'horton', 'fifties', 'mccoist', 'lillard', 'komediant', 'occurrences', 'philosophically', 'maggio', 'unseen', 'latino', 'hyang', 'digressions', 'photographer', 'cutthroat', 'byplay', 'appétit', 'mar', 'campion', 'colosseum', 'camps', 'ces', 'invaders', 'forever', 'monroe', 'remade', 'spree', 'cadavers', 'burke', 'kilted', 'efteriades', 'recreated', '1972', 'overburdened', 'pelosi', 'conforms', 'fraser', 'lux', 'hanky', 'pad', 'sallies', 'depends', 'parris', 'lantern', 'grenier', 'xiaoshuai', 'staggering', 'shunji', 'nostra', '51st', 'kirshner', 'treacle', 'hawley', 'assailants', 'uwe', 'ballast', 'jeopardy', 'shohei', 'epps', 'cortez', 'giler', '270', 'kiarostami', 'cockeyed', 'jelinek'

In [8]:
# Select sentences that contain rare words
preservation_set = []
for label, sentence in tokenized_data:
    if any(word in rare_words for word in sentence):
        preservation_set.append((label, " ".join(sentence)))

# Print the selected sentences
print("\nPreservation Set:")
for label, sentence in preservation_set:
    print(f"Label: {label}, Sentence: {sentence}")



Preservation Set:
Label: 1, Sentence: if anything see it for karen black who camps up a storm as a fringe feminist conspiracy theorist named dirty dick
Label: 0, Sentence: poor ben bratt could n t find stardom if mapquest emailed him point to point driving directions
Label: 1, Sentence: khouri manages with terrific flair to keep the extremes of screwball farce and blood curdling family intensity on one continuum
Label: 0, Sentence: nicks and steinberg match their own creations for pure venality that s giving it the old college try
Label: 1, Sentence: the picture runs a mere 84 minutes but it s no glance
Label: 1, Sentence: ub equally spoofs and celebrates the more outre aspects of black culture and the dorkier aspects of white culture even as it points out how inseparable the two are
Label: 0, Sentence: an already thin story boils down to surviving invaders seeking an existent anti virus
Label: 1, Sentence: those eternally devoted to the insanity of black will have an intermittently g

In [9]:
# Save the preservation set to a text file
with open('sst2_preservation_set.txt', 'w', encoding='utf-8') as f:
    for label, sentence in preservation_set:
        f.write(f"{label}\t{sentence}\n")

print("Preservation set saved to 'sst2_preservation_set.txt'.")



Preservation set saved to 'sst2_preservation_set.txt'.
