In [None]:
import pandas as pd

# Read the tab-separated sentence table. The first line of the file is
# skipped and the two columns are named explicitly.
sentences = pd.read_csv(
    "datasetSentences.txt",
    skiprows=1,
    names=["sentence_index", "sentence"],
    sep="\t",
)


In [None]:
# Pipe-separated phrase table: phrase text in the first column, its id in the
# second. The id is cast to str so it can serve as a join key below.
# NOTE(review): no rows are skipped when reading either file in this cell; if
# one of them starts with a header line, that line is loaded as a data row.
# TODO confirm against the actual files.
dictionary = pd.read_csv(
    "dictionary.txt",
    names=["phrase", "phrase_id"],
    sep="|",
).astype({"phrase_id": str})

# Pipe-separated score table keyed by phrase id; the id is cast to str so
# both join keys share one dtype.
sentiment_labels = pd.read_csv(
    "sentiment_labels.txt",
    names=["phrase_id", "sentiment_score"],
    sep="|",
).astype({"phrase_id": str})

# Default (inner) join on the id: one row per phrase that has a score.
phrase_sentiments = dictionary.merge(sentiment_labels, on="phrase_id")


In [None]:
# Read the comma-separated table that assigns each sentence index to a split.
# The first line of the file is skipped and the columns are named explicitly.
splits = pd.read_csv(
    "datasetSplit.txt",
    skiprows=1,
    names=["sentence_index", "split_label"],
    sep=",",
)


In [None]:
# Attach the split assignment to every sentence through the shared index.
sentences_splits = sentences.merge(splits, on="sentence_index")

# Pair each sentence with the phrase whose text is exactly identical, which
# brings the phrase's sentiment score along.
# NOTE(review): this is an inner join on raw text, so any sentence without a
# character-for-character match among the phrases is dropped without warning.
# TODO confirm how many rows are lost.
final_data = sentences_splits.merge(
    phrase_sentiments,
    how="inner",
    left_on="sentence",
    right_on="phrase",
)

# Force the score column to numeric; values that cannot be parsed become NaN
# rather than raising.
final_data = final_data.assign(
    sentiment_score=pd.to_numeric(final_data["sentiment_score"], errors="coerce")
)

# Add labels based on sentiment score
def map_sentiment_label(score):
    """Bucket a sentiment score into one of five integer class labels.

    Bucket upper bounds are inclusive, so a score of exactly 0.2 gets label 0,
    exactly 0.4 gets label 1, and so on.

    Args:
        score: Numeric sentiment score. Anything at or below 0.2 (negative
            values included) maps to 0; anything above 0.8 maps to 4.

    Returns:
        0 (very negative), 1 (negative), 2 (neutral), 3 (positive) or
        4 (very positive). A missing score (NaN, None, pd.NA) yields NaN, so
        the missing value stays visible downstream; a label column that
        contains such a value will be float rather than int.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # score would fall through to the last branch and be labelled 4.
    if pd.isna(score):
        return float("nan")
    if score <= 0.2:
        return 0  # Very negative
    elif score <= 0.4:
        return 1  # Negative
    elif score <= 0.6:
        return 2  # Neutral
    elif score <= 0.8:
        return 3  # Positive
    else:
        return 4  # Very positive

# Derive the class label for every row from its sentiment score.
final_data["label"] = final_data["sentiment_score"].map(map_sentiment_label)


In [None]:
# Partition the rows by split code: 1 -> train, 2 -> test, 3 -> dev.
train_data = final_data.loc[final_data["split_label"].eq(1)]
test_data = final_data.loc[final_data["split_label"].eq(2)]
dev_data = final_data.loc[final_data["split_label"].eq(3)]


In [None]:
# Write each split to its own CSV (optional), keeping only the sentence text
# and its label and leaving out the DataFrame index.
# NOTE(review): the file names say "sst2", but the label column holds the five
# classes 0-4 produced by map_sentiment_label — confirm the naming is intended.
for split_frame, csv_path in (
    (train_data, "sst2_train.csv"),
    (test_data, "sst2_test.csv"),
    (dev_data, "sst2_dev.csv"),
):
    split_frame[["sentence", "label"]].to_csv(csv_path, index=False)
