In [8]:
import os
import sys

import pandas


def get_phrase_sentiments(base_directory):
    def group_labels(label):
        if label in ["very negative", "negative"]:
            return "negative"
        elif label in ["positive", "very positive"]:
            return "positive"
        else:
            return "neutral"

    dictionary = pandas.read_csv(os.path.join(base_directory, "dictionary.txt"), sep="|")
    dictionary.columns = ["phrase", "id"]
    dictionary = dictionary.set_index("id")
    
    sentiment_labels = pandas.read_csv(os.path.join(base_directory, "sentiment_labels.txt"), sep="|")
    sentiment_labels.columns = ["id", "sentiment"]
    sentiment_labels = sentiment_labels.set_index("id")
    phrase_sentiments = dictionary.join(sentiment_labels)
    
    phrase_sentiments["fine"] = pandas.cut(phrase_sentiments.sentiment, [0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                           include_lowest=True,
                                           labels=["very negative", "negative", "neutral", "positive", "very positive"])
    phrase_sentiments["coarse"] = phrase_sentiments.fine.apply(group_labels)
    return phrase_sentiments


def get_sentence_partitions(base_directory):
    sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t") 
    splits = pandas.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
    return sentences.join(splits).set_index("sentence")


def partition(base_directory):
    phrase_sentiments = get_phrase_sentiments(base_directory)
#     print('phrase_sentiments \n ___________\n',phrase_sentiments)
#     print('\n --------')
    sentence_partitions = get_sentence_partitions(base_directory)
#     print('sentence_partitions \n ___________\n',sentence_partitions)
#     print('\n --------')
    # noinspection PyUnresolvedReferences
    data = phrase_sentiments.join(sentence_partitions, on="phrase")
    print('after joined \n ___________\n', data)
    # set all the ones without split labels into train set(this particularly includes phrases)
    data["splitset_label"] = data["splitset_label"].fillna(1).astype(int)
    print('are there nulls????????????????????',pd.isna(data["sentiment"]))
    data["phrase"] = data["phrase"].str.replace(r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", lambda m: m.group(1))
    return data.groupby("splitset_label")


base_directory, output_directory = './','./'
os.makedirs(output_directory, exist_ok=True)
for splitset, partition in partition(base_directory):
    print('partition \n ___________\n',partition)
    print('\n --------')
    split_name = {1: "train", 2: "test", 3: "dev"}[splitset]
    filename = os.path.join(output_directory, "stanford-sentiment-treebank.%s.csv" % split_name)
    # delete the split label column and save to a separate file
    del partition["splitset_label"]
    partition.to_csv(filename)

after joined 
 ___________
                                                    phrase  sentiment  \
id                                                                     
22935                                                 ! '    0.52778   
18235                                                ! ''    0.50000   
179257                                             ! Alas    0.44444   
22936                                         ! Brilliant    0.86111   
40532                                       ! Brilliant !    0.93056   
...                                                   ...        ...   
220441  zoning ordinances to protect your community fr...    0.13889   
179256                                          zzzzzzzzz    0.19444   
220442                                               élan    0.51389   
220443                                                  É    0.50000   
220444                   É um passatempo descompromissado    0.50000   

                 fine    coarse  sp

In [9]:
sentences = pandas.read_csv(os.path.join(base_directory, "datasetSentences.txt"), index_col="sentence_index",
                                sep="\t")


Unnamed: 0_level_0,sentence
sentence_index,Unnamed: 1_level_1
1,The Rock is destined to be the 21st Century 's...
2,The gorgeously elaborate continuation of `` Th...
3,Effective but too-tepid biopic
4,If you sometimes like to go to the movies to h...
5,"Emerges as something rare , an issue movie tha..."
...,...
11851,A real snooze .
11852,No surprises .
11853,We 've seen the hippie-turned-yuppie plot befo...
11854,Her fans walked out muttering words like `` ho...
