# Data Processing

## 0. Introduction to the Dataset Structure
The 'Stanford Sentiment Treebank' dataset provided by the tutors contains 8 files. <br>
The following few files are important:
1. **datasetSentences.txt**: 
2. **datasetSplit.txt**:
3. **SOStr.txt**:
4. **STree.txt**:
5. **dictionary.txt**:
6. **sentiment_labels.txt**:

TODO: describe the structure of each file as we found it, and how the phrases are constructed from the trees.


In [1]:
# import modules
import csv
import os

import numpy as np
import pandas as pd

## 1. Link phrase content and sentiment value by phrase_id

In [2]:
# @author  : Wei Li
# @function: Union 2 files' data by phrase id
def union():
    """Join every phrase with its sentiment value through the phrase id.

    Reads ``stanfordSentimentTreebank/dictionary.txt`` (one
    ``phrase|phrase_id`` record per line) and
    ``stanfordSentimentTreebank/sentiment_labels.txt`` (a header line, then
    one ``phrase_id|sentiment_value`` record per line), and writes the joined
    table to ``ProccessedData/phrase_label.csv``.

    Returns:
        A DataFrame with the columns 'sentiment values' and 'phrase', one row
        per record of sentiment_labels.txt, in that file's order. Both columns
        hold the raw text read from the files.

    Raises:
        ValueError: if a sentiment record names a phrase id that does not
            occur in dictionary.txt.
    """
    # Index the dictionary by phrase id so that every sentiment record is
    # resolved with one hash lookup instead of a scan over the whole list.
    phrase_by_id = {}
    with open('stanfordSentimentTreebank/dictionary.txt') as f:
        for line in f:
            fields = line.strip('\n').split('|', 1)
            # setdefault keeps the first phrase seen for an id, which is the
            # entry list.index() would have returned.
            phrase_by_id.setdefault(fields[1], fields[0])

    # Skip the header line of sentiment_labels.txt.
    with open('stanfordSentimentTreebank/sentiment_labels.txt') as f:
        sentiment_raw = f.readlines()[1:]

    label = []
    phrase = []
    for line in sentiment_raw:
        fields = line.strip('\n').split('|', 1)
        p_id = fields[0]
        if p_id not in phrase_by_id:
            raise ValueError(
                'phrase id {!r} is not in dictionary.txt'.format(p_id))
        label.append(fields[1])
        phrase.append(phrase_by_id[p_id])

    # make columns be a DataFrame table
    uni = pd.DataFrame({'sentiment values': label, 'phrase': phrase})
    uni.to_csv('ProccessedData/phrase_label.csv', index=False)

    # return corresponding phrase content and sentiment value
    return uni

In [3]:
# Build the phrase/label table once. FindLabel and FindLabelSentence read it
# later through the global name `phrase_label`.
phrase_label = union()
# Bare expression: shows the table when this is run as a notebook cell.
phrase_label

Unnamed: 0,sentiment values,phrase
0,0.5,!
1,0.5,'
2,0.44444,' (
3,0.5,' ( the cockettes
4,0.42708,' ( the cockettes )
...,...,...
239227,0.36111,your standard Hollywood bio-pic
239228,0.38889,your typical ` fish out of water ' story
239229,0.33333,zero .
239230,0.88889,zippy jazzy score


## 2. Initialize the Structure of Trees
**Get data by the trees' structure**<br>
In this dataset, each row of the file 'SOStr' represents one sentence tree. <br>
Every single word is a leaf.<br>
Every parent node in a tree represents several leaves of that tree.<br>
The phrases in the dictionary cover every node of every tree (both leaves and parent nodes).<br>
We need to get the content of every node in every tree and the corresponding sentiment value.<br>
In addition, the sentences of the whole dataset are split into 3 subsets (train, test and validation), and each tree represents one sentence.<br>
- Therefore, the steps are:
    - choose a dataset
    - get the sentences in that dataset
    - get the nodes of each tree (selected by level)
    - get the content of the nodes
    - get the corresponding sentiment values

In [4]:
#!/usr/bin/python
# @author  : Wei Li
# @function: based on a tree's structure, get the data we need

def Structure(STree,SOStr):
    """Decode one tree into its leaf-to-root paths.

    Args:
        STree: one line of STree.txt: '|'-separated integers in which entry k
            (1-based) is the node id of the parent of node k, and 0 marks the
            root.
        SOStr: the matching line of SOStr.txt: the '|'-separated words of the
            sentence. Word i (1-based) is leaf node i of the tree.

    Returns:
        A tuple ``(total_chain, height, leaves_num, code, words)``:

        total_chain: float array of shape (leaves_num, height). Row i holds
            the node ids on the path leaf -> parents -> root of leaf i + 1,
            padded on the right with 0 when the path is shorter than the
            longest one.
        height: the number of nodes on the longest leaf-to-root path.
        leaves_num: the number of leaves (words) in the tree.
        code: the STree record as a list of ints.
        words: the SOStr record as a list of strings.
    """
    # the input SOStr and STree is a single corresponding recording in
    # SOStr.txt and STree.txt
    words = SOStr.strip('\n').split('|')
    code = [int(i) for i in STree.strip('\n').split('|')]
    leaves_num = len(words)

    # Walk from every leaf up to the root. Node ids are 1-based while Python
    # lists are 0-based, hence the "- 1" when following a parent pointer.
    chains = []
    for leaf in range(leaves_num):
        chain = [leaf + 1]
        parent = code[leaf]
        while parent > 0:  # a parent id of 0 means the root was just recorded
            chain.append(parent)
            parent = code[parent - 1]
        chains.append(chain)

    # Size the matrix by the longest path actually found rather than by the
    # largest node id: a single-word tree has code == [0], which would give a
    # zero-column array and an IndexError on the first write.
    height = max(len(chain) for chain in chains)
    total_chain = np.zeros((leaves_num, height))
    for row, chain in enumerate(chains):
        total_chain[row, :len(chain)] = chain
    return total_chain, height, leaves_num, code, words

In [5]:
#!/usr/bin/python
# @author  : Wei Li
# @function: get every nodes' words(phrases) in a tree

def NodeWords(total_chain, code, words):
    """Build the phrase spanned by every node of one tree.

    Args:
        total_chain: the leaf-to-root path matrix, one row per leaf.
        code: the parent-pointer list of the tree; its length is the number
            of nodes.
        words: the words of the sentence, one per leaf, in row order.

    Returns:
        A list with one string per node id 1..len(code): the words of all
        leaves whose path passes through that node, joined by single spaces
        in leaf order.
    """
    node_words = []
    for node_id in range(1, len(code) + 1):
        # Column 0 of argwhere is the row, i.e. the leaf whose path
        # contains this node.
        leaf_rows = np.argwhere(total_chain == node_id)[:, 0]
        node_words.append(' '.join(words[row] for row in leaf_rows))
    return node_words

In [6]:
#!/usr/bin/python
# @author  : Wei Li
# @function: get certain level's nodes' content(phrase) of a tree

def ElementData(total_chain, height, leaves_num, code, node_words, n):
    """Pick the phrases found at one level of a tree.

    Args:
        total_chain: the leaf-to-root path matrix, one row per leaf.
        height: the number of levels in the tree.
        leaves_num: the number of leaves, i.e. rows of ``total_chain``.
        code: the parent-pointer list of the tree.
        node_words: the phrase of every node, indexed by node id - 1.
        n: the level to extract. -1 is the whole tree (the sentence), 0 is
            the leaves, 1 is the nodes one step above each leaf, and so on.

    Returns:
        For ``n == -1`` the phrase of the node with the largest id in
        ``code`` (a single string). For ``0 <= n < height`` a list with the
        phrases of the distinct nodes found at step ``n`` of the leaf paths,
        in first-seen order. For any other ``n`` a message is printed and
        None is returned.
    """
    if n == -1:
        return node_words[max(code) - 1]

    if not (0 <= n < height):
        print("The level is out of this tree's height")
        return None

    # Column n holds the node reached after n steps up from each leaf; a 0
    # entry (index -1) means that leaf's path ended before this level.
    picked = []
    for row in range(leaves_num):
        node_index = int(total_chain[row][n] - 1)
        if node_index >= 0 and node_index not in picked:
            picked.append(node_index)
    return [node_words[node_index] for node_index in picked]

In [7]:
#!/usr/bin/python
# @author  : Wei Li
# @function: get certain level's nodes' content(phrase) of all trees

def TotalData(n):
    """Extract the level-``n`` content of every tree in the treebank.

    Reads ``stanfordSentimentTreebank/STree.txt`` and
    ``stanfordSentimentTreebank/SOStr.txt`` and pairs them line by line.

    Args:
        n: the level passed on to ElementData (-1 for whole sentences, 0 for
            leaves, 1 for the nodes one step above each leaf, ...).

    Returns:
        A list with one entry per line of SOStr.txt, each being whatever
        ElementData returned for that tree.
    """
    with open('stanfordSentimentTreebank/STree.txt') as tree_file:
        tree_lines = tree_file.readlines()
    with open('stanfordSentimentTreebank/SOStr.txt') as sentence_file:
        sentence_lines = sentence_file.readlines()

    collected = []
    for row, sentence_line in enumerate(sentence_lines):
        chains, height, leaf_count, parents, tokens = Structure(
            tree_lines[row], sentence_line)
        phrases = NodeWords(chains, parents, tokens)
        collected.append(
            ElementData(chains, height, leaf_count, parents, phrases, n))
    return collected

In [218]:
# # ----------------------- HERE ----------------------- #
# def MakeTree(STree,SOStr):
#     total_chain, height, leaves_num, code, words = Structure(STree,SOStr)
#     class TreeNode:
#         def __init__(self,x):
#             self.value = x
#             self.left = None
#             self.right = None
#     total_chain, height, leaves_num, code, words = Structure(STree,SOStr)
#     Totol_Num = int(max(total_chain[0]))
#     No = []
#     for i in range(1,Totol_Num+1):
#         No.append(str('No'+str(i)))
#     print(No)
#     for i in range(1,Totol_Num+1):
#         eval(str('No'+i)) = TreeNode(i)
#     for i in range(1,Totol_Num):
#         TreeNode(i).left = eval(No[i-1])
# #     for i in range(leaves_num): # rwo
# # #         print('第',i,'行')
# #         for j in range(1,height): # column
# # #             print('第',j,'个')
# #             print(total_chain[i][j])
# #             if total_chain[i][j] != 0:
# #                 if TreeNode(int(total_chain[i][j])).left == None:
# #                     TreeNode(int(total_chain[i][j])).left = TreeNode(int(total_chain[i][j-1]))
# #                 else:
# #                     TreeNode(int(total_chain[i][j])-1).right = TreeNode(int(total_chain[i][j-1]))
#     for i in range(1,13):
#         print(i,':',TreeNode(i).left,TreeNode(i).right,TreeNode(i).value)
        

# SOStr = SOStr.strip('\n')
#     words = SOStr.split('|')
#     # put code value in to list.
#     STree = STree.strip('\n')
#     code = STree.split('|')

# def LRD(STree,SOStr):
#     total_chain, height, leaves_num, code, words = Structure(STree,SOStr)
#     MakeTree(total_chain)

In [217]:
# with open('stanfordSentimentTreebank/STree.txt') as f:
#     STree_raw = f.readlines()
# with open('stanfordSentimentTreebank/SOStr.txt') as f:
#     SOStr_raw = f.readlines()
# total_chain, height, leaves_num, code, words = Structure(STree_raw[133],SOStr_raw[133])  

# MakeTree(STree_raw[133],SOStr_raw[133])  
# print(total_chain)
# # print(len(total_chain))
# # print(height)

## 3. Divide the Dataset

In [8]:
#!/usr/bin/python
# @author  : Wei Li
# @function: divide dataset by datasetSplit.txt

def DivideDataset(n): # n = 1 or n = 2 or n = 3
    """List the sentence ids that belong to one split of the dataset.

    Reads ``stanfordSentimentTreebank/datasetSplit.txt``: a header line, then
    one ``sentence_id,split_label`` record per line.

    Args:
        n: the split label to select (1, 2 or 3).

    Returns:
        A list of the sentence ids (numpy integers) whose split label equals
        ``n``, in file order.
    """
    with open('stanfordSentimentTreebank/datasetSplit.txt') as split_file:
        records = split_file.readlines()[1:]  # drop the header line

    id_column = []
    split_column = []
    for record in records:
        fields = record.strip('\n').split(',', 1)
        id_column.append(fields[0])
        split_column.append(fields[1])

    split_labels = np.array(split_column, dtype='int')
    sentence_ids = np.array(id_column, dtype='int')
    return [sentence_id
            for sentence_id, split_label in zip(sentence_ids, split_labels)
            if split_label == n]

## 4. Choose the needed data by dataset and phrase level

In [9]:
#!/usr/bin/python
# @author  : Wei Li
# @function: connect data we need with corresponding label

def GetDatasetContent(dataset,level):
    """Return the level-``level`` content of every sentence in one split.

    Args:
        dataset: the split label passed to DivideDataset (1, 2 or 3).
        level: the tree level passed to TotalData (-1 for whole sentences,
            0 for leaves, 1 for the nodes one step above each leaf, ...).

    Returns:
        A list with one TotalData entry per sentence of the split, in the
        order DivideDataset lists the sentence ids.
    """
    per_sentence = TotalData(level)
    chosen_ids = DivideDataset(dataset)
    # Sentence ids are 1-based; the TotalData list is 0-based.
    return [per_sentence[sentence_id - 1] for sentence_id in chosen_ids]

In [38]:
def FindLabel(Content):
    """Look up the sentiment value of every phrase in a nested phrase list.

    The lookup table is the module-level DataFrame ``phrase_label`` (columns
    'phrase' and 'sentiment values').

    Args:
        Content: a list with one entry per tree, each entry being a list of
            phrases. ``None`` entries (which ElementData returns for a tree
            that has no nodes at the requested level) are skipped.

    Returns:
        A tuple ``(phrase, label)`` of two flat, equally long lists: every
        phrase of ``Content`` in order, and its sentiment value.

    Raises:
        ValueError: if a phrase does not occur in ``phrase_label``.
    """
    # Index the table once so each phrase costs a hash lookup instead of a
    # scan over the whole table. setdefault keeps the first row of a
    # repeated phrase, which is the row list.index() would have found.
    label_by_phrase = {}
    for text, value in zip(phrase_label['phrase'],
                           phrase_label['sentiment values']):
        label_by_phrase.setdefault(text, value)

    phrase = []
    label = []
    for element in Content:
        if element is None:
            continue
        for text in element:
            if text not in label_by_phrase:
                raise ValueError(
                    '{!r} is not in the phrase table'.format(text))
            phrase.append(text)
            label.append(label_by_phrase[text])
    return phrase,label

def FindLabelSentence(Content):
    """Look up the sentiment value of every sentence in a flat list.

    The lookup table is the module-level DataFrame ``phrase_label`` (columns
    'phrase' and 'sentiment values').

    Args:
        Content: a list of sentences, one string per tree.

    Returns:
        A tuple ``(phrase, label)`` of two equally long lists: the sentences
        of ``Content`` in order, and their sentiment values.

    Raises:
        ValueError: if a sentence does not occur in ``phrase_label``.
    """
    # Index the table once so each sentence costs a hash lookup instead of a
    # scan over the whole table. setdefault keeps the first row of a
    # repeated phrase, which is the row list.index() would have found.
    label_by_phrase = {}
    for text, value in zip(phrase_label['phrase'],
                           phrase_label['sentiment values']):
        label_by_phrase.setdefault(text, value)

    phrase = []
    label = []
    for element in Content:
        if element not in label_by_phrase:
            raise ValueError(
                '{!r} is not in the phrase table'.format(element))
        phrase.append(element)
        label.append(label_by_phrase[element])
    return phrase,label

## 5. Get Specific Data, Save Them into Files

In [18]:
# dataset 1 (the sentences whose split label in datasetSplit.txt is 1)
# Level 0: the leaves (single words) of every tree in the split.
Singal_word_1 = GetDatasetContent(1,0)
# Level 1: the nodes one step above each leaf.
Level1_1 = GetDatasetContent(1,1)
# Level -1: the whole tree, i.e. the full sentence.
Sentences_1 = GetDatasetContent(1,-1)

In [37]:
# Label every whole sentence of dataset 1 and save the (label, phrase) pairs.
phrase_s1,label_s1 = FindLabelSentence(Sentences_1)
df= pd.DataFrame({'label':label_s1,'phrase':phrase_s1})
df.to_csv("ProccessedData/sentence_dataset1.csv",index=False)

In [40]:
# Label every single word (level 0) of dataset 1 and save the
# (label, phrase) pairs.
phrase_01,label_01 = FindLabel(Singal_word_1)
df= pd.DataFrame({'label':label_01,'phrase':phrase_01})
df.to_csv("ProccessedData/level0_dataset1.csv",index=False)

## 6. Clean Data

In [41]:
def clean_word(Doc_Name: str) -> None:
    """Remove stop-word rows from a (label, phrase) CSV file.

    Every file in the ``stopwords/`` directory is read as a stop-word list
    with one entry per line (surrounding whitespace stripped). Rows of
    ``Doc_Name`` whose 'phrase' is exactly one of those entries are dropped;
    the comparison is case-sensitive.

    The remaining 'label' and 'phrase' columns are written next to the input
    as ``<Doc_Name without its last 4 characters>_cleaned.csv``, so the name
    is expected to end in a 4-character extension such as '.csv'.

    Args:
        Doc_Name: path of the CSV file to clean; it must have the columns
            'label' and 'phrase'.
    """
    # Gather all lists into one set. Membership in a set matches whole words
    # only; testing against the lists joined into one string would also drop
    # every word that is merely a substring of a stop word ("he" in "the").
    stop_words = set()
    for list_name in os.listdir("stopwords/"):
        with open('stopwords/' + list_name, 'r') as stop_file:
            stop_words.update(line.strip() for line in stop_file)

    # Read 'phrase' as text and switch off NaN detection so that words such
    # as "null" or "NA" and number-like tokens stay comparable strings.
    # NOTE(review): encoding='gbk' is kept from the original, while
    # clean_repetition reads with the default encoding - confirm which one
    # the input files really use.
    raw_data = pd.read_csv(Doc_Name, encoding='gbk',
                           dtype={'phrase': str}, keep_default_na=False)

    # stop word clean
    is_stop_word = raw_data['phrase'].isin(stop_words)
    cleaned = raw_data.loc[~is_stop_word, ['label', 'phrase']]

    # save cleaned file
    cleaned.to_csv(Doc_Name[:-4]+'_cleaned.csv',index=False)

In [42]:
def clean_repetition(Doc_Name: str) -> None:
    """Remove rows with a repeated phrase from a (label, phrase) CSV file.

    Only the first row of every distinct 'phrase' is kept, together with its
    'label'. The result is written next to the input as
    ``<Doc_Name without its last 4 characters>_cle_rep.csv``, so the name is
    expected to end in a 4-character extension such as '.csv'.

    Args:
        Doc_Name: path of the CSV file to de-duplicate; it must have the
            columns 'label' and 'phrase'.
    """
    raw_data = pd.read_csv(Doc_Name) #encoding='gbk'
    # drop_duplicates is a single hash-based pass; testing every phrase
    # against a growing list is quadratic in the number of rows.
    first_rows = raw_data.drop_duplicates(subset='phrase', keep='first')
    first_rows[['label', 'phrase']].to_csv(Doc_Name[:-4]+'_cle_rep.csv',index=False)