### 1, Load and extract the data

In [1]:
### extracting the entities and tags

import os 
from keras.preprocessing.sequence import pad_sequences


def split_text_label(filename):
    '''
    Reads a file named filename, extracts the text and the labels and stores
    them in an array.
     
    returns [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] 
    '''
    
    # open file
    f = open(filename)
    
    # initializing
    split_labeled_text = []
    sentence = []
    
    # processing line by line 
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\t":
            if len(sentence) > 0:
                split_labeled_text.append(sentence)
                sentence = []
            continue
        
        # split by tab
        splits = line.split('	')
        # rstrip: strip from the right 
        sentence.append([splits[0],splits[-1].rstrip("\n")])
        
        
    if len(sentence) > 0:
        split_labeled_text.append(sentence)
        sentence = []
    return split_labeled_text

# change training set 
split_train = split_text_label(os.path.join("train_10_25.txt"))
split_valid = split_text_label(os.path.join("valid_10_25.txt"))
split_test = split_text_label(os.path.join("test_10_25.txt"))

In [2]:
# print some list to see 

print("train.txt: ", split_train[0:2])
print()

print("valid.txt: ", split_valid[0:1])
print()

print("test.txt: ", split_test[0:1])
print()

train.txt:  [[['literature', 'O'], ['linking', 'O'], ['the', 'O'], ['effects', 'O'], ['of', 'O'], ['prepartum', 'B-PER'], ['nutrition', 'B-NUT'], ['and', 'O'], ['subsequent', 'O'], ['fertility', 'O'], ['is', 'O'], ['scarce', 'O']], [['most', 'O'], ['of', 'O'], ['what', 'O'], ['is', 'O'], ['suggested', 'O'], ['to', 'O'], ['optimize', 'O'], ['future', 'O'], ['fertility', 'O'], ['is', 'O'], ['related', 'O'], ['to', 'O'], ['relationships', 'O'], ['between', 'O'], ['metabolic', 'B-DIS'], ['disorders', 'I-DIS'], ['and', 'O'], ['risk', 'O'], ['for', 'O'], ['delayed', 'B-DIS'], ['conception', 'I-DIS']]]

valid.txt:  [[['we', 'O'], ['will', 'O'], ['see', 'O'], ['in', 'B-PER'], ['the', 'O'], ['next', 'O'], ['sections', 'O'], ['some', 'O'], ['of', 'O'], ['the', 'O'], ['etiology', 'O'], ['and', 'O'], ['preventive', 'O'], ['measures', 'O'], ['for', 'O'], ['these', 'O'], ['problems', 'O'], ['and', 'O'], ['the', 'O'], ['nutritional', 'O'], ['considerations', 'O'], ['related', 'O'], ['to', 'O'], ['the

In [4]:
# only take five sentences for testing

split_train = split_train[0:5]

print(split_train)

[[['literature', 'O'], ['linking', 'O'], ['the', 'O'], ['effects', 'O'], ['of', 'O'], ['prepartum', 'B-PER'], ['nutrition', 'B-NUT'], ['and', 'O'], ['subsequent', 'O'], ['fertility', 'O'], ['is', 'O'], ['scarce', 'O']], [['most', 'O'], ['of', 'O'], ['what', 'O'], ['is', 'O'], ['suggested', 'O'], ['to', 'O'], ['optimize', 'O'], ['future', 'O'], ['fertility', 'O'], ['is', 'O'], ['related', 'O'], ['to', 'O'], ['relationships', 'O'], ['between', 'O'], ['metabolic', 'B-DIS'], ['disorders', 'I-DIS'], ['and', 'O'], ['risk', 'O'], ['for', 'O'], ['delayed', 'B-DIS'], ['conception', 'I-DIS']], [['common', 'O'], ['metabolic', 'B-DIS'], ['problems', 'I-DIS'], ['that', 'O'], ['affect', 'O'], ['early', 'O'], ['postpartum', 'B-PER'], ['cows', 'O'], ['such', 'O'], ['as', 'O'], ['retained', 'B-DIS'], ['fetal', 'I-DIS'], ['membranes', 'I-DIS'], ['milk', 'B-DIS'], ['fever', 'I-DIS'], ['ketosis', 'I-DIS'], ['and', 'O'], ['displaced', 'B-DIS'], ['abomasum', 'I-DIS'], ['are', 'O'], ['know', 'O'], ['to', 'O'

In [5]:
# try to access nested list 

print(split_train[0][0][0], split_train[0][0][1])

literature O


### 2, Load and extract the data from the excel 

In [13]:
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile

import pandas as pd

df = pd.read_excel('Tagged_Corpus.xlsx') #you could add index_col=0 if there's an index

words = df['literature'].tolist()
tags = df['O'].tolist()

# print 
print("words: ", words[0:20])
print()
print("tags: ", tags[0:20])

words:  ['linking', 'the', 'effects', 'of', 'prepartum', 'nutrition', 'and', 'subsequent', 'fertility', 'is', 'scarce', nan, 'most', 'of', 'what', 'is', 'suggested', 'to', 'optimize', 'future']

tag:  ['O', 'O', 'O', 'O', 'B-PER', 'B-NUT', 'O', 'O', 'O', 'O', 'O', nan, 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [21]:
### try split by 'nan'

# str and split
words = str(words)
words_join = words.split('nan')
tags = str(tags)
tags_join = tags.split('nan')


# print and check 
print("words_join: ", words_join[0:5])
print()
print("tagss_join: ", tags_join[0:5])


# turn to the list 

words_join:  ["['linking', 'the', 'effects', 'of', 'prepartum', 'nutrition', 'and', 'subsequent', 'fertility', 'is', 'scarce', ", ", 'most', 'of', 'what', 'is', 'suggested', 'to', 'optimize', 'future', 'fertility', 'is', 'related', 'to', 'relationships', 'between', 'metabolic', 'disorders', 'and', 'risk', 'for', 'delayed', 'conception', ", ", 'common', 'metabolic', 'problems', 'that', 'affect', 'early', 'postpartum', 'cows', 'such', 'as', 'retained', 'fetal', 'membranes', 'milk', 'fever', 'ketosis', 'and', 'displaced', 'abomasum', 'are', 'know', 'to', 'extend', 'the', 'period', 'of', 'negative', 'energy', 'balance', 'and', 'delay', 'resumption', 'of', 'ovarian', 'cycles', ", ", 'manipulation', 'of', 'the', 'energy', 'content', 'of', 'the', 'diet', 'prepartum', 'has', 'been', 'shown', 'to', 'affect', 'dry', 'matter', 'intake', 'hayirli', 'et', 'al', ", ", '2002', 'and', 'postpartum', 'lactational', 'performance', "]

tagss_join:  ["['O', 'O', 'O', 'O', 'B-PER', 'B-NUT', 'O', 'O', 'O', '

In [23]:
### turning into list 

len(words_join) == len(tags_join)

False

In [27]:
### 1, Try adapt the NER extraction code 

def split_text_label(filename):
    '''
    Reads a file named filename, extracts the text and the labels and stores
    them in an array.
     
    returns [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] 
    '''
    
    # open file
    f = open(filename)
    
    # initializing
    sentences = []
    sentence = []
    tag = []
    tags = []
    
    # processing line by line 
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\t":
            if len(sentence) > 0:
                sentences.append(sentence)
                sentence = []
                tags.append(tag)
                tag = []
            continue
        
        # split by tab
        splits = line.split('	')
        # rstrip: strip from the right 
        sentence.append([splits[0].rstrip("\n")])
        tag.append([splits[-1].rstrip("\n")])
        
    if len(sentence) > 0:
        sentences.append(sentence)
        sentence = []
        tags.append(tag)
        tag = []
    return sentences, tags

# change training set 
sentences, tags = split_text_label(os.path.join("train_10_25.txt"))

# print and check 
print("sentences: ", sentences[0:5])
print()
print("tags: ", tags[0:5])

sentences:  [[['literature'], ['linking'], ['the'], ['effects'], ['of'], ['prepartum'], ['nutrition'], ['and'], ['subsequent'], ['fertility'], ['is'], ['scarce']], [['most'], ['of'], ['what'], ['is'], ['suggested'], ['to'], ['optimize'], ['future'], ['fertility'], ['is'], ['related'], ['to'], ['relationships'], ['between'], ['metabolic'], ['disorders'], ['and'], ['risk'], ['for'], ['delayed'], ['conception']], [['common'], ['metabolic'], ['problems'], ['that'], ['affect'], ['early'], ['postpartum'], ['cows'], ['such'], ['as'], ['retained'], ['fetal'], ['membranes'], ['milk'], ['fever'], ['ketosis'], ['and'], ['displaced'], ['abomasum'], ['are'], ['know'], ['to'], ['extend'], ['the'], ['period'], ['of'], ['negative'], ['energy'], ['balance'], ['and'], ['delay'], ['resumption'], ['of'], ['ovarian'], ['cycles']], [['manipulation'], ['of'], ['the'], ['energy'], ['content'], ['of'], ['the'], ['diet'], ['prepartum'], ['has'], ['been'], ['shown'], ['to'], ['affect'], ['dry'], ['matter'], ['in

In [34]:
### check and print the len

print(len(sentences) == len(tags))

print(len(sentences))

True
301


# 3, Try change the elements in sentences by tags: The indexing method 

### 1, Get vocab

In [38]:
### use labelSet() as label dictionary 
### use wordSet() as word dictionary 

labelSet = set()
wordSet = set()

# words and labels
for sents in sentences:
    for sent in sents:
        for word in sent:
            wordSet.add(word.lower())
            
# words and labels
for sents in tags:
    for sent in sents:
        for label in sent:            
            labelSet.add(label)
            


# modifying the label set 
print()
#empty = labelSet.pop()
#labelSet.remove('')
#labelSet.remove('BI-DIS')
#labelSet.remove('i-NUT')

# check for the len 
print("labelSet: ", len(labelSet))
print("labelSet: ", labelSet)
print()
print("wordSet: ", len(wordSet))



labelSet:  8
labelSet:  {'', 'B-DIS', 'B-PER', 'O', 'I-PER', 'B-NUT', 'I-DIS', 'I-NUT'}

wordSet:  1535


### 2, Assgining index 

In [39]:
### label2Idx  word2Idx


# sort the set to ensure '0' is assigned to 0
sorted_labels = sorted(list(labelSet), key=len)


# create mapping for labels
label2Idx = {}
for label in sorted_labels:
    label2Idx[label] = len(label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}


# create mapping for words
word2Idx = {}
if len(word2Idx) == 0:
    word2Idx["PADDING_TOKEN"] = len(word2Idx) # as 0
    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx) # as 1
for word in wordSet:
    word2Idx[word] = len(word2Idx)
    
# print some mapping 

print("idx2Label: ", len(idx2Label))
print("idx2Label: ", idx2Label)
print()
print("word2Idx: ", len(word2Idx))
print("word2Idx: ", word2Idx)

idx2Label:  8
idx2Label:  {0: '', 1: 'O', 2: 'B-DIS', 3: 'B-PER', 4: 'I-PER', 5: 'B-NUT', 6: 'I-DIS', 7: 'I-NUT'}

word2Idx:  1537
word2Idx:  {'PADDING_TOKEN': 0, 'UNKNOWN_TOKEN': 1, '': 2, 'content': 3, 'displacement': 4, 'urea': 5, 'extend': 6, 'dryoff': 7, 'ufl': 8, 'lactation': 9, 'libitum': 10, 'synthesis': 11, 'articles': 12, 'my': 13, 'want': 14, 'head': 15, 'abomasums ': 16, 'criteria': 17, 'explained': 18, 'prolonged': 19, 'effectiveness': 20, 'respectively': 21, 'step': 22, 'represent': 23, 'diameter': 24, 'accurate': 25, 'tried': 26, 'composed': 27, 'raizman': 28, 'five': 29, 'anionic': 30, 'culling': 31, 'alkalosis': 32, 'manipulation': 33, 'subsample': 34, 'responsible': 35, 'to': 36, 'gd': 37, 'summarized': 38, 'recommended': 39, 'ideally': 40, 'andor': 41, 'keeping': 42, 'calculated': 43, 'adult': 44, 'personnel': 45, 'but': 46, 'lameness': 47, 'aspects': 48, 'perform': 49, 'engagement': 50, 'be': 51, 'confirm': 52, 'through': 53, 'c': 54, 'clean': 55, 'may': 56, 'scient

### 3, Changing words into indexs

In [46]:
def createMatrices(line, tag, word2Idx, label2Idx):
    sentences = []
    labels = []
    
    wordIndices = []
    labelIndices = []
    
    # for the sentences
    for sents in line:
        for sent in sents:
            for word in sent:
                # if is in the vocabulary
                if word in word2Idx:
                    wordIdx = word2Idx[word]
                # if the lower case version is in the vocabulary
                elif word.lower() in word2Idx:
                    wordIdx = word2Idx[word.lower()]
                # if not, assign to the unknown token 
                else:                
                    wordIdx = word2Idx['UNKNOWN_TOKEN']
                # assign to the corrsponding index
                wordIndices.append(wordIdx)
                
                
    # for the labels
    for sents in tag:
        for sent in sents:
            for word in sent:
                # if is in the vocabulary
                if word in word2Idx:
                    # fixing a bug of '' 
                    if label != '':
                        labelIndices.append(label2Idx[label])
    

                
    # append the index to sentences
    sentences.append(wordIndices)
    labels.append(labelIndices)
    return sentences, labels

train_sentences, train_labels = createMatrices(sentences, tags, word2Idx, label2Idx)

# print to check 
print("train_sentences: ", train_sentences[0:1])
print()
print("train_labels: ", train_labels[0:1])

train_sentences:  [[85, 726, 307, 757, 244, 528, 1084, 727, 1017, 1420, 905, 232, 385, 244, 1280, 905, 1046, 36, 827, 944, 1420, 905, 1437, 36, 490, 822, 1497, 1275, 727, 456, 1170, 984, 952, 202, 1497, 591, 866, 250, 429, 700, 1156, 805, 386, 1309, 1116, 187, 1347, 1376, 938, 727, 754, 1466, 951, 1292, 36, 6, 307, 163, 244, 443, 1488, 1246, 727, 301, 1444, 244, 851, 709, 33, 244, 307, 1488, 3, 244, 307, 872, 528, 585, 444, 560, 36, 250, 364, 126, 1293, 1136, 848, 1068, 718, 727, 700, 586, 990, 1156, 677, 506, 537, 1488, 1483, 528, 1176, 573, 1488, 1246, 888, 1360, 244, 545, 323, 1054, 58, 727, 1005, 727, 888, 862, 935, 1271, 307, 275, 1520, 275, 1159, 429, 700, 585, 444, 1470, 521, 267, 1248, 1271, 307, 700, 1336, 163, 1109, 848, 1068, 1229, 727, 888, 685, 990, 1271, 493, 1264, 1156, 208, 848, 1068, 1229, 1244, 905, 1281, 36, 914, 764, 307, 443, 1237, 244, 862, 935, 1271, 307, 275, 1520, 306, 700, 1444, 244, 851, 709, 905, 622, 255, 630, 159, 1425, 244, 443, 1488, 1246, 81, 1483, 866,

### why don't just use the previous code 

# 3.2, Try change the elements in sentences by tags: The indexing method 

In [47]:
### extracting the entities and tags

import os 
from keras.preprocessing.sequence import pad_sequences


def split_text_label(filename):
    '''
    Reads a file named filename, extracts the text and the labels and stores
    them in an array.
     
    returns [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] 
    '''
    
    # open file
    f = open(filename)
    
    # initializing
    split_labeled_text = []
    sentence = []
    
    # processing line by line 
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\t":
            if len(sentence) > 0:
                split_labeled_text.append(sentence)
                sentence = []
            continue
        
        # split by tab
        splits = line.split('	')
        # rstrip: strip from the right 
        sentence.append([splits[0],splits[-1].rstrip("\n")])
        
        
    if len(sentence) > 0:
        split_labeled_text.append(sentence)
        sentence = []
    return split_labeled_text

# change training set 
split_train = split_text_label(os.path.join("train_10_25.txt"))
split_valid = split_text_label(os.path.join("valid_10_25.txt"))
split_test = split_text_label(os.path.join("test_10_25.txt"))

In [48]:
### use labelSet() as label dictionary 
### use wordSet() as word dictionary 

labelSet = set()
wordSet = set()
# words and labels
for data in [split_train, split_valid, split_test]:
    for labeled_text in data:
        for word, label in labeled_text:
            
            # modifying unwanted mistag
            if label == 'i-NUT':
                label = 'I-NUT'
            if label == '':
                label = 'O'
            
            
            labelSet.add(label)
            wordSet.add(word.lower())
            


# modifying the label set 
print()
#empty = labelSet.pop()
#labelSet.remove('')
#labelSet.remove('BI-DIS')
#labelSet.remove('i-NUT')

# check for the len 
print("labelSet: ", len(labelSet))
print("labelSet: ", labelSet)
print()
print("wordSet: ", len(wordSet))


labelSet:  7
labelSet:  {'B-DIS', 'B-PER', 'O', 'I-PER', 'B-NUT', 'I-DIS', 'I-NUT'}

wordSet:  2012


In [49]:
### label2Idx  word2Idx


# sort the set to ensure '0' is assigned to 0
sorted_labels = sorted(list(labelSet), key=len)


# create mapping for labels
label2Idx = {}
for label in sorted_labels:
    label2Idx[label] = len(label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}


# create mapping for words
word2Idx = {}
if len(word2Idx) == 0:
    word2Idx["PADDING_TOKEN"] = len(word2Idx) # as 0
    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx) # as 1
for word in wordSet:
    word2Idx[word] = len(word2Idx)
    
# print some mapping 

print("idx2Label: ", len(idx2Label))
print("idx2Label: ", idx2Label)
print()
print("word2Idx: ", len(word2Idx))
print("word2Idx: ", word2Idx)

idx2Label:  7
idx2Label:  {0: 'O', 1: 'B-DIS', 2: 'B-PER', 3: 'I-PER', 4: 'B-NUT', 5: 'I-DIS', 6: 'I-NUT'}

word2Idx:  2014
word2Idx:  {'PADDING_TOKEN': 0, 'UNKNOWN_TOKEN': 1, 'content': 2, 'displacement': 3, 'urea': 4, 'extend': 5, 'dryoff': 6, 'ufl': 7, '\n': 8, 'lactation': 9, 'libitum': 10, 'vs': 11, 'maternal': 12, 'overall': 13, 'synthesis': 14, 'articles': 15, 'reduces': 16, 'my': 17, 'supplying': 18, 'want': 19, 'head': 20, 'abomasums ': 21, 'seeing': 22, 'criteria': 23, 'explained': 24, 'prolonged': 25, 'effectiveness': 26, 'respectively': 27, 'step': 28, 'those': 29, 'accrued': 30, 'represent': 31, 'diameter': 32, 'accurate': 33, 'tried': 34, 'composed': 35, 'raizman': 36, 'five': 37, 'anionic': 38, 'culling': 39, 'curve': 40, 'alkalosis': 41, '34': 42, 'recognize': 43, 'halt': 44, 'manipulation': 45, 'subsample': 46, 'responsible': 47, 'to': 48, 'gd': 49, 'summarized': 50, 'recommended': 51, 'ideally': 52, 'dynamics': 53, 'andor': 54, 'keeping': 55, 'calculated': 56, 'adult'

In [84]:
def createMatrices(data, word2Idx, label2Idx):
    sentences = []
    labels = []
    
    # get data 
    for split_labeled_text in data:
        wordIndices = []
        labelIndices = []
        
        # get word and data
        for word, label in split_labeled_text:
            
            # if is in the vocabulary
            if word in word2Idx:
                wordIdx = word2Idx[word]
                
            # if the lower case version is in the vocabulary
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()] 
                
            # if not, assign to the unknown token 
            else:                
                wordIdx = word2Idx['UNKNOWN_TOKEN']
                
            # assign to the corrsponding index
            wordIndices.append(wordIdx)
            
            # fixing a bug of '' 
            if label != '':
                labelIndices.append(label2Idx[label])
    
        # append the index to sentences
        sentences.append(wordIndices)
        labels.append(labelIndices)
    return sentences, labels

train_sentences, train_labels = createMatrices(split_train, word2Idx, label2Idx)
valid_sentences, valid_labels = createMatrices(split_valid, word2Idx, label2Idx)
test_sentences, test_labels = createMatrices(split_test, word2Idx, label2Idx)

# print to check 

print("train_sentences, train_labels: ", train_sentences[0:5], train_labels[0:5])
print()
# print("valid_sentences, valid_labels: ", valid_sentences[0:5], valid_labels[0:5])
print()
# print("test_sentences, test_labels: ", test_sentences[0:5], test_labels[0:5])

train_sentences, train_labels:  [[108, 963, 396, 1004, 312, 700, 1449, 964, 1362, 1872, 1214, 294], [508, 312, 1683, 1214, 1401, 48, 1099, 1264, 1872, 1214, 1891, 48, 647, 1092, 1966, 1677, 964, 607, 1552, 1314, 1275], [258, 1966, 791, 1160, 319, 564, 931, 1538, 1075, 510, 1723, 1490, 236, 1774, 1811, 1257, 964, 1000, 1926, 1274, 1702, 48, 5, 396, 209, 312, 584, 1952, 1642, 964, 388, 1900, 312, 1141, 942], [45, 312, 396, 1952, 2, 312, 396, 1168, 700, 782, 586, 746, 48, 319, 477, 164, 1703, 1511, 1129, 1429], [953, 964, 931, 783, 1321]] [[0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 1, 5], [0, 1, 5, 0, 0, 0, 2, 0, 0, 0, 1, 5, 5, 1, 5, 5, 0, 1, 5, 0, 0, 0, 0, 0, 2, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 4, 0, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0], [0, 0, 2, 0, 0]]




In [81]:
### reshow some info of about the training set 

# about the label
print("idx2Label: ", len(idx2Label))
print("idx2Label: ", idx2Label)
print()

# the length of total word vocab
print("word2Idx: ", len(word2Idx))

idx2Label:  7
idx2Label:  {0: 'O', 1: 'B-DIS', 2: 'B-PER', 3: 'I-PER', 4: 'B-NUT', 5: 'I-DIS', 6: 'I-NUT'}

word2Idx:  2014


In [86]:
### resign the number for labels 

"""# turn to str 
train_labels = str(train_labels)
"""

for sentences in train_labels:
        for i, word in enumerate(sentences):
            word = str(word)
            if word == '5' or word == '7':
                sentence[i] = 10000
            if word == '1' or word == '6':
                sentence[i] = 100000
            if word == '2' or word == '4':
                sentence[i] = 1000000
            #if word == '3' or word == '':
            #    sentence[i] = 0 


train_labels   

IndexError: list assignment index out of range

In [15]:
# cannot assign int

# trying with 'tags'

for sentences in tags:
    for sentence in sentences:
        for i, word in enumerate(sentence):
            if word == 'B-NUT' or word == 'I-NUT':
                sentence[i] = 10000
            if word == 'B-DIS' or word == 'I-DIS':
                sentence[i] = 100000
            if word == 'B-PER' or word == 'I-PER':
                sentence[i] = 1000000
            if word == 'O' or word == '':
                sentence[i] = 0 
            #else: 
            #    word[i] = 0
            
# print new tags
print("new tags: ", tags[0:5])

# try format the tags, reduce a layer of list


NameError: name 'tags' is not defined

In [38]:
print("re_corpus: ", re_corpus[0:10])

re_corpus:  ['literature\n', 'linking\n', 'the\n', 'effects\n', 'of\n', 'prepartum\n', 'nutrition\n', 'and\n', 'subsequent\n', 'fertility\n']


In [None]:
### remove '\n'

import re

# re_corpus = str(re_corpus)
# re_corpus = re_corpus.strip().replace('\n', '')
re_corpus = re_corpus[0:10]

for i in re_corpus:
    r = i.strip()
    re_cor = ''.join(r)
    re_corpus += re_cor
    


In [None]:
print("re_corpus: ", re_corpus[0:100])

### Transfer the 'RE_Corpus' into sentences

In [1]:
import os

def split_text_label(filename):
    '''
    Reads a file named filename, extracts the text and the labels and stores
    them in an array.
     
    returns [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] 
    '''
    
    # open file
    f = open(filename)
    
    # initializing
    split_labeled_text = []
    sentence = []
    
    # processing line by line 
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\t":
            if len(sentence) > 0:
                split_labeled_text.append(sentence)
                sentence = []
            continue
        
        # split by tab
        splits = line.split('	')
        # rstrip: strip from the right 
        sentence.append([splits[0].rstrip("\n")])
        
        
    if len(sentence) > 0:
        split_labeled_text.append(sentence)
        sentence = []
    return split_labeled_text

# change training set 
split_train = split_text_label(os.path.join("RE_Corpus.txt"))

In [2]:
# using itertools

import itertools

merged1 = list(itertools.chain(*split_train))
merged2 = list(itertools.chain(*merged1))



In [3]:
print("merged: ", merged2[0:100])

# the sentence 

merged3 = ' '.join(merged2)
merged3

merged:  ['literature', 'linking', 'the', 'effects', 'of', '<e1>', 'prepartum', 'nutrition', '</e1>', 'and', 'subsequent', '<e2>', 'fertility', '</e2>', 'is', 'scarce', '', 'most', 'of', 'what', 'is', 'suggested', 'to', 'optimize', 'future', 'fertility', 'is', 'related', 'to', 'relationships', 'between', '<e1>', 'metabolic', 'disorders', '</e1>', 'and', 'risk', 'for', '<e2>', 'delayed', 'conception', '</e2>', '', 'common', '<e1>', 'metabolic', 'problems', '</e1>', 'that', 'affect', 'early', 'postpartum', 'cows', 'such', 'as', 'retained', 'fetal', 'membranes', 'milk', 'fever', 'ketosis', 'and', 'displaced', 'abomasum', 'are', 'know', 'to', 'extend', 'the', 'period', 'of', '<e2>', 'negative', 'energy', 'balance', '</e2>', 'and', 'delay', 'resumption', 'of', 'ovarian', 'cycles', '', 'cows', 'fed', '<e1>', 'high', 'fermentable', 'energy', '</e1>', 'diets', 'prepartum', 'have', 'improved', 'energy', 'balance', 'reduced', 'concentrations', 'of', 'plasma']


'literature linking the effects of <e1> prepartum nutrition </e1> and subsequent <e2> fertility </e2> is scarce  most of what is suggested to optimize future fertility is related to relationships between <e1> metabolic disorders </e1> and risk for <e2> delayed conception </e2>  common <e1> metabolic problems </e1> that affect early postpartum cows such as retained fetal membranes milk fever ketosis and displaced abomasum are know to extend the period of <e2> negative energy balance </e2> and delay resumption of ovarian cycles  cows fed <e1> high fermentable energy </e1> diets prepartum have improved energy balance reduced concentrations of plasma <e2> nonesterified fatty acids </e2> and hydroxybutyrate and reduced triacylglycerol infiltration in the hepatic tissue  cows fed <e1> high fermentable energy </e1> diets prepartum have improved energy balance reduced concentrations of plasma nonesterified fatty acids and hydroxybutyrate and reduced <e2> triacylglycerol infiltration </e2> in t

In [4]:
# wirte to a list 

f = open("re_corpus_test3_3_3.txt", "a")
f.write(merged3)
f.close()