In [29]:
import re
import pandas as pd
import os
import numpy as np
import keras

class SentiDataLoader:
    """Load sentiment CSV files and serve them as character-index batches.

    Every text is split into words, each word is normalised (surrounding
    punctuation trimmed, tokens containing '#' replaced by 'topic', tokens
    containing an http(s) URL replaced by 'links', everything else
    lower-cased) and then encoded character by character using the position
    of each character in ``char_list``. Characters that are not in
    ``char_list`` are reported on stdout and encoded as 0.

    Attributes set by ``load_data``:
        train_data / test_data: list of sentences; a sentence is a list of
            words; a word is a list of int character indices.
        train_label / test_label: list holding the first CSV column per row.
        max_sent_len: largest number of words in any sentence (0 if none).
        max_char_num: largest number of characters in any word (0 if none).
    """

    # Trailing run of non-letters (punctuation and digits at the end of a token).
    _TRAILING_NON_ALPHA = re.compile(r"[^A-Za-z]*$")
    # Leading run of non-letters; '@' and '#' are deliberately preserved.
    _LEADING_NON_ALPHA = re.compile(r"^[^A-Za-z@#]*")
    # http:// or https:// anywhere in the token, case-insensitive.
    _URL = re.compile(r"https?://", re.I)
    # Tokens are separated by runs of commas, newlines and spaces.
    _SEPARATORS = re.compile(r"[,\n ]+")

    def __init__(self, dirpath, trainfile, testfile, char_list):
        """Remember where the data lives and which characters are known.

        Args:
            dirpath: directory that contains both CSV files.
            trainfile: file name of the training CSV, relative to ``dirpath``.
            testfile: file name of the test CSV, relative to ``dirpath``.
            char_list: sequence of characters; a character's position in this
                sequence is the integer it is encoded as.
        """
        self.dirpath = dirpath
        self.trainfile = trainfile
        self.testfile = testfile
        self.s = char_list

    @classmethod
    def _normalize_word(cls, word):
        """Trim punctuation from a raw token and map it to its canonical form."""
        # Order matters: strip the tail first, then the head.
        word = cls._LEADING_NON_ALPHA.sub("", cls._TRAILING_NON_ALPHA.sub("", word))
        if not word:
            return ''
        if '#' in word:
            return 'topic'
        # The previous pattern `http?://` could never match "https://".
        if cls._URL.search(word):
            return 'links'
        # Tokens containing '@' are kept verbatim (only lower-cased).
        return word.lower()

    @classmethod
    def _tokenize(cls, line):
        """Split one text into normalised, non-empty words."""
        if not isinstance(line, str):
            # pandas yields a float NaN for an empty cell; treat it as no words.
            return []
        candidates = (cls._normalize_word(tok) for tok in cls._SEPARATORS.split(line.strip()))
        return [tok for tok in candidates if tok]

    @staticmethod
    def _encode_word(word, charVocab):
        """Translate a word into char indices; unknown characters become 0."""
        codes = []
        for char in word:
            code = charVocab.get(char)
            if code is None:
                print("Unknown char:", char)
                print("word:", word)
                code = 0
            codes.append(code)
        return codes

    @classmethod
    def _read_csv(cls, filepath):
        """Read a CSV and return (tokenised texts, labels).

        The label is taken from the first column and the text from the last.
        """
        print(filepath)
        # NOTE(review): read_csv infers a header from the first row, so the
        # first record of a header-less file is consumed as column names.
        # Confirm whether the input files carry a header row.
        df = pd.read_csv(filepath, encoding='latin-1')
        labels = df.iloc[:, 0].tolist()
        texts = [cls._tokenize(text) for text in df.iloc[:, -1]]
        return texts, labels

    def _load_split(self, filename, charVocab):
        """Load one CSV under ``dirpath`` and encode every word of every text."""
        texts, labels = self._read_csv(os.path.join(self.dirpath, filename))
        encoded = [[self._encode_word(word, charVocab) for word in sentence]
                   for sentence in texts]
        return encoded, labels

    def load_data(self):
        """Read the train and test files and populate the data attributes.

        Raises whatever ``pandas.read_csv`` raises for a missing or malformed
        file.
        """
        # charVocab is a char-to-idx dictionary: {char: idx}
        charVocab = {c: idx for (idx, c) in enumerate(self.s)}
        self.train_data, self.train_label = self._load_split(self.trainfile, charVocab)
        self.test_data, self.test_label = self._load_split(self.testfile, charVocab)

        splits = (self.train_data, self.test_data)
        # default=0 keeps this from failing on an empty file or on a text
        # whose tokens were all stripped away (e.g. only digits/punctuation).
        self.max_sent_len = max(
            (len(sent) for split in splits for sent in split), default=0)
        self.max_char_num = max(
            (len(word) for split in splits for sent in split for word in sent),
            default=0)

    @staticmethod
    def _pad_words(sentence, max_len):
        """Right-pad/truncate every word of a sentence to ``max_len`` indices."""
        return keras.preprocessing.sequence.pad_sequences(
            sentence,
            maxlen=max_len,
            dtype='int32',
            padding='post',
            truncating='post',
            value=0.0
        )

    @staticmethod
    def _stack_sentences(padded):
        """Combine per-sentence nested lists into one array.

        Sentences with equal word counts stack into a regular array. When the
        word counts differ, a 1-D object array holding one nested list per
        sentence is built explicitly, because ``np.array`` on ragged input
        raises on recent NumPy versions.
        """
        if len({len(sentence) for sentence in padded}) <= 1:
            return np.array(padded)
        ragged = np.empty(len(padded), dtype=object)
        for row, sentence in enumerate(padded):
            ragged[row] = sentence
        return ragged

    def _make_batch(self, data, labels, batchId, batchsize, max_word_len):
        """Build one (inputs, one-hot labels) batch from an encoded split."""
        startIdx = batchId * batchsize
        # Wrap around so that the last batch is always full.
        ids = [idx % len(data) for idx in range(startIdx, startIdx + batchsize)]
        data_x = self._stack_sentences(
            [self._pad_words(data[idx], max_word_len).tolist() for idx in ids])
        data_y = np.zeros([batchsize, 3], dtype=np.int32)
        # Index by position inside the batch, not by dataset index; using the
        # dataset index overflowed data_y for every batch after the first.
        for row, idx in enumerate(ids):
            # presumably labels are 0/2/4, giving classes 0/1/2 - TODO confirm
            data_y[row][int(labels[idx] / 2)] = 1
        return data_x, data_y

    def GetTrainingBatch(self, batchId, batchsize, max_word_len=41):
        """Return batch number ``batchId`` of the training split.

        Args:
            batchId: zero-based batch number; indices wrap around the dataset.
            batchsize: number of sentences in the batch.
            max_word_len: every word is padded/truncated to this many indices.

        Returns:
            (data_x, data_y): ``data_x`` holds the padded sentences (a regular
            array when all sentences in the batch have the same word count,
            otherwise a 1-D object array of nested lists); ``data_y`` is a
            ``(batchsize, 3)`` int32 one-hot matrix.
        """
        return self._make_batch(self.train_data, self.train_label,
                                batchId, batchsize, max_word_len)

    def GetTestData(self, batchId, batchsize, max_word_len=41):
        """Return batch number ``batchId`` of the test split.

        Same arguments and return value as ``GetTrainingBatch``.
        """
        return self._make_batch(self.test_data, self.test_label,
                                batchId, batchsize, max_word_len)

In [30]:
# Dataset directory. Set the SENTI_DATA_DIR environment variable to point at
# your own copy; the fallback is the path this notebook was originally run with.
# (An earlier run used '/home/hadoop/trainingandtestdata'.)
dirpath = os.environ.get('SENTI_DATA_DIR',
                         '/Users/lumenglong/Downloads/trainingandtestdata')
# NOTE: trainfile currently points at the same file as testfile. The
# alternative that was commented out here is
# 'training.1600000.processed.noemoticon.csv'.
testfile = 'testdata.manual.2009.06.14.csv'
trainfile = 'testdata.manual.2009.06.14.csv'
# Character vocabulary: a character's position in this list is its integer
# code. Index 0 ('UNK') is also the code the loader assigns to unknown
# characters.
s = ['UNK', 't', '—', '*', 'f', '#', '(', 'a', '”', '{', '!', 'm', 's', ':', 'n', 
     'k', 'z', '}', '@', ')', 'h', '/', 'u', 'o', 'x', '“', 'e', 
     'p', 'i', 'b', '&', ' ', "'", '$', 'r', 'l', '.', '`', '_', 'y', 'c', 
     'w', '?', '~', ';', ']', '+', '^', '%', 'v', 'g', 'q', 'j', '[',
     ',', 'd', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
loader = SentiDataLoader(dirpath, trainfile, testfile, s)

In [31]:
# Read both CSV files and build the encoded train/test sets; each file path
# is printed as it is read.
loader.load_data()

/Users/lumenglong/Downloads/trainingandtestdata/testdata.manual.2009.06.14.csv
/Users/lumenglong/Downloads/trainingandtestdata/testdata.manual.2009.06.14.csv


In [26]:
# Smoke test: fetch the first training batch of 25 sentences.
# NOTE(review): this displays the whole (inputs, one-hot labels) tuple, which
# produces a very long output.
loader.GetTrainingBatch(0, 25)

(array([list([[34, 26, 7, 55, 28, 14, 50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [11, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [15, 28, 14, 55, 35, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [35, 23, 49, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [28, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [35, 26, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [40, 20, 28, 35, 55, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [28, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [32]:
# Find the longest encoded word across both splits. Each time a word sets a
# new record above 21 characters, print it (decoded through the char list `s`)
# followed by the decoded sentence it belongs to.
char_num_cnt = dict.fromkeys(range(1, 42), 0)
trainset, testset = loader.train_data, loader.test_data
max_char_num = 0
for data in (trainset, testset):
    for sent in data:
        for word in sent:
            if len(word) <= max_char_num:
                continue
            max_char_num = len(word)
            if max_char_num <= 21:
                continue
            try:
                print(''.join(s[idx] for idx in word))
            except TypeError:
                # Show the raw indices that could not be decoded, then re-raise.
                print(word)
                raise
            print("sent:", [''.join(s[idx] for idx in w) for w in sent])

www.tinyurl.com/m595fk
sent: ['time', 'warner', 'cable', 'pulls', 'the', 'plug', 'on', 'the', 'girlfriend', 'experience', 'www.tinyurl.com/m595fk']


In [33]:
# Longest word length (in characters) found by the scan in the previous cell.
max_char_num

22