In [46]:
import re
import pandas as pd
import os
import numpy as np

def transIrregularWord(word):
    if not word:
        return ''
    pattern1 = "[^A-Za-z]*$" #punctuation at the end of sentence
    pattern2 = "^[^A-Za-z@#]*" #punctuation at the start of sentence
    word = re.sub(pattern2, "", re.sub(pattern1, "", word))
    pattern3 = '(.*)http(.?)://(.*)' # url
    pattern4 = '^[0-9]+.?[0-9]+$' # number
    if not word:
        return ''
#     elif word.__contains__('@'):
#         return 'person'
#     elif word.__contains__('#'):
#         return 'topic'
    elif re.match(r'(.*)http?://(.*)', word, re.M|re.I|re.S):    
        return 'links'
    elif re.match(pattern4, word, re.M|re.I):
        return 'number'
    else:
        return  word.lower()
def sentence2words(line):
    words = re.split('([,\n ]+)', line.strip() )
    words = list( filter(lambda s: len(s)>0, [transIrregularWord(word) for word in words]) )
    return words

def word2chars(word, charVocab):
    return [charVocab.get(char) for char in word]

def CSVFile2Dataset(filepath):
    print(filepath)
    df = pd.read_csv(filepath,encoding='latin-1')
    instances = [(line[-1], line[0]) for line in df.values]
    del df
    texts = [sentence2words(instance[0]) for instance in instances]
    labels = [instance[1] for instance in instances]
    return texts, labels

def load_data(data_dir, charVocab): # charVocab is a char-to-idx dictionary:{char:idx}
    texts, label = CSVFile2Dataset(data_dir)
    encode_data = [list(word2chars(word, charVocab) for word in sentence) for sentence in texts]
    return encode_data, label
    


In [47]:
dirpath = '/home/hadoop/trainingandtestdata'
trainfile = 'training.1600000.processed.noemoticon.csv'
testfile = 'testdata.manual.2009.06.14.csv'
s = ['t', '—', '*', 'f', '#', '(', '4', 'a', '”', '{', '!', 'm', 's', ':', 'n', 
     'k', 'z', '}', '@', ')', 'h', '8', '0', '/', 'u', 'o', 'x', '6', '“', 'e', 
     'p', 'i', 'b', '2', '&', ' ', "'", '$', 'r', 'l', '.', '`', '_', 'y', 'c', 
     'w', '?', '1', '~', ';', ']', '+', '^', '%', 'v', '9', 'g', 'q', 'j', '[',
     ',', 'd', '-']
c_voc = {c:idx for (idx, c) in enumerate(s)}

In [55]:
df = pd.read_csv(os.path.join(dirpath, testfile), encoding='latin-1')
instances = [(line[-1], line[0]) for line in df.values]

In [57]:
instances[1]

('Ok, first assesment of the #kindle2 ...it fucking rocks!!!', 4)

In [48]:
trainset, trainlabel = load_data(os.path.join(dirpath, testfile), c_voc)

/home/hadoop/trainingandtestdata/testdata.manual.2009.06.14.csv


In [60]:
len(trainset), trainset[0][0], [list(s[idx] for idx in trainset[1][i]) for i in range(len(trainset[1]))]

(497,
 [38, 29, 7, 61, 31, 14, 56],
 [['o', 'k'],
  ['f', 'i', 'r', 's', 't'],
  ['a', 's', 's', 'e', 's', 'm', 'e', 'n', 't'],
  ['o', 'f'],
  ['t', 'h', 'e'],
  ['#', 'k', 'i', 'n', 'd', 'l', 'e'],
  ['i', 't'],
  ['f', 'u', 'c', 'k', 'i', 'n', 'g'],
  ['r', 'o', 'c', 'k', 's']])

In [None]:

testset, testlabel = CSVFile2Dataset(os.path.join(dirpath, testfile))
max_sent_len = max([(len(sent) for sent in texts) for texts in [trainset, testset]])
print(max_sent_len)

In [None]:
import gensim
word2vec = gensim.models.KeyedVectors.load_word2vec_format('/Users/lumenglong/word2vec.model')

def GetTrainingBatch(batchId, batchsize, embedding_dim):
    data_x = np.zeros([data_x = np.zeros([batch, max_sent_len, embedding_dim], dtype=np.float32)
    data_y = np.zeros([batch_size, 3], dtype=np.int32)])
    startIdx = batchId*batchsize
    miss_vec = 0
    hit_vec = 0
    if startIdx >= len(trainset):
        startIdx = startIdx%len(trainset)
    for i in range(batchsize):
        mts = startIdx + i
        if mts >= len(trainset):
            mts = mts%len(trainset)
        data_y[i][trainlabel[mts]/2] = 1
        for j in range(len(trainset[mts])):
            try:
                data_x[i][j] = word2vec[trainset[mts][j]]
            except KeyError:
                print("word:", m_word)
                miss_vec += 1
            except IndexError:
                print("i, j, k:", FLAGS.batch_size, '|',t_data_len[mts] ,'|', len(t_words))
                print("word:", m_word, "(", i, j, k, ")")
                raise
            else:
                hit_vec += 1
    print("hit_vec | miss_vec:", hit_vec, '|', miss_vec)
    return data_x, data_y

def GetTestData(batchId, batchsize, embedding_dim):
    data_x = np.zeros([data_x = np.zeros([batchsize, max_sent_len, embedding_dim], dtype=np.float32)
    data_y = np.zeros([batch_size, 3], dtype=np.int32)])
    startIdx = batchId*batchsize
    miss_vec = 0
    hit_vec = 0
    if startIdx >= len(testset):
        startIdx = startIdx%len(testset)
    for i in range(batchsize):
        mts = startIdx + i
        if mts >= len(testset):
            mts = mts%len(testset)
        data_y[i][testlabel[mts]/2] = 1
        for j in range(len(testset[mts])):
            try:
                data_x[i][j] = word2vec[testset[mts][j]]
            except KeyError:
                print("word:", m_word)
                miss_vec += 1
            except IndexError:
                print("i, j, k:", batch_size, '|',t_data_len[mts] ,'|', len(t_words))
                print("word:", m_word, "(", i, j, k, ")")
                raise
            else:
                hit_vec += 1
    print("hit_vec | miss_vec:", hit_vec, '|', miss_vec)
    return data_x, data_y



In [8]:
s = "Defend the harmony of Yuen Long, protect local peace. ——6th village of Yuen Long. 1899, British army invaded Yuen Long, and went into the heavy resistance of local people. 1941, Japanese army invaded Yuen Long, and went into the local guerrillas 2014, when the rioters of occupying central invaded Yuen Long, they got kicked away by locals. 2019, the rioters wanted to take over Yuen Long again, and got defeated one more time. Yuen Long people are native HK people. Because of the unification, defeated invaders one time after the other. As a Yuen Long resident, I sincerely invite people living in other regions to move here. (I am not sure about the last sentence, since its Cantonese). #Yuen Long people are true HK people #I love Yuen Long. Protect Yuen Long, protect homeland. Put the fight aside, strive for HK. The hope of HK is in Yuen Long. Tonight, the Yuen Long people which are mainly hakka wearing white shirts, holding Chinese flags, pushed all the way to Admiralty, and had encountered the HK separatist who are mainly South Asian immigrant and wearing black shirts. Most black shirts ran away. Some of them were concentrated to the middle section, and avoid being beaten up by the local police. (my comment: If you remember those “protecters” surrounding police HQ) Earlier this day, those black shirts, by the support of foreign intelligence, assaulted LOCPG office and paint insulting words such as “chink” on the walls.!@qwertyuiop[]\asdfghjkl;'zxcvbnm,.?/{}+_-)(*&^%$#@!~`)"
s_set = list(set(s.lower()))

In [9]:
s_set

['t',
 '—',
 '*',
 'f',
 '#',
 '(',
 '4',
 'a',
 '”',
 '{',
 '!',
 'm',
 's',
 ':',
 'n',
 'k',
 'z',
 '}',
 '@',
 ')',
 'h',
 '8',
 '0',
 '/',
 'u',
 'o',
 'x',
 '6',
 '“',
 'e',
 'p',
 'i',
 'b',
 '2',
 '&',
 ' ',
 "'",
 '$',
 'r',
 '\x07',
 'l',
 '.',
 '`',
 '_',
 'y',
 'c',
 'w',
 '?',
 '1',
 '~',
 ';',
 ']',
 '+',
 '^',
 '%',
 'v',
 '9',
 'g',
 'q',
 'j',
 '[',
 ',',
 'd',
 '-']

In [10]:
s = ['t', '—', '*', 'f', '#', '(', '4', 'a', '”', '{', '!', 'm', 's', ':', 'n', 'k', 'z', '}', '@', ')', 'h', '8', '0', '/', 'u', 'o', 'x', '6', '“', 'e', 'p', 'i', 'b', '2', '&', ' ', "'", '$', 'r', 'l', '.', '`', '_', 'y', 'c', 'w', '?', '1', '~', ';', ']', '+', '^', '%', 'v', '9', 'g', 'q', 'j', '[', ',', 'd', '-']