In [1]:
from nltk.stem import WordNetLemmatizer 
import nltk
# Make sure the WordNet corpus needed by WordNetLemmatizer is available locally.
nltk.download('wordnet')


# A token that contains any of these substrings is dropped by pre_process.
remove_words = [
    "@handle",
    "RT",
    "http",
]
# Single lemmatizer instance shared by every pre_process call.
lemmatizer = WordNetLemmatizer()

def pre_process(sentence, max_length):
    """Strip unwanted tokens from a tweet and lemmatize what remains.

    The text is split on whitespace. A token is discarded when it contains
    any entry of ``remove_words`` as a substring; the ``"http"`` entry is
    additionally matched against the lower-cased token. Every surviving
    token is passed through ``lemmatizer.lemmatize``.

    Args:
        sentence: Raw text of one tweet.
        max_length: Largest token count seen so far by the caller.

    Returns:
        A ``(text, max_length)`` tuple: the surviving, lemmatized tokens
        joined by single spaces, and ``max_length`` raised to this
        sentence's token count when that count is larger.
    """
    def is_unwanted(token):
        # Case-sensitive substring match for every entry; "http" also
        # matches regardless of the token's casing.
        lowered = token.lower()
        return any(
            target in token or (target == "http" and target in lowered)
            for target in remove_words
        )

    kept = [token for token in sentence.split() if not is_unwanted(token)]
    lemmas = [lemmatizer.lemmatize(token) for token in kept]
    return ' '.join(lemmas), max(max_length, len(lemmas))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hanxunhuang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
import csv
import collections
import numpy as np

# Load the training tweets. Each line is expected to look like
# "<integer id>\t<tweet text>"; the text is cleaned with pre_process and
# collected per id in train_dict.
train_file_path = "data/train_tweets.txt"
train_dict = collections.defaultdict(list)
max_length = 0

length_array = []
# Stays 0 when the file is empty, so the summary print below cannot fail.
row_count = 0
with open(train_file_path, encoding='utf-8') as tsvfile:
    # Iterate the file object directly so lines are streamed rather than
    # all being held in memory at once; counting from 1 makes row_count the
    # true number of rows once the loop finishes.
    for row_count, line in enumerate(tsvfile, start=1):
        fields = line.strip().split("\t")
        class_id = int(fields[0])
        instance, max_length = pre_process(fields[1], max_length)
        # Tweets with no tokens left after pre-processing are dropped.
        if instance:
            train_dict[class_id].append(instance)
            length_array.append(len(instance.split()))
    print("Total rows: %d" % row_count)

print("Total ids: %d" % len(train_dict))
print("Longest Sentence: %d" % (max_length))
print(len(train_dict.keys()))

Total rows: 328931
Total ids: 9295
Longest Sentence: 37
9295


In [43]:
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)
        
# Split the ids into consecutive groups of 95; the last group holds whatever
# is left over and may therefore be smaller.
group_keys = [*chunks(list(train_dict), 95)]
print(len(group_keys))

98


In [44]:
# group_train_dict[group_key][id] -> list of sentences for that id.
group_train_dict = collections.defaultdict(dict)

# group_keys already lists the ids belonging to each group, so walk it
# directly instead of searching every group list for every id.
for group_key, group_key_list in enumerate(group_keys):
    for class_id in group_key_list:
        # Only ids present in train_dict are copied; the membership test also
        # keeps the defaultdict lookup from creating empty entries.
        if class_id in train_dict:
            group_train_dict[group_key][class_id] = train_dict[class_id]


In [45]:
from itertools import islice

def take(n, iterable):
    """Collect the leading ``n`` items of ``iterable`` into a list.

    Fewer than ``n`` items are returned when the iterable runs out first.
    """
    head = islice(iterable, n)
    return list(head)

# Sanity check: show the first (group_key, {id: sentences}) pair of the grouping.
n_items = take(n=1, iterable=group_train_dict.items())
print(n_items)



In [47]:
# Build Class IDX Map
# Every id maps to (group_key, position of the id inside that group).

class_idx = {
    class_id: (group_key, position)
    for group_key, members in group_train_dict.items()
    for position, class_id in enumerate(members)
}

n_items = take(20, class_idx.items())
print(n_items)
print(len(class_idx))


[(8746, (0, 0)), (2423, (0, 1)), (564, (0, 2)), (3039, (0, 3)), (9661, (0, 4)), (1017, (0, 5)), (9976, (0, 6)), (7123, (0, 7)), (2764, (0, 8)), (578, (0, 9)), (5337, (0, 10)), (3815, (0, 11)), (1224, (0, 12)), (1732, (0, 13)), (841, (0, 14)), (5012, (0, 15)), (6312, (0, 16)), (7736, (0, 17)), (982, (0, 18)), (6005, (0, 19))]
9295


In [50]:
import random
import numpy as np

# Per-id train/dev split: for every id, a random dev_split fraction of its
# sentences (rounded up) goes to the dev set and the rest to the train set.
dev_split = 0.1
train_split = 1 - dev_split
SPLIT_SEED = 42

dev_set_dict = {}
train_set_dict = {}

# Dedicated, seeded generator: re-running this cell reproduces the same split.
split_rng = random.Random(SPLIT_SEED)

for group_key in group_train_dict:
    for class_id in group_train_dict[group_key]:
        # Shuffle a copy so the lists stored in train_dict (and shared with
        # group_train_dict) keep their original order.
        shuffled = list(train_dict[class_id])
        split_rng.shuffle(shuffled)
        # Rounding up gives every non-empty id at least one dev sentence.
        # NOTE: an id with a single sentence therefore has no train sentence.
        split = int(np.ceil(len(shuffled) * dev_split))
        dev_set_dict[class_id] = shuffled[:split]
        train_set_dict[class_id] = shuffled[split:]

print(len(dev_set_dict), len(train_set_dict), len(train_dict))
n_items = take(20, dev_set_dict.items())
print(n_items)

9295 9295 9295
[(8746, ['More Thanksgiving leftover idea', 'Good read: Harvard Business - Oprah Winfrey and Your Leadership Brand #leadership #brand #marketing', 'It is a litt;e boring working from home but when I travel oo la la! #anntaylor', 'I wa on phone w her while I wa watching . The insurance issue affect million like her', "She's just fine - excited abt GMA and all the good thing that are happening. Nothing to worry about!", 'Hi there! Been traveling a lot and lot more to come this and next month - recovered from the pneumonia - but well! How r u?', 'Help u help MusiCares! Vote for Charity on Facebook:', 'only 1!!!!!', "If you do Mary Ellen - so will I! Think I am going to go back to bed a I don't have to be anywhere til 1:00", 'D- Thanks for the you going to WOMMA Summit?']), (2423, ['BREAKING NEWS: A Santa Cruz Mom Found a Secret to Turn Yellow Teeth White at Home. Read the story', "SECRET: White Teeth Trick! Dentists don't want you to know about THIS teeth whitening secret!"

In [52]:
# Save to file
import pickle
import pandas as pd

def save_to_file(target_dict, file_path):
    """Write every (id, sentence) pair of ``target_dict`` to a TSV file in random order.

    Args:
        target_dict: Mapping from an id to an iterable of sentences.
        file_path: Destination path. The file gets two tab-separated columns
            (id, sentence) with no header row and no index column.

    Returns:
        None. The number of rows written is printed.
    """
    # Flatten to one (id, sentence) row per sentence.
    rows = [
        (key, sentence)
        for key, sentences in target_dict.items()
        for sentence in sentences
    ]
    # Shuffle the rows themselves: no fixed-width NumPy string array is
    # needed just to permute them, and an empty mapping simply yields an
    # empty file.
    random.shuffle(rows)

    dataframe = pd.DataFrame(rows, columns=['id', 'sentence'])
    dataframe.to_csv(file_path, index=False, sep='\t', header=False)
    print(len(rows))

# Output locations for the two shuffled splits and the class-index lookup.
dev_set_path = 'data/v3/dev_set_v1.txt'
train_set_path = 'data/v3/train_set_v1.txt'
idx_file_path = 'data/v3/v1_idx.pickle'

# Write the dev split first, then the train split, each to its own file.
for split_dict, split_path in (
    (dev_set_dict, dev_set_path),
    (train_set_dict, train_set_path),
):
    save_to_file(split_dict, split_path)

# Persist the id -> (group_key, index) mapping next to the splits.
with open(idx_file_path, 'wb') as pickle_file:
    pickle.dump(class_idx, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

36116
291758


In [53]:
# Summary of the partition: number of groups, and the chunk size (95) that
# was used when the ids were split into groups.
group_count = len(group_keys)
print("GroupCount %d" % group_count)
print("ClassCount %d" % 95)

GroupCount 98
ClassCount 95
