In [None]:
# -*- coding: utf-8 -*-

import numpy as np
import pandas as pd 
import itertools
from collections import Counter
from pyvi.pyvi import ViTokenizer, ViPosTagger
import utils
import unicodecsv as csv
import sys
import json

# Directory prefix prepended to the raw CSV file name ('post_user_from_04_2017.csv')
# wherever it is read; the path is relative to the current working directory.
SOURCE_DIR = './../../crawl_info/user_post/'

def tokenize(posts_str):
    """
    Turns one user's concatenated posts into a single flat list of tokens.

    posts_str is cut at every '(^-^)' marker. Each piece is passed through
    utils.process and then ViTokenizer.tokenize, and the tokenizer's output
    string is split on single spaces.
    Returns the tokens of all pieces, in order, as one list.
    """
    tokens = []
    for raw_post in posts_str.split('(^-^)'):
        segmented = ViTokenizer.tokenize(utils.process(raw_post))
        tokens += segmented.split(' ')
    return tokens

def data_load():
    """
    Reads the raw post export, cleans it and writes './sample.csv'.

    Steps: read SOURCE_DIR + 'post_user_from_04_2017.csv', drop rows with
    missing values, add the age category via utils.add_age_category_to_df,
    make sure every 'post' value is text, and save the frame as UTF-8 CSV
    without the index column.
    Returns nothing.
    """
    train_df = pd.read_csv(SOURCE_DIR + 'post_user_from_04_2017.csv')
    train_df = train_df.dropna()
    train_df = utils.add_age_category_to_df(train_df)
    # Decode only byte strings: values that are already text have no usable
    # .decode (Python 3) or would be round-tripped through ASCII (Python 2).
    # A list is built explicitly because map() is lazy on Python 3.
    train_df['post'] = [
        post.decode('utf-8') if isinstance(post, bytes) else post
        for post in train_df['post']
    ]
    train_df.to_csv('./sample.csv', index=False, encoding='utf-8')
    print("Done data loading")

def text_and_labels():
    """
    Loads the tokenized posts and their labels from './tokenized_sample.pkl'.

    Rows with missing values are dropped and the index is renumbered
    0..n-1, because the callers in this notebook address the returned
    Series by position (e.g. labels[i] while enumerating sentences).
    Returns (X, y): the 'userposts' and 'age' columns as pandas Series.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced locally (save_data_and_labels writes this one).
    df = pd.read_pickle('./tokenized_sample.pkl')
    df = df.dropna().reset_index(drop=True)
    X = df.loc[:, 'userposts']
    y = df.loc[:, 'age']
    print("Done data loading")
    return X, y

def save_data_and_labels(t_sentences, labels):
    """
    Persists tokenized posts together with their labels.

    Builds a two-column DataFrame ('userposts' from t_sentences, 'age' from
    labels) and pickles it to 'tokenized_sample.pkl' in the working directory.
    Returns nothing.
    """
    columns = {'userposts': t_sentences, 'age': labels}
    pd.DataFrame(columns).to_pickle('tokenized_sample.pkl')

def load_data_and_labels():
    """
    Rebuilds './sample.csv', tokenizes every user's posts and one-hot encodes
    the age category.

    Calls data_load() first, then reads './sample.csv', tokenizes the 'post'
    column with tokenize(), and maps 'age_category' to a 4-element one-hot
    list ("A", "B", "C" take positions 0-2; any other value takes position 3).
    The result is also pickled through save_data_and_labels().
    Returns [X, y]: a list of token lists and a list of one-hot labels.
    """
    data_load()
    df = pd.read_csv('./sample.csv')
    # Decode only byte strings; a list is built explicitly because map() is
    # lazy on Python 3.
    x_text = [
        post.decode('utf-8') if isinstance(post, bytes) else post
        for post in df.loc[:, 'post']
    ]

    # Split by words
    X = [tokenize(user_posts) for user_posts in x_text]
    print("Done tokenizing Viet users' posts")

    one_hot_by_category = {
        "A": [1, 0, 0, 0],
        "B": [0, 1, 0, 0],
        "C": [0, 0, 1, 0],
    }
    fallback = [0, 0, 0, 1]
    # list(...) gives every row its own label object, as the original
    # append-a-literal loop did.
    y = [
        list(one_hot_by_category.get(category, fallback))
        for category in df.loc[:, 'age_category']
    ]
    save_data_and_labels(X, y)
    return [X, y]


def pad_sentences(sentences, labels, padding_word="<PAD/>", min_length=200, max_length=800):
    """
    Filters, splits and right-pads token lists to one common length.

    Sentences with at most min_length tokens are dropped. Sentences with more
    than max_length tokens are cut into consecutive chunks of max_length
    tokens, and chunks with at most min_length tokens are discarded. Every
    kept piece is padded with padding_word up to the length of the longest
    kept piece, and the label of its source sentence is repeated once per
    piece.

    sentences: iterable of token lists.
    labels: one label per sentence, matched by position.
    Returns [padded_sentences, new_labels]; both are empty lists when no
    sentence survives the filter.
    """
    # Materialise the labels so labels[idx] is positional even when a pandas
    # Series with a non-contiguous index is passed in.
    labels = list(labels)

    kept_indices = []
    pieces = []
    piece_labels = []
    for idx, sentence in enumerate(sentences):
        if len(sentence) <= min_length:
            continue
        kept_indices.append(idx)
        if len(sentence) > max_length:
            chunks = [
                sentence[start:start + max_length]
                for start in range(0, len(sentence), max_length)
            ]
            # Apply the same minimum to chunks as to whole sentences (this was
            # a hard-coded 200, which ignored a caller-supplied min_length).
            chunks = [chunk for chunk in chunks if len(chunk) > min_length]
        else:
            chunks = [sentence]
        pieces.extend(chunks)
        piece_labels.extend([labels[idx]] * len(chunks))
    print(kept_indices)

    if not pieces:
        # max() below would raise on an empty sequence.
        print("There are %s padded sentences" % 0)
        return [[], []]

    sequence_length = max(len(piece) for piece in pieces)
    padded_sentences = [
        list(piece) + [padding_word] * (sequence_length - len(piece))
        for piece in pieces
    ]

    print("There are %s padded sentences" % len(padded_sentences))
    return [padded_sentences, piece_labels]


def build_vocab(sentences):
    """
    Derives word <-> index mappings from tokenized sentences.

    Words are ranked by descending frequency over all sentences; a word's
    index is its rank. Both mappings are also written as JSON to
    'vocabularies.json' and 'vocabularies_inv.json' in the working directory.
    Returns [vocabulary, vocabulary_inv] (word -> index, index -> word).
    """
    frequencies = Counter(itertools.chain.from_iterable(sentences))
    ranked_words = [word for word, _count in frequencies.most_common()]

    vocabulary = dict((word, index) for index, word in enumerate(ranked_words))
    vocabulary_inv = dict(enumerate(ranked_words))

    for path, mapping in (('vocabularies.json', vocabulary),
                          ('vocabularies_inv.json', vocabulary_inv)):
        with open(path, 'w') as handle:
            json.dump(mapping, handle)
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary,
                     x_path='/data_x_new_200_800.npy',
                     y_path='/data_y_new_200_800.npy'):
    """
    Maps sentences and labels to NumPy arrays based on a vocabulary.

    sentences: token lists; every token must be a key of vocabulary.
    labels: labels, converted with np.array as-is.
    vocabulary: mapping from word to integer index.
    x_path / y_path: where the arrays are saved with np.save; pass None to
        skip saving that array.
    Returns [x, y]. Raises KeyError for a token missing from vocabulary.
    """
    # NOTE(review): the default paths point at the filesystem root; other
    # files in this notebook use './' - presumably that was intended, confirm.
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    print(len(x))
    print(len(y))
    if x_path is not None:
        np.save(x_path, x)
    if y_path is not None:
        np.save(y_path, y)
    return [x, y]


def preprocess_data():
    """
    Runs the pipeline from the tokenized pickle to index arrays.

    Chains text_and_labels -> pad_sentences -> build_vocab ->
    build_input_data and prints a progress line after each stage.
    Returns [x, y, vocabulary, vocabulary_inv].
    """
    raw_sentences, raw_labels = text_and_labels()
    print(len(raw_sentences))
    print("finish loading tokenized sentences and labels")

    padded, kept_labels = pad_sentences(raw_sentences, raw_labels)
    print("finish padding")

    vocabulary, vocabulary_inv = build_vocab(padded)
    print("finish building vocabularies")

    x, y = build_input_data(padded, kept_labels, vocabulary)
    print("finish mapping sentences and labels to vectors based on vocabularies")
    return [x, y, vocabulary, vocabulary_inv]


def batch_iter(data, batch_size, num_epochs):
    """
    Generates shuffled mini-batches over a dataset for several epochs.

    data: sequence (or iterable) of examples; converted to a NumPy array.
    batch_size: maximum number of examples per batch.
    num_epochs: number of passes over the data; each pass is reshuffled
        with np.random.permutation.
    Yields array slices of at most batch_size examples. The last batch of an
    epoch may be shorter, but is never empty.
    """
    # list() first so one-shot iterables (e.g. zip on Python 3) are stored as
    # an array of examples rather than a 0-d object array.
    data = np.array(list(data))
    data_size = len(data)
    # Ceiling division. The previous int(len / batch_size) + 1 produced an
    # extra, empty batch whenever data_size was a multiple of batch_size.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [None]:
# NOTE(review): neither `load_data` nor `results` is defined in the visible
# cells; presumably both came from earlier kernel state - confirm before re-running.
load_data()
# `csv` is the unicodecsv module imported at the top of the notebook; the
# output file is opened in binary mode.
with open('age_data.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    # One single-column row per element of `results`.
    for r in results:
        writer.writerow([r])

In [None]:
def checking_file(filename):
    """
    Prints length statistics for the 'userposts' column of a pickled frame.

    filename: path of a pickle readable by pd.read_pickle whose frame has a
        'userposts' column of sized values (e.g. token lists).
    Prints the tuple (min_len, avg_len, max_len); avg_len is the
    floor-divided mean. For a frame without rows all three are 0.
    Returns the 'userposts' column.
    """
    # NOTE: unpickling can execute arbitrary code; only pass trusted files.
    train_df = pd.read_pickle(filename)
    t_sentences = train_df.loc[:, 'userposts']
    lengths = [len(ts) for ts in t_sentences]
    if lengths:
        min_len = min(lengths)
        max_len = max(lengths)
        # // keeps the integer average on both Python 2 and 3.
        avg_len = sum(lengths) // len(lengths)
    else:
        # No rows: avoid dividing by zero and the sys.maxint / -1 sentinels.
        min_len = avg_len = max_len = 0
    print((min_len, avg_len, max_len))
    return t_sentences

In [None]:
import sys
# Row count of the raw export.
df = pd.read_csv(SOURCE_DIR + 'post_user_from_04_2017.csv')
print(len(df.index))
# checking_file() requires the path of a pickled frame; calling it without
# one raises TypeError.
# NOTE(review): './tokenized_sample.pkl' (the file save_data_and_labels
# writes) is assumed to be the intended target - confirm.
checking_file('./tokenized_sample.pkl')

In [None]:
# checking_file() requires the path of a pickled frame; calling it without
# one raises TypeError.
# NOTE(review): './tokenized_sample.pkl' (the file save_data_and_labels
# writes) is assumed to be the intended target - confirm.
checking_file('./tokenized_sample.pkl')
# Disabled CSV export, kept for reference (`results` is not defined in the
# visible cells):
# with open("./output_sample.csv",'wb') as resultFile:
#     wr = csv.writer(resultFile, dialect='excel', encoding='utf-8')
#     wr.writerows(results)

In [None]:
# Scratch check: run the tokenizer on one short phrase; the result is the cell's output.
ViTokenizer.tokenize(u"Trường đại học bách khoa hà nội")

In [None]:
# Scratch check: tokenize one phrase (with a trailing digit string) and collect
# the space-separated pieces in `l`; `l` and `string` are read by the next cell.
l = []
string = u"Trường đại học bách khoa hà nội 01650833798"
l.extend(ViTokenizer.tokenize(string).split(' '))

In [None]:
# Show the manually built token list, then the second token that tokenize()
# returns for the same string.
print(l)
print(tokenize(string)[1])

In [None]:
# NOTE(review): `clean_str` is not defined in the visible cells (presumably a
# helper from earlier kernel state or from utils - confirm).
x_text = clean_str(u"Trường đại học bách khoa hà nội 0910833798")
#x_text = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", x_text)
#X = [s.split(" ") for s in x_text]
# NOTE(review): this prints `X`, not the `x_text` computed above; `X` is only
# assigned in the commented-out line, so it must come from other state - confirm.
print X
#print X

In [None]:
# Scratch check: two identical sample phrases, each split on single spaces
# into a list of words (x_text becomes a list of word lists).
x_text = [u"Trường đại học bách khoa hà nội 0910833798", u"Trường đại học bách khoa hà nội 0910833798"]
x_text = [s.split(" ") for s in x_text]

In [None]:
# Show the current value of x_text.
print(x_text)

In [None]:
# Run the full preprocessing pipeline; its return value
# [x, y, vocabulary, vocabulary_inv] is the cell's output and is not stored.
preprocess_data()

In [None]:
X, y = text_and_labels()
# Positional access (.iloc): text_and_labels() drops rows, so index labels
# 0, 1, 2 are not guaranteed to exist. Slicing also copes with fewer than
# three rows.
for posts, age in zip(X.iloc[:3], y.iloc[:3]):
    print((posts, age))

In [None]:
# Fifth token of the first user's posts; .iloc is positional, so this does
# not depend on index label 0 surviving dropna() in text_and_labels().
print(X.iloc[0][4])

In [None]:
# Second run of the full preprocessing pipeline; the return value is the
# cell's output and is not stored.
preprocess_data()

In [None]:
# Rebuild the sample CSV, tokenize it and pickle the result; the returned
# [X, y] is discarded here.
load_data_and_labels()
print("finish loading tokenized sentences and labels")




In [None]:
# Print length statistics for both pickled token frames and keep the returned
# 'userposts' columns; t_sentences_s is read by the size counts in the next cell.
t_sentences = checking_file('./tokenized_original.pkl')
t_sentences_s = checking_file('./tokenized_sample.pkl')

In [None]:
# Count the users whose token lists fall into three overlapping length
# ranges: more than 100, more than 500, and strictly between 150 and 4000.
small_files = [f for f in t_sentences_s if len(f) > 100]
avg_files = [f for f in t_sentences_s if len(f) > 500]
large_files = [f for f in t_sentences_s if 150 < len(f) < 4000]
print("# of Files with small sizes: %s" % len(small_files))
print("# of Files with average sizes: %s" % len(avg_files))
print("# of Files with large sizes: %s" % len(large_files))

In [None]:
'''
def pad_sentences(sentences, padding_word="<PAD/>", min_length=100, max_length=4000):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    filtered_sentences = []
    idxes = []
    idx = 0
    for s in sentences:
        if len(s) > min_length:
            if len(s) > max_length:
                filtered_sentences.append(s[:4000])
            else:
                filtered_sentences.append(s)
            idxes.append(idx)
        idx += 1
    print("There are %s padded sentences!" % len(filtered_sentences))
    sequence_length = max(len(x) for x in filtered_sentences)
    padded_sentences = []
    for i in range(len(filtered_sentences)):
        sentence = filtered_sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences, idxes
'''

In [None]:
# Step-by-step version of preprocess_data() that keeps the intermediate
# results in the notebook namespace for inspection.
sentences, labels = text_and_labels()
print(len(sentences))
print("finish loading tokenized sentences and labels")
# pad_sentences() returns exactly two values, [padded_sentences, new_labels];
# unpacking three names here raised ValueError.
sentences_padded, filtered_labels = pad_sentences(sentences, labels)
print("finish padding")
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
print("finish building vocabularies")
x, y = build_input_data(sentences_padded, filtered_labels, vocabulary)
print("finish mapping sentences and labels to vectors based on vocabularies")

In [None]:
# Preview the first four padded sentences.
for padded_sentence in sentences_padded[:4]:
    print(padded_sentence)

In [None]:
# Show the sixth padded sentence (index 5).
print(sentences_padded[5])

In [None]:
# NOTE(review): the pad_sentences defined in this notebook returns only two
# values (padded sentences and labels) - confirm that `filtered_sentences`
# is actually bound at notebook level before running this cell.
print filtered_sentences[5][1][1]

In [None]:
# Flatten all padded sentences into one token sequence. It is materialised as
# a list because a bare itertools.chain object is consumed by the loop below
# and can be neither indexed nor iterated a second time.
a = list(itertools.chain(*sentences_padded))
for i in a:
    print(i)

In [None]:
# Scratch check: the Counter built here is discarded (it is not the cell's last
# expression); the second line prints the type of an empty list.
Counter([1, 3, 4, 5, 3, 1])
print(type(list(list())))

In [None]:
# NOTE(review): indexing requires `a` to be a sequence; an itertools.chain
# object is not subscriptable - confirm what `a` holds at this point.
print a[4]

In [None]:

# Print every element of `a` that is itself a list.
# NOTE(review): if `a` is a one-shot iterator that has already been consumed,
# this loop body never runs - confirm what `a` holds at this point.
for i in a:
    if type(i) == type(list()):
        print i