In [4]:
import numpy as np
import pandas as pd
import math
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from scipy.cluster.vq import whiten

def get_tf_idf_query_similarity(docs_tfidf, query_tfidf):
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

# def l2_norm(a):
#     return math.sqrt(np.dot(a, a))

# def cosine_similarity(a, b):
#     return np.dot(a,b) / (l2_norm(a) * l2_norm(b))

sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

stops = stopwords.words('english')
punctuation_str = string.punctuation
symbol_emoji_list = [":)",  ";)",  ":(",  ":\\",  ":|", ":]", ":[",
                     ":-)", ";-)", ":-(", ":-\\", ":-|", ":-[", ";-]",
                     ":D", ":P", ":-x", ":'-(", ":_(", ":o)", "XD", ":'(", ":->",
                     "o_O", "T_T", "^o^", 
                     ":-D", ":-P","B-)", "8-)", ":-o", ":-O", ":-0", ":-s", ":-S"]
# TODO hashtag #
# TODO hmmmmm hhuuuuugg

In [35]:
version = "v1_4"
version_name = "{0}".format(version)
dev_set_path = 'data/{0}/dev_set_{1}.txt'.format(version, version_name)
train_set_path = 'data/{0}/train_set_{1}.txt'.format(version, version_name)

train_data = pd.read_csv(train_set_path, sep='\t', header=None)
dev_data = pd.read_csv(dev_set_path, sep='\t', header=None)
train_label = (np.array(train_data)[:,0]).astype('int')
dev_label = (np.array(dev_data)[:,0]).astype('int')
train_sentence_list = train_data[1]
dev_sentence_list = dev_data[1]

In [36]:
%%time
from nltk import word_tokenize

train_sentence_tokenized = train_sentence_list.apply(word_tokenize)

CPU times: user 34.4 s, sys: 88.2 ms, total: 34.5 s
Wall time: 34.5 s


In [49]:
%%time
all_list = []
for index, value in train_sentence_tokenized.iteritems():
    all_list += value
corpus = set(all_list)

CPU times: user 275 ms, sys: 12 ms, total: 287 ms
Wall time: 286 ms


In [58]:
corpus_dict = dict(zip(corpus, range(len(corpus))))

In [73]:
# 建立句子的向量表示
def vector_rep(text, corpus_dict):
    vec = []
    for key in corpus_dict.keys():
        if key in text:
            vec.append((corpus_dict[key], text.count(key)))
        else:
            vec.append((corpus_dict[key], 0))

    vec = sorted(vec, key= lambda x: x[0])
    return vec

from math import sqrt
def similarity_with_2_sents(vec1, vec2):
    inner_product = 0
    square_length_vec1 = 0
    square_length_vec2 = 0
    for tup1, tup2 in zip(vec1, vec2):
        inner_product += tup1[1]*tup2[1]
        square_length_vec1 += tup1[1]**2
        square_length_vec2 += tup2[1]**2

    return (inner_product/sqrt(square_length_vec1*square_length_vec2))

CPU times: user 0 ns, sys: 4 µs, total: 4 µs
Wall time: 268 µs


In [75]:
%%time
for index, value in dev_sentence_list.iteritems():
    dev_vec = vector_rep(value, corpus_dict)
    sim_array = np.zeros((len(dev_sentence_list),1))
    for _, train_value in train_sentence_list.iteritems():
        train_vec = vector_rep(train_value, corpus_dict)
        sim_array[index, 0] = similarity_with_2_sents(dev_vec, train_vec)
    break

KeyboardInterrupt: 

In [None]:
from nltk.stem.porter import PorterStemmer

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

token_dict = {}
for sentence in train_sentence_list:
    

In [None]:
def removeStopWords(word_tokens):
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stops: 
            filtered_sentence.append(w)
    return filtered_sentence

In [252]:
# get part of speech for each token in each chapter
def token_to_pos(ch):
    tokens = nltk.word_tokenize(ch)
    return [p[1] for p in nltk.pos_tag(tokens)]

def featureExtractor(sentence_data):
    NUM_TOP_WORDS = 10
    pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
    
    num_users = len(sentence_data)
    # fvs stands for feature vectors
    fvs_lexical = np.zeros((num_users, 3), np.float64)
    fvs_punct = np.zeros((num_users, len(punctuation_str)), np.float64)
    fvs_symbol_emoji = np.zeros((num_users, len(symbol_emoji_list)), np.float64)
    fvs_bow = []
#     fvs_syntax = []
    
    for i, sentense_txt in enumerate(sentence_data):
        # note: the nltk.word_tokenize includes punctuation
        all_text = " ".join(sentense_txt)
        tokens = nltk.word_tokenize(all_text.lower())
        words = word_tokenizer.tokenize(all_text.lower())
        vocab = set(words)
    
        words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                       for s in sentense_txt])
        # average number of words per sentence
        fvs_lexical[i, 0] = words_per_sentence.mean()
        # sentence length variation
        fvs_lexical[i, 1] = words_per_sentence.std()
        # Lexical diversity
        fvs_lexical[i, 2] = len(vocab) / (float(len(words))+1)
 
        for j, char in enumerate(punctuation_str):
            fvs_punct[i, j] = tokens.count(char) / (float(len(sentense_txt))+1)
        
        for j, emoji in enumerate(symbol_emoji_list):
            fvs_symbol_emoji[i, j] = tokens.count(emoji) / (float(len(sentense_txt))+1)

    fvs_lexical = np.nan_to_num(fvs_lexical)
    fvs_punct = np.nan_to_num(fvs_punct)
    fvs_lexical = whiten(fvs_lexical)
    fvs_punct = whiten(fvs_punct)
    return (fvs_lexical, fvs_punct, fvs_symbol_emoji)

In [258]:
def getTFIDF(sentence_data):
    #sentense_list = sentence_data
    tfidf_list = []
    mat_list = []
    for sentense_txt in sentence_data:
        # note: the nltk.word_tokenize includes punctuation
        # all_text = " ".join(sentense_txt)
        #sentense_txt = list(sentense_txt)
        if 0 == len(sentense_txt):
            tfidf_list.append(None)
            mat_list.append(None)
            break
        vectorizer = TfidfVectorizer(lowercase=False, stop_words=None)
        X = vectorizer.fit(sentense_txt)
        mat = vectorizer.fit_transform(sentense_txt)
        tfidf_list.append(X)
        mat_list.append(mat)
    return (tfidf_list, mat_list)

In [265]:
tfidf_list, mat_list = getTFIDF(train_data['sentence'])

In [294]:
def makePrediction(sentence):
    prediction = np.zeros((len(train_label),1))
    for i, tfidf in enumerate(tfidf_list):
        X, mat = tfidf, mat_list[i]
        if (X == None) or (mat == None):
            prediction[i] = -1
            break
        temp_mat = X.transform(sentence)
        sim = get_tf_idf_query_similarity(mat, temp_mat)
        prediction[i] = max(sim)
    return prediction

In [295]:
prediction = np.zeros((len(dev_label),1))
for i, sentence in enumerate(dev_data['sentence']):
    sim = makePrediction(sentence)
    prediction[i] = train_label[np.argmax(max(sim))]

In [254]:
%%time
dev_feature = featureExtractor(dev_data['sentence'])

CPU times: user 6.15 s, sys: 41.4 ms, total: 6.19 s
Wall time: 6.27 s




In [255]:
%%time
train_feature = featureExtractor(train_data['sentence'])

  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


CPU times: user 36.3 s, sys: 72 ms, total: 36.4 s
Wall time: 36.3 s


In [256]:
# TODO PCA, TF-IDF
k=13

train_lexical, train_punct, train_symbol_emoji = train_feature
dev_lexical, dev_punct, dev_symbol_emoji = dev_feature

train_fit_data = np.concatenate((train_lexical, train_punct, train_symbol_emoji), axis=1)
dev_fit_data = np.concatenate((dev_lexical, dev_punct, dev_symbol_emoji), axis=1)

knn_clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=k, weights="distance")
knn_clf.fit(train_fit_data, train_label)

# lexical_clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=k, weights="distance")
# punct_clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=k, weights="distance")
# lexical_clf.fit(train_lexical, train_label)
# punct_clf.fit(train_punct, train_label)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=13, p=2,
                     weights='distance')

In [257]:
%%time
predicted = knn_clf.predict(dev_fit_data)
accuracy = sum(predicted == dev_label)/len(predicted)
print(accuracy)

0.010112963959117805
CPU times: user 12.7 s, sys: 52.4 ms, total: 12.7 s
Wall time: 4.34 s


In [None]:
# tf idf


In [109]:
%%time
#lexical_knn = train_label[lexical_clf.kneighbors(dev_lexical, return_distance=False)]
lexical_predicted = lexical_clf.predict(dev_lexical)
accuracy = sum(lexical_predicted == dev_label)/len(lexical_predicted)
print(accuracy)

0.00021516944593867672
CPU times: user 1.21 s, sys: 23 ms, total: 1.23 s
Wall time: 1.03 s


In [114]:
%%time
#punct_knn = train_label[punct_clf.kneighbors(dev_punct, return_distance=False)]
punct_predicted = punct_clf.predict(dev_punct)
#accuracy = sum(punct_predicted == dev_label)/len(punct_predicted)
accuracy = sum(punct_predicted == dev_label)
print(accuracy)

159
CPU times: user 8.71 s, sys: 50.2 ms, total: 8.76 s
Wall time: 4.08 s


In [97]:
predicted = np.concatenate((lexical_knn, punct_knn),axis=1)

In [107]:
predicted_list = np.zeros((len(dev_label),1))
counter = 0
for i, candidate in enumerate(predicted):
    if dev_label[i] in candidate:
        counter += 1
    predicted_list[i] = np.argmax(np.bincount(candidate))
accuracy = sum(predicted_list.astype('int') == np.array(dev_label)) / len(dev_label)
print(accuracy)

[0. 0. 0. ... 0. 0. 0.]


In [108]:
fvs_bow.

9295