In [19]:
import numpy as np
import pandas as pd
import math
import string
import pickle
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from scipy.cluster.vq import whiten

def get_tf_idf_query_similarity(docs_tfidf, query_tfidf):
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

# def l2_norm(a):
#     return math.sqrt(np.dot(a, a))

# def cosine_similarity(a, b):
#     return np.dot(a,b) / (l2_norm(a) * l2_norm(b))

sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

ascii_digits = '0123456789'
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
stops = stopwords.words('english')
punctuation_str = string.punctuation
punctuation_str_no_quote = punctuation_str.replace("\'", "")
symbol_emoji_list = [":)",  ";)",  ":(",  ":\\",  ":|", ":]", ":[",
                     ":-)", ";-)", ":-(", ":-\\", ":-|", ":-[", ";-]",
                     ":D", ":P", ":-x", ":'-(", ":_(", ":o)", "XD", ":'(", ":->",
                     "o_O", "T_T", "^o^", 
                     ":-D", ":-P","B-)", "8-)", ":-o", ":-O", ":-0", ":-s", ":-S"]

# TODO hashtag #
# TODO hmmmmm hhuuuuugg

In [60]:
version = "v2_1"
dev_set_path = '../data/{0}/dev_set_{0}.txt'.format(version)
train_set_path = '../data/{0}/train_set_{0}.txt'.format(version)

train_file = open(train_set_path, 'rb')
train_data = pickle.load(train_file)
dev_data = pd.read_csv(dev_set_path, header=None, sep='\t')
train_label = np.fromiter(train_data.keys(), dtype=int)
dev_label = (np.array(dev_data)[:,0]).astype('int')
#train_sentence_list = train_data[1]
dev_sentence_list = dev_data[1]

In [175]:
def getTrainFeature(train_data):
    """
    return:
        (m, 287)
        [avg_sentence_length(1), digit_percentage(1),       upper_case_percentage(1),   digit_frequency(10),
         letter_frequency(26),   punctuation_frequency(32), symbol_emoji_frequency(35), function_words_frequency(179),
         avg_word_len(1),        short_word_ratio(1)]
    """
    train_len = len(train_data)
    train_avg_sentence_length = np.zeros((train_len, 1))
    train_sentence_num = np.zeros((train_len, 1))
    
    train_digit_percentage = np.zeros((train_len, 1))
    train_upper_case_percentage = np.zeros((train_len, 1))
    
    train_digit_frequency = np.zeros((train_len, len(ascii_digits)))
    train_letter_frequency = np.zeros((train_len, len(ascii_lowercase)))
    
    # punctuation_str = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # symbol_emoji_list
    train_punctuation_occurrence = np.zeros((train_len, len(punctuation_str)))
    train_symbol_emoji_occurence = np.zeros((train_len, len(symbol_emoji_list)))
    
    train_function_words_frequency = np.zeros((train_len, len(stops)))
    
    train_avg_word_len = np.zeros((train_len, 1))
    
    train_short_word_ratio = np.zeros((train_len, 1))
    
    # train_vocab_richness = np.zeros((train_len, 2))
    
    train_content_words_dict = {}
    
    counter = 0
    for key,values in train_data.items():
        sentence_num = len(values)
        whole_str = " ".join(values)
        
        char_num = len(whole_str) - sentence_num + 1
        train_sentence_num[counter] = sentence_num
        train_avg_sentence_length[counter] = char_num/sentence_num
        
        upper_case_counter = 0
        digit_counter = 0
        for char in whole_str:
            if char.isupper():
                upper_case_counter += 1
                continue
            if char.isdigit():
                digit_counter += 1
                continue
        train_digit_percentage[counter] = upper_case_counter/char_num
        train_upper_case_percentage[counter] = digit_counter/char_num
        
        for index, char in enumerate(ascii_digits):
            train_letter_frequency[counter, index] = whole_str.count(char) / char_num
        
        whole_str_lower = whole_str.lower()
        for index, char in enumerate(ascii_lowercase):
            train_letter_frequency[counter, index] = whole_str_lower.count(char) / char_num
        
        for index, punctuation in enumerate(punctuation_str):
            train_punctuation_occurrence[counter, index] = whole_str.count(punctuation)
        for index, symbol_emoji in enumerate(symbol_emoji_list):
            train_symbol_emoji_occurence[counter, index] = whole_str.count(symbol_emoji)
        
        whole_str_lower_no_punctuation = whole_str_lower
        for index, punctuation in enumerate(punctuation_str_no_quote):
            if punctuation in whole_str_lower_no_punctuation:
                whole_str_lower_no_punctuation = whole_str_lower_no_punctuation.replace(punctuation, " ")
        
        word_list = whole_str_lower_no_punctuation.split()
        word_num = len(word_list)
        if 0 == word_num:
            train_function_words_frequency[counter, index] = 0
            train_avg_word_len[counter, 0] = 0
            train_short_word_ratio[counter, 0] = 0
            train_content_words_list.append([])
            continue
        
        word_list_no_stop_word = whole_str_lower_no_punctuation.split()
        for index, stop_word in enumerate(stops):
            train_function_words_frequency[counter, index] = whole_str_lower_no_punctuation.count(stop_word) / word_num
            while stop_word in word_list_no_stop_word:
                word_list_no_stop_word.remove(stop_word)
        
        word_len = 0
        short_word_num = 0
        for word in word_list:
            word_len_temp = len(word)
            word_len += word_len_temp
            if word_len_temp < 4:
                short_word_num += 1
        train_avg_word_len[counter, 0] = word_len / word_num
        train_short_word_ratio[counter, 0] = short_word_num / word_num
        

        train_content_words_dict[key] = " ".join(word_list_no_stop_word)

    #     hapaxes = filter(lambda x: word_list.count(x) == 1, word_list)
    #     hapaxes_ratio = len(hapaxes) / word_num
    #     train_vocab_richness[counter, 0] = hapaxes_ratio
    #     train_vocab_richness[counter, 1] = 1 - hapaxes_ratio
        
        counter += 1
        
    train_punctuation_frequency = train_punctuation_occurrence / train_sentence_num
    train_symbol_emoji_frequency = train_symbol_emoji_occurence / train_sentence_num
    feature_mat = np.concatenate((train_avg_sentence_length, train_digit_percentage, train_upper_case_percentage,
                                  train_digit_frequency, train_letter_frequency, train_punctuation_frequency,
                                  train_symbol_emoji_frequency, train_function_words_frequency,
                                  train_avg_word_len, train_short_word_ratio), axis=1)
    assert(feature_mat.shape == (train_len, 287))
    return (feature_mat, train_content_words_dict)


In [176]:
def getDevFeature(dev_sentence_list):
    """
    return:
        (m, 287)
        [avg_sentence_length(1), digit_percentage(1),       upper_case_percentage(1),   digit_frequency(10),
         letter_frequency(26),   punctuation_frequency(32), symbol_emoji_frequency(35), function_words_frequency(179),
         avg_word_len(1),        short_word_ratio(1)]
    """
    dev_len = len(dev_sentence_list)
    dev_avg_sentence_length = np.zeros((dev_len, 1))
    
    dev_digit_percentage = np.zeros((dev_len, 1))
    dev_upper_case_percentage = np.zeros((dev_len, 1))
    
    dev_digit_frequency = np.zeros((dev_len, len(ascii_digits)))
    dev_letter_frequency = np.zeros((dev_len, len(ascii_lowercase)))
    
    # punctuation_str = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # symbol_emoji_list
    dev_punctuation_frequency = np.zeros((dev_len, len(punctuation_str)))
    dev_symbol_emoji_frequency = np.zeros((dev_len, len(symbol_emoji_list)))
    
    dev_function_words_frequency = np.zeros((dev_len, len(stops)))
    
    dev_avg_word_len = np.zeros((dev_len, 1))
    
    dev_short_word_ratio = np.zeros((dev_len, 1))
    
    dev_content_words_list = []
    
    counter = 0
    for sentence in dev_sentence_list:
        char_num = len(sentence)
        dev_avg_sentence_length[counter] = char_num
        
        upper_case_counter = 0
        digit_counter = 0
        for char in sentence:
            if char.isupper():
                upper_case_counter += 1
                continue
            if char.isdigit():
                digit_counter += 1
                continue
        dev_digit_percentage[counter] = upper_case_counter/char_num
        dev_upper_case_percentage[counter] = digit_counter/char_num
        
        for index, char in enumerate(ascii_digits):
            dev_letter_frequency[counter, index] = sentence.count(char) / char_num
        
        whole_str_lower = sentence.lower()
        for index, char in enumerate(ascii_lowercase):
            dev_letter_frequency[counter, index] = whole_str_lower.count(char) / char_num
        
        for index, punctuation in enumerate(punctuation_str):
            dev_punctuation_frequency[counter, index] = sentence.count(punctuation)
        for index, symbol_emoji in enumerate(symbol_emoji_list):
            dev_symbol_emoji_frequency[counter, index] = sentence.count(symbol_emoji)
        
        whole_str_lower_no_punctuation = whole_str_lower
        for index, punctuation in enumerate(punctuation_str_no_quote):
            if punctuation in whole_str_lower_no_punctuation:
                whole_str_lower_no_punctuation = whole_str_lower_no_punctuation.replace(punctuation, " ")
        
        word_list = whole_str_lower_no_punctuation.split()
        word_num = len(word_list)
        if 0 == word_num:
            dev_function_words_frequency[counter, index] = 0
            dev_avg_word_len[counter, 0] = 0
            dev_short_word_ratio[counter, 0] = 0
            dev_content_words_list.append([])
            continue

        word_list_no_stop_word = whole_str_lower_no_punctuation.split()
        for index, stop_word in enumerate(stops):
            dev_function_words_frequency[counter, index] = whole_str_lower_no_punctuation.count(stop_word) / word_num
            while stop_word in word_list_no_stop_word:
                word_list_no_stop_word.remove(stop_word)
        
        word_len = 0
        short_word_num = 0
        for word in word_list:
            word_len_temp = len(word)
            word_len += word_len_temp
            if word_len_temp < 4:
                short_word_num += 1
        dev_avg_word_len[counter, 0] = word_len / word_num
        dev_short_word_ratio[counter, 0] = short_word_num / word_num
        

        dev_content_words_list.append(" ".join(word_list_no_stop_word))
        counter += 1
    feature_mat = np.concatenate((dev_avg_sentence_length, dev_digit_percentage, dev_upper_case_percentage,
                                  dev_digit_frequency, dev_letter_frequency, dev_punctuation_frequency,
                                  dev_symbol_emoji_frequency, dev_function_words_frequency,
                                  dev_avg_word_len, dev_short_word_ratio), axis=1)
    assert(feature_mat.shape == (dev_len, 287))
    return (feature_mat, dev_content_words_list)


In [177]:
%%time
train_feature_mat, train_new_word_dict = getTrainFeature(train_data)

CPU times: user 21 s, sys: 63.4 ms, total: 21 s
Wall time: 21.1 s


In [178]:
%%time
dev_feature_mat, dev_new_word_list = getDevFeature(dev_sentence_list)

CPU times: user 4.6 s, sys: 57.6 ms, total: 4.66 s
Wall time: 4.67 s


In [43]:
%%time
k = 13
knn_clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=k, weights="distance")
knn_clf.fit(train_feature_mat, train_label)

CPU times: user 96.1 ms, sys: 4.52 ms, total: 101 ms
Wall time: 99.7 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=13, p=2,
                     weights='distance')

In [44]:
%%time
predicted = knn_clf.predict(dev_feature_mat)
accuracy = accuracy_score(predicted, dev_label)
print("accuracy: {0:.4f}".format(accuracy))

accuracy: 0.0012
CPU times: user 24.3 s, sys: 1.95 s, total: 26.3 s
Wall time: 16.9 s


In [89]:
%%time
k = 1000
k_neighbors_list = knn_clf.kneighbors(dev_feature_mat, n_neighbors=k, return_distance=False)


CPU times: user 39.3 s, sys: 585 ms, total: 39.9 s
Wall time: 11.6 s


In [90]:
k_neighbors_label = train_label[k_neighbors_list.ravel()].reshape(k_neighbors_list.shape)

In [91]:
repeat_dev_label = np.tile(dev_label, (k,1)).T
result_mat = np.sum(np.equal(k_neighbors_label, repeat_dev_label), axis=1)
result_mat = (0 != result_mat).astype('int')
print(np.mean(result_mat))

0.12440978462850702


In [None]:
file = "../data/glove.6B.100d.txt"
import numpy as np
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    
     
    with open(gloveFile, encoding="utf8" ) as f:
        content = f.readlines()
    model = {}
    for line in content:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model
     
     
model= loadGloveModel(file) 

In [None]:
# packages
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance
def cosine_distance_countvectorizer_method(s1, s2):
    # sentences to list
    allsentences = [s1 , s2]
    
    # text to vector
    vectorizer = CountVectorizer()
    all_sentences_to_vector = vectorizer.fit_transform(allsentences)
    text_to_vector_v1 = all_sentences_to_vector.toarray()[0].tolist()
    text_to_vector_v2 = all_sentences_to_vector.toarray()[1].tolist()
    
    # distance of similarity
    cosine = distance.cosine(text_to_vector_v1, text_to_vector_v2)
    #print('Similarity of two sentences are equal to ',round((1-cosine)*100,2),'%')
    return cosine

In [None]:
import re
from nltk.corpus import stopwords
import pandas as pd
import scipy


def preprocess(raw_text):

    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split 
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))
    cleaned_words = list(set([w for w in words if w not in stopword_set]))

    return cleaned_words

def cosine_distance_between_two_words(word1, word2):
    import scipy
    return (1- scipy.spatial.distance.cosine(model[word1], model[word2]))

def calculate_heat_matrix_for_two_sentences(s1,s2):
    s1 = preprocess(s1)
    s2 = preprocess(s2)
    result_list = [[cosine_distance_between_two_words(word1, word2) for word2 in s2] for word1 in s1]
    result_df = pd.DataFrame(result_list)
    result_df.columns = s2
    result_df.index = s1
    return result_df

def cosine_distance_wordembedding_method(s1, s2):
    vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')

def wordembedding_method(s1):
    return np.mean([model[word] for word in s1],axis=0)

In [None]:
train_set_csv = "../data/v1_4/train_set_v1_4.txt"
dev_set_csv = "../data/v1_4/dev_set_v1_4.txt"

train_csv = pd.read_csv(train_set_csv, sep='\t', header=None)
train_csv = np.array(train_csv)
dev_csv = pd.read_csv(dev_set_csv, sep='\t', header=None)
dev_csv = np.array(dev_csv)

train_label = (train_csv[:,0]).astype('int')
train_sentence_list = train_csv[:,1]
dev_label = (dev_csv[:,0]).astype('int')
dev_sentence_list = dev_csv[:,1]

In [None]:
%%time
cleaned_train_sentence_list = []
for train_sentence in train_sentence_list:
    cleaned_train_sentence_list.append(preprocess(train_sentence))

In [None]:
%%time
cleaned_dev_sentence_list = []
for dev_sentence in dev_sentence_list:
    cleaned_dev_sentence_list.append(preprocess(dev_sentence))

In [None]:
cleaned_train_sentence_df = pd.DataFrame(cleaned_train_sentence_list)
cleaned_dev_sentence_df = pd.DataFrame(cleaned_dev_sentence_list)

In [None]:
cleaned_train_sentence_vec = cleaned_train_sentence_df.apply(wordembedding_method)
cleaned_dev_sentence_vec = cleaned_dev_sentence_df.apply(wordembedding_method)

In [None]:
%%time
for dev_sentence in dev_sentence_list:
    for train_sentence in train_sentence_list:
        cosine_similarity = cosine_distance_countvectorizer_method(dev_sentence, train_sentence)
    #print(cosine_similarity)
    break