In [1]:
import numpy as np
import pandas as pd
import math
import string
import pickle
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from scipy.cluster.vq import whiten

def get_tf_idf_query_similarity(docs_tfidf, query_tfidf):
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    return cosineSimilarities

# def l2_norm(a):
#     return math.sqrt(np.dot(a, a))

# def cosine_similarity(a, b):
#     return np.dot(a,b) / (l2_norm(a) * l2_norm(b))

sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

stops = stopwords.words('english')
punctuation_str = string.punctuation
symbol_emoji_list = [":)",  ";)",  ":(",  ":\\",  ":|", ":]", ":[",
                     ":-)", ";-)", ":-(", ":-\\", ":-|", ":-[", ";-]",
                     ":D", ":P", ":-x", ":'-(", ":_(", ":o)", "XD", ":'(", ":->",
                     "o_O", "T_T", "^o^", 
                     ":-D", ":-P","B-)", "8-)", ":-o", ":-O", ":-0", ":-s", ":-S"]
# TODO hashtag #
# TODO hmmmmm hhuuuuugg

In [2]:
version = "v2_1"
dev_set_path = '../data/{0}/dev_set_{0}.txt'.format(version)
train_set_path = '../data/{0}/train_set_{0}.txt'.format(version)

train_file = open(train_set_path, 'rb')
train_data = pickle.load(train_file)
dev_data = pd.read_csv(dev_set_path, header=None, sep='\t')
#train_label = (np.array(train_data)[:,0]).astype('int')
dev_label = (np.array(dev_data)[:,0]).astype('int')
#train_sentence_list = train_data[1]
dev_sentence_list = dev_data[1]

In [34]:
train_data

{8746: ['Grrr....you must be going crazy!',
  'Hi Becky - Meaghan at airport on way to Boston for petscans',
  'am driving to ohio...look fforward to responding sun....have a great holiday!',
  'Travelling for the Holiday? Send us a pic of your Gettington Bag in action! ...',
  'It is an honor and a pleasure!',
  "Now I'm dreaming of new boots and soft cashmere. Time to go shopping #gno #anntaylor",
  'we let jake take one last week when he had a half day...they need them too!',
  "U are so cute - I don't have one! #anntaylor",
  'How many little black dresses do u have? #anntaylor',
  'Become a fan of the fashionable FB page! facebook #gno #anntaylor',
  'only 4 more hours til Akron!',
  'Still in car....want to jump out....45 minutes eta!!!',
  'Thank you for the love!! MY PLEASURE!',
  'o my gosh i have fallen in love all over again with ann taylor #gno #anntaylorstyle',
  'Ft. Hood officials confirm the 2 other soldiers initially held as suspects have been released',
  'Will they p

In [33]:
file = "../data/glove.6B.100d.txt"
import numpy as np
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    
     
    with open(gloveFile, encoding="utf8" ) as f:
        content = f.readlines()
    model = {}
    for line in content:
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model
     
     
model= loadGloveModel(file) 

Loading Glove Model
Done. 400000  words loaded!


In [16]:
# packages
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance
def cosine_distance_countvectorizer_method(s1, s2):
    # sentences to list
    allsentences = [s1 , s2]
    
    # text to vector
    vectorizer = CountVectorizer()
    all_sentences_to_vector = vectorizer.fit_transform(allsentences)
    text_to_vector_v1 = all_sentences_to_vector.toarray()[0].tolist()
    text_to_vector_v2 = all_sentences_to_vector.toarray()[1].tolist()
    
    # distance of similarity
    cosine = distance.cosine(text_to_vector_v1, text_to_vector_v2)
    #print('Similarity of two sentences are equal to ',round((1-cosine)*100,2),'%')
    return cosine

In [29]:
import re
from nltk.corpus import stopwords
import pandas as pd
import scipy


def preprocess(raw_text):

    # keep only words
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split 
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))
    cleaned_words = list(set([w for w in words if w not in stopword_set]))

    return cleaned_words

def cosine_distance_between_two_words(word1, word2):
    import scipy
    return (1- scipy.spatial.distance.cosine(model[word1], model[word2]))

def calculate_heat_matrix_for_two_sentences(s1,s2):
    s1 = preprocess(s1)
    s2 = preprocess(s2)
    result_list = [[cosine_distance_between_two_words(word1, word2) for word2 in s2] for word1 in s1]
    result_df = pd.DataFrame(result_list)
    result_df.columns = s2
    result_df.index = s1
    return result_df

def cosine_distance_wordembedding_method(s1, s2):
    vector_1 = np.mean([model[word] for word in preprocess(s1)],axis=0)
    vector_2 = np.mean([model[word] for word in preprocess(s2)],axis=0)
    cosine = scipy.spatial.distance.cosine(vector_1, vector_2)
    print('Word Embedding method with a cosine distance asses that our two sentences are similar to',round((1-cosine)*100,2),'%')

def wordembedding_method(s1):
    return np.mean([model[word] for word in s1],axis=0)

In [31]:
train_set_csv = "../data/v1_4/train_set_v1_4.txt"
dev_set_csv = "../data/v1_4/dev_set_v1_4.txt"

train_csv = pd.read_csv(train_set_csv, sep='\t', header=None)
train_csv = np.array(train_csv)
dev_csv = pd.read_csv(dev_set_csv, sep='\t', header=None)
dev_csv = np.array(dev_csv)

train_label = (train_csv[:,0]).astype('int')
train_sentence_list = train_csv[:,1]
dev_label = (dev_csv[:,0]).astype('int')
dev_sentence_list = dev_csv[:,1]

In [19]:
%%time
cleaned_train_sentence_list = []
for train_sentence in train_sentence_list:
    cleaned_train_sentence_list.append(preprocess(train_sentence))

CPU times: user 21.4 s, sys: 1.69 s, total: 23 s
Wall time: 23.2 s


In [24]:
%%time
cleaned_dev_sentence_list = []
for dev_sentence in dev_sentence_list:
    cleaned_dev_sentence_list.append(preprocess(dev_sentence))

CPU times: user 2.78 s, sys: 188 ms, total: 2.97 s
Wall time: 2.99 s


In [25]:
cleaned_train_sentence_df = pd.DataFrame(cleaned_train_sentence_list)
cleaned_dev_sentence_df = pd.DataFrame(cleaned_dev_sentence_list)

In [30]:
cleaned_train_sentence_vec = cleaned_train_sentence_df.apply(wordembedding_method)
cleaned_dev_sentence_vec = cleaned_dev_sentence_df.apply(wordembedding_method)

KeyError: ('budurl', 'occurred at index 0')

In [17]:
%%time
for dev_sentence in dev_sentence_list:
    for train_sentence in train_sentence_list:
        cosine_similarity = cosine_distance_countvectorizer_method(dev_sentence, train_sentence)
    #print(cosine_similarity)
    break

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
 

  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  dist = 1.0 - uv / np.sqrt(uu * vv)


CPU times: user 1min 53s, sys: 736 ms, total: 1min 54s
Wall time: 1min 56s
