In [12]:
import os
import re
import operator
import nltk 
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_transform as tft

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

DATA_HOME = os.path.join('data/')
TRAIN_FILE_NAME = 'fulldocs.tsv'

Unnamed: 0,Url,Title,Body
0,https://answers.yahoo.com/question/index?qid=2...,The hot glowing surfaces of stars emit energy ...,Science & Mathematics PhysicsThe hot glowing s...
1,http://childparenting.about.com/od/physicalemo...,Developmental Milestones and Your 8-Year-Old C...,School-Age Kids Growth & DevelopmentDevelopmen...
2,http://visihow.com/Check_for_Lice_Nits,Check for Lice Nits,Check for Lice NitsEdited by Mian Sheilette On...
3,http://www.nytimes.com/2010/01/05/business/glo...,Dubai Opens a Tower to Beat All,Global BusinessDubai Opens a Tower to Beat All...
4,http://www.realtor.com/realestateandhomes-sear...,"Coulterville, CA Real Estate & Homes for Sale","Coulterville, CA Real Estate & Homes for Sale4..."


In [169]:
nrows = 100000

data_train = pd.read_csv(DATA_HOME + TRAIN_FILE_NAME, header=None, nrows=nrows, sep='\t')
data_train.columns = ['Url', 'Title', 'Body']

data_train.head()

Unnamed: 0,Url,Title,Body
0,https://answers.yahoo.com/question/index?qid=2...,The hot glowing surfaces of stars emit energy ...,Science & Mathematics PhysicsThe hot glowing s...
1,http://childparenting.about.com/od/physicalemo...,Developmental Milestones and Your 8-Year-Old C...,School-Age Kids Growth & DevelopmentDevelopmen...
2,http://visihow.com/Check_for_Lice_Nits,Check for Lice Nits,Check for Lice NitsEdited by Mian Sheilette On...
3,http://www.nytimes.com/2010/01/05/business/glo...,Dubai Opens a Tower to Beat All,Global BusinessDubai Opens a Tower to Beat All...
4,http://www.realtor.com/realestateandhomes-sear...,"Coulterville, CA Real Estate & Homes for Sale","Coulterville, CA Real Estate & Homes for Sale4..."


### Load data

In [178]:
import ir_datasets
dataset = ir_datasets.load('msmarco-document')

[INFO] Please confirm you agree to the MSMARCO data usage agreement found at <http://www.msmarco.org/dataset.aspx>
[INFO] If you have a local copy of https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.trec.gz, you can symlink it here to avoid downloading it again: C:\Users\marti\.ir_datasets\downloads\d4863e4f342982b51b9a8fc668b2d0c0
[INFO] [starting] https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.trec.gz
[INFO] [finished] https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.trec.gz: [1:00:43] [8.50GB] [2.33MB/s]
                                                                                                        

In [185]:
dataset_sample = dataset.docs_iter()[0:100000]

[INFO] [starting] building docstore
docs_iter: 86262it [02:04, 694.85it/s]
[INFO] [error] docs_iter: [02:04] [86262it] [694.83it/s]
[INFO] [error] building docstore [02:04]


KeyboardInterrupt: 

In [187]:
len(dataset)

TypeError: object of type 'Dataset' has no len()

In [194]:
counter = 0
length = 10000
dataset_sample = []
for doc in dataset.docs_iter():
    if counter > length:
        break
    # print(doc)
    dataset_sample.append(doc)
    counter += 1

In [195]:
len(dataset_sample)

10001

In [196]:
dataset_sample[0]

MsMarcoDocument(doc_id='D1555982', url='https://answers.yahoo.com/question/index?qid=20071007114826AAwCFvR', title='The hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?', body='Science & Mathematics Physics\nThe hot glowing surfaces of stars emit energy in the form of electromagnetic radiation.?\nIt is a good approximation to assume that the emissivity e is equal to 1 for these surfaces.\nFind the radius of the star Rigel, the bright blue star in the constellation Orion that radiates energy at a rate of 2.7 x 10^32 W and has a surface temperature of 11,000 K. Assume that the star is spherical.\nUse σ =... show more\nFollow 3 answers\nAnswers\nRelevance\nRating\nNewest\nOldest\nBest Answer: Stefan-Boltzmann law states that the energy flux by radiation is proportional to the forth power of the temperature: q = ε · σ · T^4 The total energy flux at a spherical surface of Radius R is Q = q·π·R² = ε·σ·T^4·π·R² Hence the radius is R = √ ( Q / (ε·σ·T^4·π) ) 

In [72]:
train_queries = pd.read_csv(DATA_HOME + 'queries.doctrain.tsv', header=None, nrows=nrows, sep='\t')
train_queries.columns = ['qid', 'query']

train_top100 = pd.read_csv(DATA_HOME + 'msmarco-doctrain-top100', header=None, nrows=nrows, sep='\t')

train_queries.head()

Unnamed: 0,qid,query
0,1185869,)what was the immediate impact of the success ...
1,1185868,_________ justice is designed to repair the ha...
2,1183785,elegxo meaning
3,645590,what does physical medicine do
4,186154,feeding rice cereal how many times per day


In [71]:
train_top100.head()

Unnamed: 0,0
0,1185869 Q0 D59221 1 -4.80433 IndriQueryLikelihood
1,1185869 Q0 D59220 2 -4.92127 IndriQueryLikelihood
2,1185869 Q0 D2192591 3 -5.05215 IndriQueryLikel...
3,1185869 Q0 D2777518 4 -5.05486 IndriQueryLikel...
4,1185869 Q0 D2371978 5 -5.07048 IndriQueryLikel...


In [74]:
train_queries[train_queries['qid']==1185869]['query']

0    )what was the immediate impact of the success ...
Name: query, dtype: object

In [11]:
VOCAB_SIZE = 10000

def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns."""
    url = inputs['Url']
    title = inputs['Title']
    body = inputs['Body']
    
    print(body)
    
    tk = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
    tk.fit_on_texts(body)

    tfidf_mat = tk.sequences_to_matrix(tk.texts_to_sequences(body), mode='tfidf')
    
    return {
        'tfidf_mat': tfidf_mat
    }

preprocessing_fn(data_train[:5])

0    Science & Mathematics PhysicsThe hot glowing s...
1    School-Age Kids Growth & DevelopmentDevelopmen...
2    Check for Lice NitsEdited by Mian Sheilette On...
3    Global BusinessDubai Opens a Tower to Beat All...
4    Coulterville, CA Real Estate & Homes for Sale4...
Name: Body, dtype: object


{'tfidf_mat': array([[0.   , 2.522, 0.693, ..., 0.   , 0.   , 0.   ],
        [0.   , 2.924, 3.332, ..., 0.   , 0.   , 0.   ],
        [0.   , 4.198, 3.471, ..., 0.   , 0.   , 0.   ],
        [0.   , 3.756, 3.003, ..., 0.   , 0.   , 0.   ],
        [0.   , 0.   , 0.   , ..., 0.   , 0.   , 0.   ]])}

In [141]:
nltk.download('stopwords')

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def filter_tokens(tokens, verbose=False):
    if verbose:
        print('Length before: {}'.format(len(tokens)))
    # Remove special char tokens
    tokens = [token for token in tokens if token not in '...,?:;·|``()&\'\'\'s____--']
    
    # Remove the ending of tokens ending with ".\d"
    tokens = [re.sub(r'\.\d', '', token) for token in tokens]
    
    # Remove trailing dots
    tokens = [token.rstrip('.') for token in tokens]

    # Remove trailing slashes
    tokens = [token.rstrip('/') for token in tokens]

    # Remove all empty and one letter tokens
    tokens = [token for token in tokens if not len(token) < 2]
    
    # Remove long words, which are most likely a concatination of several words
    word_max_length = 25
    tokens = [token for token in tokens if not len(token) > word_max_length]
    
    # Remove all nr tokens
    tokens = [token for token in tokens if not is_number(token)]
    
    # Remove math equations
    tokens = [token for token in tokens if not re.search(r'\^|·|π|\d+,\d+|\d\/|\+', token)]
    
    # Remove stopwords
    s_words = stopwords.words('english')
    tokens = [token for token in tokens if token not in s_words]
        
    # Split tokens where there is a dot between words
    new_tokens = []
    dot_regex = r'.{3,}\..'
    for token in tokens:
        if re.match(dot_regex, token):
            splitted_tokens = token.split('.')
            for s_token in splitted_tokens:
                new_tokens.append(s_token)
        else:
            new_tokens.append(token)
        
    tokens = new_tokens
    
    if verbose:
        print('Length after: {}'.format(len(tokens)))

    return tokens

test_tokens = ['science', '&', 'mathematics', 'physicsthe', 'hot', 'glowing', 'surfaces', 'of', 'stars', 'emit', 'energy', 'in', 'the', 'form', 'of', 'electromagnetic', 'radiation', '.', '?', 'it', 'is', 'a', 'good', 'approximation', 'to', 'assume', 'that', 'the', 'emissivity', 'e', 'is', 'equal', 'to', '1', 'for', 'these', 'surfaces', '.', 'find', 'the', 'radius', 'of', 'the', 'star', 'rigel', ',', 'the', 'bright', 'blue', 'star', 'in', 'the', 'constellation', 'orion', 'that', 'radiates', 'energy', 'at', 'a', 'rate', 'of', '2.7', 'x', '10^32', 'w', 'and', 'has', 'a', 'surface', 'temperature', 'of', '11,000', 'k.', 'assume', 'that', 'the', 'star', 'is', 'spherical', '.', 'use', 'σ', '=', '...', 'show', 'morefollow', '3', 'answersanswersrelevanceratingnewestoldestbest', 'answer', ':', 'stefan-boltzmann', 'law', 'states', 'that', 'the', 'energy', 'flux', 'by', 'radiation', 'is', 'proportional', 'to', 'the', 'forth', 'power', 'of', 'the', 'temperature', ':', 'q', '=', 'ε', '·', 'σ', '·', 't^4', 'the', 'total', 'energy', 'flux', 'at', 'a', 'spherical', 'surface', 'of', 'radius', 'r', 'is', 'q', '=', 'q·π·r²', '=', 'ε·σ·t^4·π·r²', 'hence', 'the', 'radius', 'is', 'r', '=', '√', '(', 'q', '/', '(', 'ε·σ·t^4·π', ')', ')', '=', '√', '(', '2.7x10+32', 'w', '/', '(', '1', '·', '5.67x10-8w/m²k^4', '·', '(', '1100k', ')', '^4', '·', 'π', ')', ')', '=', '3.22x10+13', 'msource', '(', 's', ')', ':', 'http', ':', '//en.wikipedia.org/wiki/stefan_bolt', '...', 'schmiso', '·', '1', 'decade', 'ago0', '18', 'commentschmiso', ',', 'you', 'forgot', 'a', '4', 'in', 'your', 'answer', '.', 'your', 'link', 'even', 'says', 'it', ':', 'l', '=', '4pi', '(', 'r^2', ')', 'sigma', '(', 't^4', ')', '.', 'using', 'l', ',', 'luminosity', ',', 'as', 'the', 'energy', 'in', 'this', 'problem', ',', 'you', 'can', 'find', 'the', 'radius', 'r', 'by', 'doing', 'sqrt', '(', 'l/', '(', '4pisigma', '(', 't^4', ')', ')', '.', 'hope', 'this', 'helps', 'everyone.caroline', '·', '4', 'years', 'ago4', '1', 'comment', '(', 'stefan-boltzmann', 'law', ')', 'l', '=', '4pi', '*', 'r^2', '*', 'sigma', '*', 't^4', 'solving', 'for', 'r', 'we', 'get', ':', '=', '>', 'r', '=', '(', '1/', '(', '2t^2', ')', ')', '*', 'sqrt', '(', 'l/', '(', 'pi', '*', 'sigma', ')', ')', 'plugging', 'in', 'your', 'values', 'you', 'should', 'get', ':', '=', '>', 'r', '=', '(', '1/', '(', '2', '(', '11,000k', ')', '^2', ')', ')', '*', 'sqrt', '(', '(', '2.7', '*', '10^32w', ')', '/', '(', 'pi', '*', '(', '5.67', '*', '10^-8', 'w/m^2k^4', ')', ')', ')', 'r', '=', '1.609', '*', '10^11', 'm', '?', '·', '3', 'years', 'ago0', '1', 'commentmaybe', 'you', 'would', 'like', 'to', 'learn', 'more', 'about', 'one', 'of', 'these', '?', 'want', 'to', 'build', 'a', 'free', 'website', '?', 'interested', 'in', 'dating', 'sites', '?', 'need', 'a', 'home', 'security', 'safe', '?', 'how', 'to', 'order', 'contacts', 'online', '?']
filter_tokens(test_tokens, True)

Length before: 379
Length after: 121


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['science',
 'mathematics',
 'physicsthe',
 'hot',
 'glowing',
 'surfaces',
 'stars',
 'emit',
 'energy',
 'form',
 'electromagnetic',
 'radiation',
 'good',
 'approximation',
 'assume',
 'emissivity',
 'equal',
 'surfaces',
 'find',
 'radius',
 'star',
 'rigel',
 'bright',
 'blue',
 'star',
 'constellation',
 'orion',
 'radiates',
 'energy',
 'rate',
 'surface',
 'temperature',
 'assume',
 'star',
 'spherical',
 'use',
 'show',
 'morefollow',
 'answer',
 'stefan-boltzmann',
 'law',
 'states',
 'energy',
 'flux',
 'radiation',
 'proportional',
 'forth',
 'power',
 'temperature',
 'total',
 'energy',
 'flux',
 'spherical',
 'surface',
 'radius',
 'hence',
 'radius',
 '1100k',
 'msource',
 'http',
 'schmiso',
 'decade',
 'ago0',
 'commentschmiso',
 'forgot',
 'answer',
 'link',
 'even',
 'says',
 '4pi',
 'sigma',
 'using',
 'luminosity',
 'energy',
 'problem',
 'find',
 'radius',
 'sqrt',
 '4pisigma',
 'hope',
 'helps',
 'everyone',
 'caroline',
 'years',
 'ago4',
 'comment',
 'stefan-bo

### Preprocess Pandas data

In [176]:
import contractions

def preprocess_data():
    # Remove entries with "NaN" as body text
    data_train_new = data_train[data_train['Body'].notnull()]
    
    # Update the row indicies
    data_train_new = data_train_new.reset_index(drop=True)
    
    # Expand contractions
    for i, entry in enumerate(data_train_new['Body']):
        try:
            s = contractions.fix(entry)
        except Exception as e:
            print(i)
            print(data_train_new['Body'][i])
            print(data_train_new['Url'][i])
            print(repr(e))
        
    # data_train_new['Body'] = [contractions.fix(entry) for entry in data_train_new['Body']]
    
    # Lowercase
    data_train_new['Body'] = [entry.lower() for entry in data_train_new['Body']]
    
    # Remove any URLs from the text
    # Regex taken from here: https://gist.github.com/gruber/8891611
    url_regex = r'(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))'
    data_train_new['Body'] = [re.sub(url_regex, ' ', entry) for entry in data_train_new['Body']]
        
    # Tokenize
    data_train_new['Word tokenize'] = [filter_tokens(word_tokenize(entry)) for entry in data_train_new['Body']]
        
    # print(data_train_new['Word tokenize'][:5])
    # print(data_train_new['Word tokenize'][0])
    
    return data_train_new
    
    
data_train_processed = preprocess_data()

2675
Local News Atlanta PoliticsPresident Obama’s mixed record of minority judicial appointeesBy David A. Love - February 6, 2014tweetPresident Barack Obama speaks while nominating Cornelia T. L. Pillard (2nd-L), a law professor, Patricia Ann Millett (R), an appellate lawyer, and Robert L. Wilkins (L), to become federal judges, during an event in the Rose Garden of the White House June 4, 2013 in Washington, DC. If confirmed by the U.S. Senate the three nominees will fill three vacancies on United States Court of Appeals for the District of Columbia. (Photo by Mark Wilson/Getty Images)How is President’s Obama’s record on diversity among judicial nominees?If you ask members of the Congressional Black Caucus and some civil rights leaders, they will say the president falls short when it comes to minorities on the federal bench, especially in states such as Georgia. But that’s only part of the story. Obama has a record of accomplishment on diverse appointees, but he made a deal in Georgia 

KeyboardInterrupt: 

### Preprocess data ir_datasets

In [204]:
import contractions

# Regex taken from here: https://gist.github.com/gruber/8891611
url_regex = r'(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))'

def create_tokens(body):
    # Expand contractions       
    try:
        body = contractions.fix(body)
    except Exception as e:
        print(body)
    
    # body = ' '.join([contractions.fix(word) for word in body.split()])

    # Lowercase
    body = body.lower()

    # Remove any URLs from the text
    body = re.sub(url_regex, ' ', body)    

    # Tokenize
    tokens = filter_tokens(word_tokenize(body))
    
    return tokens
    
def preprocess_dataset():
    # Remove entries that don't have strings as body text
    dataset_new = [entry for entry in dataset_sample if isinstance(entry[3], str)]
    
    # Update the row indicies
    # data_train_new = data_train_new.reset_index(drop=True)
    
    tokens_dict = {}
    for entry in dataset_new:
        tokens_dict[entry[0]] = create_tokens(entry[3])
        
    return tokens_dict
    
    
dataset_processed = preprocess_dataset()

İzmir, historically Smyrna, city in western Turkey.
The country’s third largest city and one of its largest ports, İzmir lies at the head of the sheltered Gulf of İzmir on the deeply indented coast of the Aegean Sea.
Pop.
(2000) 2,232,265; (2013 est.)
2,803,418.
Clock tower, İzmir, Turkey.
Fritz Henle/Photo Researchers
Historyİzmir is one of the oldest cities of the Mediterranean world and has been of almost continuous historical importance during the last 5,000 years.
Excavations indicate settlement contemporary with that of the first city of Troy, dating from the 3rd millennium bce.
Greek settlement is first clearly attested by the presence of pottery dating from about 1000 bce.
According to the Greek historian Herodotus, the Greek city was founded by Aeolians but soon was seized by Ionians.
From modest beginnings, it grew into a stately city in the 7th century, with massive fortifications and blocks of two-storied houses.
Captured by Alyattes of Lydia about 600 bce, it ceased to exi

KeyboardInterrupt: 

In [174]:
data_train.loc[2675, 'Body']

'Monosaccharides> Monosaccharides, Biology | AssignmentsQuestion-What type of bond holds the two monosaccharides together in a disaccharide?a) Ester bondb) Hydrogen bondc) Amide bondd) Ether bond (or glycosidic bond)e) Intermolecular bond.Posted Date: 2/24/2014 12:46:16 AM | Location : United Kingdom (UK)Excel in your CourseExperts are helping students not just improving grades but also to provide better learning of subject concepts and its problem statements. They are providing you world class assistance which may help you to excel in course or assignments.Tutor service in UK?UK Assignment HelpUK Assessments HelpUK Custom Writing ServicesHelp with UK StudiesPaper/Essay Editing-FormattingManagement/Programming/Engineering HelpOrder NowMost Recent PostsUK Perdisco Assignment Help, UK PERDISCO Practice Set HelpUniversity of Brighton, Assignment Help, Tutor Help UKUniversity for the Creative Arts, Assignment Help UKThe University of Warwick, Assignment Help UKUniversity of Bradford Assign

In [125]:
data_train_processed.loc[1, 'Url']

'http://childparenting.about.com/od/physicalemotionalgrowth/tp/Child-Development-Your-Eight-Year-Old-Child.htm'

In [144]:
data_train_processed.loc[1, 'Body']

'school-age kids growth & developmentdevelopmental milestones and your 8-year-old child8-year-olds are expanding their worldsby katherine lee | reviewed by joel forman, mdupdated february 10, 2018share pin emailprinteight-year-olds are becoming more confident about themselves and who they are. at age 8, your child will likely have developed some interests and hobbies and will know what he or she likes or does not   the same time, children this age are learning more about the world at large and are also better able to navigate social relationships with others more independently, with less guidance from parents. at home, 8-year-olds are able to tackle more complicated household chores and take on more responsibility for taking care of themselves, even helping out with younger   general, according to the cdc, these are some changes you may see in your child:shows more independence from parents and family.starts to think about the future.understands more about his or her place in the world

In [165]:
from sklearn.feature_extraction.text import TfidfVectorizer
import operator

## Create Vocabulary
vocabulary = set()

for doc in data_train_processed['Word tokenize']:
    vocabulary.update(doc)

vocabulary = list(vocabulary)

# Intializating the tfIdf model
tfidf = TfidfVectorizer(vocabulary=vocabulary)

# Fit the TfIdf model
tfidf.fit(data_train_processed['Body'])

# Transform the TfIdf model
tfidf_tran=tfidf.transform(data_train_processed['Body'])

In [146]:
vocab_count = dict()

for doc in data_train_processed['Word tokenize']:
    for token in doc:
        if token not in vocab_count:
            vocab_count[token] = 1
        else:
            vocab_count[token] += 1
    
vocab_count = dict(sorted(vocab_count.items(), key=lambda item: item[1], reverse=True))
vocab_count

{'also': 1265,
 'one': 1041,
 'may': 1002,
 'new': 774,
 'time': 767,
 'use': 731,
 'see': 629,
 'would': 611,
 'like': 559,
 'people': 543,
 'day': 543,
 'used': 537,
 'two': 535,
 'first': 534,
 'edit': 512,
 'many': 474,
 'need': 447,
 'get': 439,
 'world': 439,
 'fallacy': 438,
 'make': 435,
 'control': 432,
 'system': 432,
 'the': 429,
 'data': 422,
 'years': 416,
 'year': 404,
 'even': 401,
 'us': 385,
 'using': 370,
 'information': 355,
 'earned': 343,
 'states': 337,
 'well': 334,
 'card': 334,
 'good': 330,
 'work': 328,
 'back': 321,
 'state': 321,
 'found': 315,
 'light': 308,
 'much': 305,
 'way': 304,
 'high': 304,
 'different': 302,
 'retrieved': 300,
 'help': 299,
 'called': 298,
 'united': 296,
 'american': 294,
 'made': 292,
 'sd': 292,
 'right': 287,
 'days': 286,
 'could': 285,
 'within': 283,
 'life': 281,
 'order': 280,
 'part': 280,
 'name': 278,
 'history': 278,
 'type': 277,
 'average': 277,
 'europe': 276,
 'another': 275,
 'points': 272,
 'must': 271,
 'often'

In [150]:
vocab_count = dict(sorted(vocab_count.items(), key=lambda item: item[1]))
vocab_count

{'physicsthe': 1,
 'emissivity': 1,
 'rigel': 1,
 'orion': 1,
 'radiates': 1,
 'morefollow': 1,
 '1100k': 1,
 'msource': 1,
 'commentschmiso': 1,
 'luminosity': 1,
 '4pisigma': 1,
 'ago4': 1,
 'plugging': 1,
 'school-age': 1,
 'developmentdevelopmental': 1,
 'child8-year-olds': 1,
 'worldsby': 1,
 'forman': 1,
 'mdupdated': 1,
 '2018share': 1,
 'emailprinteight-year-olds': 1,
 'hobbies': 1,
 'teamwork': 1,
 'routinesfabrice': 1,
 'lerouge/getty': 1,
 'figuring': 1,
 'self-care': 1,
 'developmentimage': 1,
 'source/getty': 1,
 'imagesfor': 1,
 'puberty': 1,
 'biking': 1,
 'non-sports-related': 1,
 'developmentjohn': 1,
 'howard/getty': 1,
 'self-identity': 1,
 'flip-flopping': 1,
 'self-confidence': 1,
 'empathy': 1,
 'developmenttom': 1,
 'merton/getty': 1,
 'eight-year-olds': 1,
 'critically': 1,
 'developmentchristopher': 1,
 'futcher/getty': 1,
 'imagesthis': 1,
 'quirks': 1,
 'pediatrician': 1,
 'verywellyour': 1,
 '8-10-year-olds': 1,
 'chaplin': 1,
 'aldao': 1,
 'meta-analytic': 

In [147]:
len(vocab_count)

52645

In [148]:
vocabulary

['',
 'underestimate',
 'piece',
 'rigid',
 'gradually',
 'repairhow',
 'gearworkout',
 'counterknowledge',
 'fghn2866ppfrigidaire',
 'lavender',
 'ratesdescription',
 'l',
 'sternocleidomastoid',
 'james',
 'semester-based',
 'code2011',
 'marxism',
 'used51',
 'waterq',
 'universitytommy',
 '195°',
 'upgrades',
 'debonair',
 'anarchism',
 'lips',
 'pocket',
 'belize',
 'over-reaches',
 'vindicating',
 'shoeshiking',
 'twinsanother',
 'hb',
 'top-level',
 'definition1',
 'españold',
 'neill',
 'science301',
 'sensed',
 'jepson',
 'kingdoms',
 'help11',
 'motorsport',
 'circularcrusty',
 'instead',
 'nursery',
 '2011-01-05',
 'death–for',
 'elextroencephalogramwhat',
 'federally',
 'hymn',
 'milkfat',
 'fall-bloomers',
 'forestall',
 '0-11',
 'imagessupport',
 'glad',
 'gangsters',
 'push-pull',
 'hitranslator',
 'quail',
 'elections',
 'chair',
 'falleth',
 'tuma',
 'yellowproduct',
 'sleuthbuffaloa',
 'kislevhanukkah',
 'gawker',
 'tjm2',
 'dickens',
 'orphus',
 'iridium-bearing',
 '

In [149]:
len(vocabulary)

52645

In [135]:
tfidf_tran.shape

(496, 52679)

In [136]:
tfidf.get_feature_names()[:20]

['',
 'underestimate',
 'piece',
 'rigid',
 'gradually',
 'repairhow',
 'gearworkout',
 'counterknowledge',
 'fghn2866ppfrigidaire',
 'lavender',
 'ratesdescription',
 'l',
 'sternocleidomastoid',
 'james',
 'semester-based',
 'code2011',
 'marxism',
 'used51',
 'waterq',
 'universitytommy']

In [166]:
def gen_vector_T(tokens):
    Q = np.zeros((len(vocabulary)))    
    x= tfidf.transform(tokens)
    #print(tokens[0].split(','))
    for token in tokens[0].split(','):
        #print(token)
        try:
            ind = vocabulary.index(token)
            Q[ind]  = x[0, tfidf.vocabulary_[token]]
        except:
            pass
    return Q

In [152]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [167]:
def cosine_similarity_T(k, query):
    preprocessed_query = re.sub("\W+", " ", query).strip()
    tokens = word_tokenize(str(preprocessed_query))
    q_df = pd.DataFrame(columns=['q_clean'])
    q_df.loc[0,'q_clean'] = ','.join(tokens)
    
    d_cosines = []
    
    query_vector = gen_vector_T(q_df['q_clean'])
    for d in tfidf_tran.A:
        d_cosines.append(cosine_sim(query_vector, d))
                    
    out = np.array(d_cosines).argsort()[-k:][::-1]
    #print("")
    d_cosines.sort()
    a = pd.DataFrame()
    for i,index in enumerate(out):
        a.loc[i,'index'] = str(index)
        a.loc[i,'Url'] = data_train_processed['Url'][index]
        a.loc[i,'Body'] = data_train_processed['Body'][index]
    for j,simScore in enumerate(d_cosines[-k:][::-1]):
        a.loc[j,'Score'] = simScore
    return a

In [168]:
cosine_similarity_T(10, 'computer science')

Unnamed: 0,index,Url,Body,Score
0,113,https://www.sciencelearn.org.nz/resources/469-...,explore topics explore concepts teacher pldsig...,0.103609
1,269,http://www.ece.k-state.edu/graduate/grapositio...,gra positionsthe department of electrical and ...,0.091667
2,370,https://quizlet.com/38941450/bios-flash-cards/,33 terms murderangel plusbioslearn flashcards ...,0.054857
3,102,http://techotv.com/download-upgrade-install-mi...,"posted on november 28, 2012 by saurabhdownload...",0.048499
4,426,http://answers.microsoft.com/en-us/windows/for...,"sd seth d asked onnovember 16, 2009q: network ...",0.04792
5,418,http://www.pnas.org/content/107/17/7823.full,"pre-columbian agricultural landscapes, ecosyst...",0.043048
6,491,http://www.itjobswatch.co.uk/jobs/uk/software%...,permanent it jobs contract it jobsperiod 6 mon...,0.040352
7,13,http://www.answers.com/Q/What_political_party_...,why do people belong to political parties?it b...,0.040188
8,9,https://answers.yahoo.com/question/index?qid=2...,health other - healthi have trouble swallowing...,0.039698
9,428,http://www.123helpme.com/view.asp?id=77294,definition of military disciplinelength: 894 w...,0.038377


In [162]:
data_train['Body'][113]

nan