In [34]:
import sqlite3
import pandas as pd
import datetime
from shutil import copyfile
import spacy
import numpy as np
nlp = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import TfidfVectorizer

### Retrieve browser history as list of strings (page titles)

In [2]:
# Path to the Chrome history file under the user's Library folder.
# NOTE: `user` is hard-coded; change it to your own account name before running.
user = 'selinaveng'
history_path = '/Users/{}/Library/Application Support/Google/Chrome/Default/History'.format(user)

# Work on a copy, since the original file is not accessible while Chrome is open
history_copy_path = history_path + '_copy'
copyfile(history_path, history_copy_path)

# Connect to the copied database and load the columns we need into a pandas
# data frame. The connection is closed as soon as the query has run (even if
# the query fails) so no open handle on the copied file is left behind.
con = sqlite3.connect(history_copy_path)
try:
    df = pd.read_sql_query("select title, id, last_visit_time from urls", con)
finally:
    con.close()

In [3]:
# Preview the first rows of the history table (title, id, last_visit_time)
df.head()

Unnamed: 0,title,id,last_visit_time
0,HBO Nordic | Se hundredevis af serier og film ...,96,13231616523450264
1,"DR | TV, radio, nyheder og meget mere",109,13231844012738778
2,Michael sendte dig en besked,1172,13231860218027893
3,YouTube,1563,13231861124004930
4,Google Maps,1608,13231688537498186


In [4]:
# Convert the time for last visited
def convertChromeTime(ms):
    """Convert a Chrome timestamp into a naive ``datetime`` object.

    Google Chrome doesn't use the Unix epoch: the stored value is the number
    of microseconds elapsed since 1601-01-01 00:00:00.

    Parameters
    ----------
    ms : int or integer-like
        Microseconds since 1601-01-01. Despite the parameter name, the unit
        is microseconds, not milliseconds. The value is coerced with
        ``int()``, so integer-like objects that are not Python ``int``
        instances are accepted as well.

    Returns
    -------
    datetime.datetime
        1601-01-01 plus ``ms`` microseconds; no tzinfo is attached.
    """
    # timedelta can reject integer types that are not plain Python ints,
    # so normalise the value before building the offset.
    microseconds = int(ms)
    chrome_epoch = datetime.datetime(1601, 1, 1)
    return chrome_epoch + datetime.timedelta(microseconds=microseconds)

# Add a readable `datetime` column derived from `last_visit_time`, then order
# the rows newest-first so the most recently visited pages come first
df = (
    df
    .assign(datetime=df['last_visit_time'].apply(convertChromeTime))
    .sort_values(by='datetime', ascending=False)
)

In [5]:
# Preview the first rows again, now including the `datetime` column
df.head()

Unnamed: 0,title,id,last_visit_time,datetime
2070,sklearn.feature_extraction.text.TfidfVectorize...,32410,13231871165473594,2020-04-20 15:46:05.473594
2071,sklearn.feature_extraction.text.TfidfVectorize...,32411,13231871165473594,2020-04-20 15:46:05.473594
15235,sklearn tfidfvectorizer - Google Search,45599,13231871164129830,2020-04-20 15:46:04.129830
15166,similarity - Is there an algorithm to identify...,45530,13231871148981848,2020-04-20 15:45:48.981848
15234,find most salient word in sentece - Google Search,45598,13231871144731924,2020-04-20 15:45:44.731924


In [6]:
# Page titles as a plain Python list, in the data frame's current row order
titles = df['title'].tolist()

In [8]:
# Peek at the first five titles
titles[:5]

['sklearn.feature_extraction.text.TfidfVectorizer — scikit-learn 0.22.2 documentation',
 'sklearn.feature_extraction.text.TfidfVectorizer — scikit-learn 0.22.2 documentation',
 'sklearn tfidfvectorizer - Google Search',
 'similarity - Is there an algorithm to identify the most salient word from a given set of words? - Stack Overflow',
 'find most salient word in sentece - Google Search']

### Find the most salient words in each title string

In [10]:
# Stop words from the loaded spaCy pipeline's defaults, materialised as a list
stopwords = [*nlp.Defaults.stop_words]

In [19]:
def get_best_word(title):
    """Return the first whitespace-separated word of ``title``.

    NOTE(review): despite the name, no salience scoring happens here; the
    function simply picks the first word of the title.

    Parameters
    ----------
    title : str or None
        A page title. ``None``, empty and whitespace-only titles are
        tolerated.

    Returns
    -------
    str
        The first word of the title, or ``''`` when the title contains no
        words at all.
    """
    # Missing or empty titles have no word to offer
    if not title:
        return ''

    # split() without an argument ignores leading/trailing whitespace and
    # collapses runs of whitespace, so a title that starts with a space does
    # not yield an empty "word" the way split(' ') would.
    words = title.split()

    return words[0] if words else ''

In [20]:
# Try the helper on the first title in the list
get_best_word(titles[0])

'How'

In [39]:
# Start with an empty list of search words
search_words = list()

In [40]:
# Collect distinct words from the titles, in title order, until the list holds
# MAX_SEARCH_WORDS entries.
MAX_SEARCH_WORDS = 20

for title in titles:

    # Check the limit before adding anything. Using >= (rather than == after
    # the append) also stops the loop when the list is already full or
    # over-full, e.g. when this cell is re-run without resetting the list.
    if len(search_words) >= MAX_SEARCH_WORDS:
        break

    # Titles can be empty or missing; there is no word to extract from those
    if not title:
        continue

    word = get_best_word(title)

    # Skip empty results and words that were already collected
    if word and word not in search_words:
        search_words.append(word)

In [41]:
# Show the collected search words
search_words

['How',
 'copy',
 'cross',
 'get',
 'Week10-SQL-for-Chrome-History',
 'chrome',
 'Fetch',
 'Untitled',
 'Home',
 'Opening',
 'ipython',
 'IOPub',
 'sqlite',
 'Analyze',
 'python',
 'access',
 'chrome.history',
 'open',
 'Playing',
 'parse']