# Code to gather ngrams from Google Books

### Data source
The data is taken from [Google's Ngrams datasets](http://storage.googleapis.com/books/ngrams/books/datasetsv3.html).

The ngrams are stored as large compressed files that are sorted and split by size.

The format of each line is: ngram TAB year,match_count,volume_count TAB year,match_count,volume_count TAB ... (one comma-separated year,match_count,volume_count record per year).

Each file is around 400MB - 500MB compressed and around 1400MB extracted for the 2grams. As n changes, the file size and the number of files may change: n = 1 has 24, n = 2 has 489, n = 3 has 6881, n = 4 has 6668, and n = 5 has 19423 files.

### Extraction method 

I used Python to get the data. I processed the data one file at a time, reading it through a buffer without saving the file to disk. For each file I selected the ngrams without part-of-speech tags and summed their total occurrences from 1990 up to the dataset's creation in February 2020. I then used a priority queue to keep the top 1000 items and saved them to a file. Below is an example of a randomly chosen intermediate file for the bigrams, showing the occurrence count followed by the ngram:

An intermediate file for bigrams starting with "Pr":
    (14561073, 'Proceedings of'),
    (10833244, 'Prime Minister'),
    (10588830, 'Proc .'),
    (7036010, 'Prior to'),
    (6596211, 'Professor of'),
    (5769636, 'Program ,'),
    (5600473, 'Princeton University'),
    (4533204, 'Program .'),

Note that I skipped every ngram containing the `_` character, because the data includes part-of-speech tags (which are marked with `_`) that I am not interested in. As a side effect, ngrams that contain a literal `_` are skipped as well, but those won't be among the common ngrams for my use case.

### Merging method

I then went through each intermediate file and merged the results using two priority queues, each with a size of 10,000: one for ngrams without symbols or special characters and one for ngrams with them. The merged results were saved to files.

A 2gram file without symbols or special characters:
    (5300515095, 'of the')
    (2950334073, 'in the')
    (1939208961, 'to the')
    (1250796674, 'and the')
    (1191540665, 'on the')
    (949312195, 'for the')
    (890209782, 'to be')
    (786258968, 'of a')
 
### The data

The data files are too large, so I only included the 1gram and 2gram data with no symbols. If you want access to the raw data, open an issue and I'll find another way to share it.
 
### Future work

Right now, I have only collected ngrams for n = 1 and 2. It takes a lot of time to download and unzip the files. In the future I would like to use a cloud provider and create several instances at once to speed the job up and make it feasible.



In [146]:
import io
import gzip
import requests
import pickle
import os
import glob
import heapq
import sys
from collections import deque
from lxml import html

class PriorityQueue:
    '''Bounded queue that retains only the ``max_size`` highest-priority entries.

    Entries are kept as ``(priority, item)`` tuples in a ``heapq`` min-heap, so
    the weakest retained entry always sits at ``queue[0]`` and is the one that
    gets evicted when a stronger entry arrives.
    '''

    def __init__(self, max_size):
        '''Create an empty queue that holds at most ``max_size`` entries.'''
        self.queue = []
        self.max_size = max_size

    def push(self, priority, item):
        '''Offer ``(priority, item)``; keep it only if it ranks in the top ``max_size``.

        While there is spare capacity the entry is always stored. Once the
        queue is full it replaces the current minimum only when its priority is
        strictly greater, so on equal priorities the earlier entry is kept.
        '''
        if self.max_size <= 0:
            # A queue with no capacity retains nothing. Without this guard the
            # peek at queue[0] below would raise IndexError on the empty heap.
            return
        entry = (priority, item)
        if len(self.queue) < self.max_size:
            heapq.heappush(self.queue, entry)
        elif priority > self.queue[0][0]:
            heapq.heapreplace(self.queue, entry)

    def pop(self):
        '''Remove and return the smallest ``(priority, item)`` tuple.

        Raises:
            IndexError: If the queue is empty.
        '''
        return heapq.heappop(self.queue)

    def heapsort(self):
        '''Drain the queue and return its entries as a deque, largest first.

        The queue is empty afterwards.
        '''
        ordered = deque()
        # Pops arrive smallest-first; prepending each one leaves the largest
        # entry at the front, which makes merging "max" data easier.
        while self.queue:
            ordered.appendleft(self.pop())
        return ordered
    
def process(pq, lines, n):
    '''Sum the 1990-and-later match counts of each untagged ngram line and push them onto ``pq``.

    Args:
        pq: Object with a ``push(priority, item)`` method; it receives
            ``(total_occurrence, ngram)`` for every accepted line.
        lines: Iterable of UTF-8 encoded byte lines of the form
            ``ngram TAB year,match_count,volume_count TAB ...``.
        n: Number of tokens per ngram. Kept for interface compatibility: the
            ngram is taken as the whole first tab-separated field rather than
            the first ``n`` whitespace-separated tokens.

    Raises:
        ValueError: If a non-empty year record is not of the form
            ``year,match_count,volume_count`` with an integer year and count.
    '''
    for raw_line in lines:
        text = raw_line.decode('utf-8').rstrip('\r\n')
        if not text:
            continue
        # Fields are separated by tabs; only the tokens inside the ngram are
        # separated by spaces. Splitting on tabs keeps a token that contains
        # unusual whitespace (e.g. a non-breaking space) from shifting the
        # fields and breaking the record parsing below.
        ngram, *records = text.split('\t')
        if '_' in ngram:
            # Side effect: drops ngrams with a literal _ as well as _POS tags.
            continue
        total_occurrence = 0
        for record in records:
            if not record:
                continue
            year, match_count, _ = record.split(',')
            if int(year) >= 1990:  # newer content only
                total_occurrence += int(match_count)
        pq.push(total_occurrence, ngram)

def get_ngram_urls(n):
    '''Return the data-file links listed on the English ``n``-gram export page.

    Args:
        n: The ngram order, interpolated into the export page URL.

    Returns:
        The ``href`` values found on the page, minus the first two and the
        last five, which the original author identified as header/footer links.

    Raises:
        requests.HTTPError: If the export page responds with an error status.
    '''
    # The 20200217 path segment is the dataset's generation date (17 Feb 2020).
    url = f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/eng-{n}-ngrams_exports.html'
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of scraping links out of an error
    # page, which would silently produce a wrong (usually empty) URL list.
    response.raise_for_status()
    webpage = html.fromstring(response.content)
    links = webpage.xpath('//a/@href')
    return links[2:-5]  # header/footer links

def get_file(url):
    '''Download a gzip-compressed file into memory and return a reader over it.

    Args:
        url: Location of the gzip file.

    Returns:
        A ``gzip.GzipFile`` wrapping an in-memory buffer of the response body;
        reading from it yields the decompressed bytes.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    '''
    with requests.get(url, stream=True) as response:
        # stream=True defers the body download, so a bad status is detected
        # here before the payload is pulled in by .content below.
        response.raise_for_status()
        buffer_data = io.BytesIO(response.content)
    return gzip.GzipFile(fileobj=buffer_data)

def write_file(file_name, data):
    '''Pickle ``data`` into the file at ``file_name``, overwriting any existing content.'''
    with open(file_name, mode='wb') as out_file:
        pickle.Pickler(out_file).dump(data)
        
def get_files(n):
    '''Return the per-source intermediate files for ``n``-grams in the current directory.

    Only names of the form ``{n}gram-<index>.txt`` are matched, which is the
    naming used by ``extract_ngrams_to_files``. A looser ``{n}gram*.txt``
    pattern would also pick up the merged ``{n}grams_*.txt`` outputs and the
    plain-text ``{n}gram.txt``, so re-running the merge would either duplicate
    entries or fail to unpickle.

    Returns:
        The matching file names, sorted so the merge order is deterministic.
    '''
    return sorted(glob.glob(f'{n}gram-*.txt'))

def read_file(file_name):
    '''Load and return the pickled object stored in ``file_name``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this pipeline.
    '''
    with open(file_name, mode='rb') as in_file:
        return pickle.Unpickler(in_file).load()

def has_symbols(s):
    '''Return True unless ``s``, ignoring spaces, consists solely of alphabetic characters.

    A string that is empty once its spaces are removed counts as having symbols.
    '''
    without_spaces = ''.join(s.split(' '))
    if without_spaces.isalpha():
        return False
    return True

def get_most_frequent_ngram(n):
    '''Merge every intermediate file for ``n`` into two top-10000 rankings.

    Each ``(total_occurrence, ngram)`` entry is routed by ``has_symbols``:
    ngrams made only of letters and spaces go to one bounded queue, all
    others to a second one.

    Returns:
        A ``(data, data_symbols)`` pair holding the drained symbol-free queue
        and the drained symbol queue, each as produced by ``heapsort``.
    '''
    plain_queue = PriorityQueue(10000)
    symbol_queue = PriorityQueue(10000)
    for path in get_files(n):
        for count, ngram in read_file(path):
            target = symbol_queue if has_symbols(ngram) else plain_queue
            target.push(count, ngram)

    return plain_queue.heapsort(), symbol_queue.heapsort()

def extract_ngrams_to_files(n):
    '''Write one file per source URL holding that source's top 1000 ngrams.

    Each source is read in batches of lines (``readlines`` with a size hint),
    every batch is fed to ``process``, and the drained queue is saved as
    ``{n}gram-{index}.txt``, where ``index`` is the URL's position in the list.
    '''
    chunk_hint = 20000
    for position, source_url in enumerate(get_ngram_urls(n)):
        top_ngrams = PriorityQueue(1000)
        reader = get_file(source_url)
        # readlines() returns an empty list once the stream is exhausted,
        # which is the sentinel that ends the iteration.
        for batch in iter(lambda: reader.readlines(chunk_hint), []):
            process(top_ngrams, batch, n)

        write_file(f'{n}gram-{position}.txt', top_ngrams.heapsort())
        
def merge_ngrams_from_files(n):
    '''Merge the intermediate files for ``n`` and save the top 10000 ngrams without and with symbols.

    The symbol-free ranking is written to ``{n}grams_no_symbols.txt`` and the
    ranking of ngrams containing symbols to ``{n}grams_all_symbols.txt``.
    '''
    plain, with_symbols = get_most_frequent_ngram(n)
    outputs = (
        (f'{n}grams_no_symbols.txt', plain),
        (f'{n}grams_all_symbols.txt', with_symbols),
    )
    for path, ranking in outputs:
        write_file(path, ranking)

def main(n):
    '''Run the full pipeline for ``n``-grams: per-source extraction, then the merge.'''
    for stage in (extract_ngrams_to_files, merge_ngrams_from_files):
        stage(n)


In [18]:
# If you want to get just the ngrams in sorted order
import pickle
def read_file(file_name):
    '''Return the object unpickled from ``file_name``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    you created yourself.
    '''
    with open(file_name, 'rb') as pickled:
        return pickle.load(pickled)

# Load the merged symbol-free 1grams and keep only the ngram text, dropping
# the occurrence counts while preserving the stored order.
data = read_file('1grams_no_symbols.txt')
ngrams_data = [ngram for _, ngram in data]

# Write one ngram per line, keeping only those whose highest code point does
# not exceed 'z' (122), i.e. dropping words with accented or other non-ASCII
# letters.
with open('1gram.txt', 'w') as f:
    for line in ngrams_data:
        if not line:
            # max() of an empty string would raise ValueError.
            continue
        weird_symbol = max(ord(char) for char in line)
        # The bound is inclusive: a strict `< 122` would drop every word that
        # contains the letter 'z' itself.
        if weird_symbol <= ord('z'):
            f.write(line + '\n')