some observations    
===============

**vocabulary size**    
tweet tokenizer / no preprocessing = 313803    
tweet tokenizer / with cleaning method = 260580    
tweet tokenizer / with cleaning method, reduce length = 240963

## Create the Vocab Set

In [10]:
import pickle
import pandas as pd
import math
from collections import Counter
import sys
import csv
import string
import re
import emoji
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from itertools import chain
from indexer import TwitterIQ

We'll assign some variables for our `clean` function to use. We're doing so outside of the function itself so that they needn't be defined every time we want to use the clean method.

In [14]:
tokenizer = TweetTokenizer(reduce_len=True)
unicodes2remove = [
    # all kinds of quotes
    u'\u2018', u'\u2019', u'\u201a', u'\u201b', u'\u201c', \
    u'\u201d', u'\u201e', u'\u201f', u'\u2014',
    # all kinds of hyphens
    u'\u002d', u'\u058a', u'\u05be', u'\u1400', u'\u1806', \
    u'\u2010', u'\u2011', u'\u2012', u'\u2013',
    u'\u2014', u'\u2015', u'\u2e17', u'\u2e1a', u'\u2e3a', \
    u'\u2e3b', u'\u2e40', u'\u301c', u'\u3030',
    u'\u30a0', u'\ufe31', u'\ufe32', u'\ufe58', u'\ufe63', \
    u'\uff0d', u'\u00b4'
]

punctuation = string.punctuation.replace('@', '') + ''.join(unicodes2remove)
# regex to match urls (taken from the web)
urlregex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
                           '|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# keep @ to be able to recognize usernames
punctuation = string.punctuation.replace('@', '') + ''.join(unicodes2remove)
punctuation = punctuation.replace('#', '')
# a bunch of emoji unicodes
emojis = ''.join(emoji.UNICODE_EMOJI)
emojis = emojis.replace('#', '')
# combined english and german stop words
stop_words = set(stopwords.words('english') + stopwords.words('german'))

In [15]:
def clean(s):
    """
    Normalizes a string (tweet) by removing the urls, punctuation, digits,
    emojis, by putting everything to lowercase and removing the
    stop words. Tokenization is performed aswell.

    :param s the string (tweet) to clean
    :return: returns a list of cleaned tokens
    """
    s = s.replace('[NEWLINE]', '')
    s = s.replace('…', '...')
    s = urlregex.sub('', s).strip()
    s = s.translate(str.maketrans('', '', punctuation + string.digits \
                                  + emojis)).strip()
    s = s.lower()
    s = tokenizer.tokenize(s)
    #s = [w for w in s if w not in stop_words]
    return s

In the next few cells, we'll finish setting everything up. In order:

* `inv_index` is an inverted index (from past assignments) so that we can quickly get terms' document frequencies
* `df` is a Pandas DataFrame containing all the tweets, their authors, IDs, and other info
* `tweets` is a Pandas Series containing the tweets
* `tokenized` is a Pandas Series of lists containing the results of the above `clean` method, so lists of tokenized terms


In [11]:
inv_index = TwitterIQ('tweets.csv')

In [12]:
df = pd.read_csv('tweets.csv', sep='\t', usecols=[1,4], names=['id', 'tweet'])

In [13]:
tweets = df['tweet']

In [16]:
tokenized = tweets.apply(clean)

## TF-IDF

In [35]:
def compute_tfidf(term, doc, tweets):
    counts = Counter(doc)
    term_freq = inv_index[term].freq
    return (1 + math.log10(tf) * (math.log10(idf + 1)))

In [23]:
def tfidf(doc1, doc2, tweets):
    """
    This function calculates the tf-idf scores for two documents and returns them in a DataFrame
    
    doc1,doc2 -> list of tokens
    tweets: a collection of lists of tokenized tweets
    """
    intersect = set(doc1).intersection(set(doc2))
    if not intersect:
        return (pd.DataFrame({}), pd.DataFrame({}))
        
    c1 = Counter(doc1)#token count
    c2 = Counter(doc2)#token count
    d1 = {}
    d2 = {}
    for t in set(doc1):
        if t not in d1.keys() and t in intersect:
            d1[t] = compute_tfidf(t)
    for t in set(doc2):
        if t not in d2.keys() and t in intersect:
            #tf idf
            tf = c2[t]
            df = inv_index[t].freq
            if df == 0:
                continue
            else:
                idf = len(tweets) / df
                tfidf = (1 + math.log10(tf)) * (math.log10(idf))
            d2[t] = tfidf
            
    df_tfidf = pd.DataFrame().from_dict(d1, orient='index')
    df_tfidf[1] = pd.DataFrame().from_dict(d2, orient='index')
    return df_tfidf

In [19]:
def cosine(vec1, vec2):
    if len(vec1) == 0 or len(vec2) == 0:
        return 0
    nominator = 0
    denominator = 0
    vec1_length = 0
    vec2_length = 0
    for v1,v2 in zip(vec1,vec2):
        nominator += v1*v2
        vec1_length += v1*v1
        vec2_length += v2*v2
    vec1_length = math.sqrt(vec1_length)
    vec2_length = math.sqrt(vec2_length)
    denominator = vec1_length * vec2_length 
    return nominator / denominator

In [32]:
def top_x(x, q, tweets):
    """
    x: top x number
    q: query to compare to
    tweets: all the tweets -> assumed to be cleaned/tokenized
    """
    q = clean(q)
    for tweet in tweets[:100]:
        df_tfidf = tfidf(q, tweet, tweets)
        cos = cosine(df_tfidf[0], df_tfidf[1])
        print(cos, df_tfidf[1])

### Examples
Execute the example docs you want to compare.

#### 1

In [21]:
#@Brandon: its fun to play around with the those documents
doc1 = "this is a random tweet Hausarzt Affe Affe Affe".split()
doc2 = "this is random a a a a a tweet Hausarzt Hausarzt Hausarzt Hausarzt Hausarzt I think bla foo Affe".split()

#### 2

In [None]:
doc1 = "i don't think society understands how hurtful it is when this kind of behavior by the POTUS becomes an accepted form of political discourse".split()
doc2 = 'And it is grievously hurtful to our society when vilification becomes an accepted form of political debate and negative campaigning becomes a full-time occupation.'.split()

#### 3

In [None]:
doc1 = "He was a sk8er boi, she said see you later boy".split()
doc2 = "I'm with the sk8er boi, I said see you later boy".split()

#### 4

In [None]:
doc1 = 'and she told me Ich sitze noch in der Küche'.split()
doc2 = 'Was meinst du mit sitting here with nachos'.split()

### Compute scores

In [None]:
df_tfidf = tfidf(doc1, doc2, V, tweets)
df_tfidf

In [22]:
cosine(df_tfidf[0], df_tfidf[1])

0.9999999999999999

In [121]:
t = "@knakatani @ChikonJugular @joofford @SteveBlogs11 https://t.co/WHtaRYGNSY says lifetime risk of cervical cancer in Japan is 1 in 100.  That means HPV is endemic in Japan, and screening is not working well."
t2 = "Heute war ich beim Hausarzt und er hat festgestellt, dass es mir gut geht Haus bin ist sind uns wir die der das dem den Essen Kopf Krebs Krankheit love you the we i hate Microsoft Amazon gekauft verkauft essen trinken."
top_x(0, t3, tokenized, V)

['krankheit', 'zur', 'navigation', 'springenzur', 'suche', 'springen', 'das', 'kranke', 'mädchen', 'den', 'syge', 'pige', 'von', 'michael', 'ancher', 'krankheit', 'ist', 'ein', 'zustand', 'verminderter', 'leistungsfähigkeit', 'der', 'auf', 'funktionsstörungen', 'von', 'einem', 'oder', 'mehreren', 'organen', 'der', 'psyche', 'oder', 'des', 'gesamten', 'organismus', 'beruht', 'und', 'zurückgeht', 'diese', 'störungen', 'werden', 'ihrerseits', 'durch', 'strukturelle', 'veränderungen', 'von', 'zellen', 'und', 'geweben', 'hervorgerufen', 'die', 'lehre', 'von', 'den', 'krankheiten', 'ist', 'die', 'pathologie', 'während', 'die', 'nosologie', 'sich', 'mit', 'der', 'systematischen', 'einteilung', 'von', 'krankheiten', 'beschäftigt', 'inhaltsverzeichnis', 'wortherkunft', 'definition', 'krankheit', 'und', 'gesundheit', 'medizin', 'recht', 'geschichtliche', 'und', 'kulturelle', 'aspekte', 'typische', 'reaktionen', 'bei', 'schwerer', 'krankheit', 'systematik', 'ursachen', 'und', 'verlauf', 'krankhei

0.9632592849164086
{'a'}
1.0
{'können', 'ist', 'nicht', 'wenn', 'hin', 'am', 'für', 'zum', 'sein', 'von', 'meist', 'einen', 'man', 'schwer', 'die', 'bis', 'kann', 'es', 'ohne', 'auch'}
0.9518501587765521
set()
0
{'the', 'major', 'rate', 'in'}
0.9425502851772796
{'by', 'new'}
0.9828325155298657
{'eines'}
1.0
{'of', 'and'}
0.9999999999999999
set()
0
set()
0
{'of', 'probleme', 'the', 'a', 'and'}
0.9714935747644035
{'die', 'in'}
0.9992693586482712
{'a', 'in', 'and', 'obesity', 'by'}
0.9454424810486781
{'obesity', 'and'}
0.9999999999999998
{'obesity'}
1.0
{'bad', 'of', 'the', 'a', 'obesity'}
0.9903950151812544
{'online'}
1.0
set()
0
{'and'}
1.0
set()
0
set()
0
{'videos'}
1.0
{'wieder', 'der', 'auf', 'aber', 'das', 'am', 'ist', 'nicht', 'eine', 'einem', 'den', 'in', 'und', 'schon', 'sie', 'kann'}
0.9561528638849192
{'u'}
1.0
set()
0
set()
0
{'the'}
1.0
{'of', 'the', 'a', 'on', 'in', 'who'}
0.9621066462055818
{'selbst', 'ihrem', 'ihre', 'bei', 'in', 'sich', 'die', 'sie', 'immer', 'um', 'es', 

KeyboardInterrupt: 

In [33]:
top_x(100, 'trump', tokenized)

0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
1.0 trump    3.293
Name: 1, dtype: float64
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: []
0 Empty DataFrame
Columns: []
Index: 