some observations    
===============

**vocabulary size**    
tweet tokenizer / no preprocessing = 313803

## Create the Vocab Set

In [1]:
import pickle
import pandas as pd
import math
from collections import Counter
import sys
import csv
import string
import re
import emoji
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from itertools import chain

In [2]:
df = pd.read_csv('tweets.csv', sep='\t', usecols=[1,4], names=['id', 'tweet'])

In [3]:
tweets = df['tweet']

In [38]:
tokenizer = TweetTokenizer()
unicodes2remove = [
    # all kinds of quotes
    u'\u2018', u'\u2019', u'\u201a', u'\u201b', u'\u201c', \
    u'\u201d', u'\u201e', u'\u201f', u'\u2014',
    # all kinds of hyphens
    u'\u002d', u'\u058a', u'\u05be', u'\u1400', u'\u1806', \
    u'\u2010', u'\u2011', u'\u2012', u'\u2013',
    u'\u2014', u'\u2015', u'\u2e17', u'\u2e1a', u'\u2e3a', \
    u'\u2e3b', u'\u2e40', u'\u301c', u'\u3030',
    u'\u30a0', u'\ufe31', u'\ufe32', u'\ufe58', u'\ufe63', \
    u'\uff0d', u'\u00b4'
]

punctuation = string.punctuation.replace('@', '') + ''.join(unicodes2remove)
# regex to match urls (taken from the web)
urlregex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
                           '|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# keep @ to be able to recognize usernames
punctuation = string.punctuation.replace('@', '') + ''.join(unicodes2remove)
punctuation = punctuation.replace('#', '')
# a bunch of emoji unicodes
emojis = ''.join(emoji.UNICODE_EMOJI)
emojis = emojis.replace('#', '')
# combined english and german stop words
stop_words = set(stopwords.words('english') + stopwords.words('german'))

In [39]:
def clean(s):
    """
    Normalizes a string (tweet) by removing the urls, punctuation, digits,
    emojis, by putting everything to lowercase and removing the
    stop words. Tokenization is performed aswell.

    :param s the string (tweet) to clean
    :return: returns a list of cleaned tokens
    """
    s = ' '.join(s.replace('[NEWLINE]', '').split())
    s = ' '.join(s.replace('…', '...').split())
    s = urlregex.sub('', s).strip()
    s = s.translate(str.maketrans('', '', punctuation + string.digits \
                                  + emojis)).strip()
    s = ' '.join(s.split())
    s = s.lower()
    s = tokenizer.tokenize(s)
    s = [w for w in s if w not in stop_words]
    return s if s else None

In [41]:
tokenized = df['tweet'].apply(lambda x: tokenizer.tokenize(x))

In [42]:
V = set(chain.from_iterable(tokenized))

In [43]:
cleaned = [clean(w) for w in V]
V = [word[0] for word in cleaned if word]

In [49]:
len(V)

260580

## Pickle

In [47]:
#dump
with open('vocabulary_naive.pickle', 'wb') as f:
    pickle.dump(V, f)

In [5]:
#load

f = open('vocabulary_naive.pickle', 'rb')
V = pickle.load(f)
f.close()

## TF-IDF

In [41]:
def tfidf(doc1, doc2, V, tweets):
    """
    d1,d2 -> list of tokens
    """
    #i think we can use this intersection here because words that are in the in one 
    #vector but not in the other result in zero multiplication
    intersect = set(doc1).intersection(set(doc2))
    c1 = Counter(doc1)#token count
    c2 = Counter(doc2)#token count
    d1 = {}
    d2 = {}
    for t in doc1:
        if t not in d1.keys() and t in intersect:
            #tf idf
            tf = c1[t]
            df = len([tweet for tweet in tweets if t in tweet]) #naive string matching
            # FIXME: Length of df could be zero
            idf = len(V) / df
            tfidf = (1 + math.log10(tf)) * (math.log10(idf))
            d1[t] = tfidf
    for t in doc2:
        # FIXME If this statement returns False, `df_tfidf` will be undeclared
        # when we exit this loop
        if t not in d2.keys() and t in intersect:
            #tf idf
            tf = c2[t]
            
            #naive string matching
            #TODO: after tokenization and preprocessing was improved 
            #we should look if the token is in a tweet by comparing single tokens and not 
            #the whole string
            df = len([tweet for tweet in tweets if t in tweet])
            # FIXME: Length of `df` could be zero
            idf = len(V) / df
            tfidf = (1 + math.log10(tf)) * (math.log10(idf))
            d2[t] = tfidf
            
            df_tfidf = pd.DataFrame().from_dict(d1, orient='index')
            df_tfidf[1] = pd.DataFrame().from_dict(d2, orient='index')
    return df_tfidf

In [42]:
def cosine(vec1, vec2):
    nominator = 0
    denominator = 0
    vec1_length = 0
    vec2_length = 0
    for v1,v2 in zip(vec1,vec2):
        nominator += v1*v2
        vec1_length += v1*v1
        vec2_length += v2*v2
    vec1_length = math.sqrt(vec1_length)
    vec2_length = math.sqrt(vec2_length)
    denominator = vec1_length * vec2_length 
    return nominator / denominator

### Examples
Execute the example docs you want to compare.

#### 1

In [32]:
#@Brandon: its fun to play around with the those documents
doc1 = "this is a random tweet Hausarzt Affe Affe Affe".split()
doc2 = "this is random a a a a a tweet Hausarzt Hausarzt Hausarzt Hausarzt Hausarzt I think bla foo Affe".split()

#### 2

In [37]:
doc1 = "i don't think society understands how hurtful it is when this kind of behavior by the POTUS becomes an accepted form of political discourse".split()
doc2 = 'And it is grievously hurtful to our society when vilification becomes an accepted form of political debate and negative campaigning becomes a full-time occupation.'.split()

#### 3

In [34]:
doc1 = "He was a sk8er boi, she said see you later, boy".split()
doc2 = "I'm with the sk8er boi, I said see you later, boy".split()

#### 4

In [43]:
doc1 = 'and she told me Ich sitze noch in der Küche'.split()
doc2 = 'Was meinst du mit sitting here with nachos'.split()

### Compute scores

In [44]:
df_tfidf = tfidf(doc1, doc2, V, tweets)
df_tfidf

ValueError: Wrong number of items passed 0, placement implies 1

In [None]:
cosine(df_tfidf[0], df_tfidf[1])