In [31]:
import sys
import csv
import string
import re
import emoji
import nltk
#nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from spell_checker import SpellChecker

In [32]:
class Index:
    """
    This data structure is the value of the indices dictionary.
    """
    def __init__(self, size, pointer2postingsList):
        #size of the postings list
        self.size = size
        #pointer to the head of the postings list
        self.pointer2postingsList = pointer2postingsList

In [33]:
class PostingNode:
    """
    Linked list for the postings list
    """
    def __init__(self, val):
        self.val = val
        self.next = None

In [40]:
class TwitterIR(object):
    """
    Main Class for the information retrieval task.
    """
    __slots__ = 'id2doc', 'tokenizer', 'unicodes2remove', 'indices', \
    'urlregex', 'punctuation', 'emojis', 'stop_words', 'englishdic', 'germandic', \
    'engSpellCheck', 'gerSpellCheck'

    def __init__(self):
        #the original mapping from the id's to the tweets, 
        #which is kept until the end to index the tweets
        self.id2doc = {}
        self.tokenizer = TweetTokenizer()
        #bunch of punctuation unicodes which are not in 'string.punctuation'
        self.unicodes2remove = [
            #all kinds of quotes
            u'\u2018', u'\u2019', u'\u201a', u'\u201b', u'\u201c',\
            u'\u201d', u'\u201e', u'\u201f', u'\u2014',
            #all kinds of hyphens
            u'\u002d', u'\u058a', u'\u05be', u'\u1400', u'\u1806',\
            u'\u2010', u'\u2011', u'\u2012', u'\u2013',
            u'\u2014', u'\u2015', u'\u2e17', u'\u2e1a', u'\u2e3a',\
            u'\u2e3b', u'\u2e40', u'\u301c', u'\u3030',
            u'\u30a0', u'\ufe31', u'\ufe32', u'\ufe58', u'\ufe63',\
            u'\uff0d'
        ]
        #the resulting datastructure which has the tokens as keys
        #and the Index objects as values
        self.indices = {}
        #regex to match urls (taken from the web)
        self.urlregex = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
                                   '|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        #keep @ to be able to recognize usernames
        self.punctuation = string.punctuation.replace('@', '') + \
        ''.join(self.unicodes2remove)
        self.punctuation = self.punctuation.replace('#', '')
        #a bunch of emoji unicodes
        self.emojis = ''.join(emoji.UNICODE_EMOJI)
        self.emojis = self.emojis.replace('#', '')
        #combined english and german stop words
        self.stop_words = set(stopwords.words('english') + stopwords.words('german'))
        self.englishdic = set()
        self.germandic = set()
        

    def initId2doc(self, path):
        """
        Reads the file in and fills the id2doc datastructure.
        :param path: path to the tweets.csv file
        :return:
        """
        with open(path, 'r', encoding='utf-8', newline='') as f:
            r = csv.reader(f, delimiter='\t')
            for line in r:
                self.id2doc[line[1]] = line[4]
        f.close()
    
    def initLanguageDics(self, path_dir, filenameEN='englishdic.sec', filenameGER='germandic.sec'):
        if filenameEN:
            with open(path_dir+filenameEN, 'r', encoding='utf-8') as f:
                for word in f.readlines():
                    self.englishdic.add(word.strip())
                f.close()
            
        if filenameGER:
            with open(path_dir+filenameGER, 'r', encoding='iso-8859-1') as f:
                for word in f.readlines():
                    self.germandic.add(word.strip())
                f.close()
                
    def _initSpellCheck(self, dic_path):
        return SpellChecker(open(dic_path).read().splitlines(),
                fdist={term: node.size for (term, node) in self.indices.items()}).spell_check



    def __len__(self):
        return len(self.indices.keys())
                

    def clean(self, s):
        """
        Normalizes a string (tweet) by removing the urls, punctuation, digits,
        emojis, by putting everything to lowercase and removing the
        stop words. Tokenization is performed aswell.
        
        :param s the string (tweet) to clean
        :return: returns a list of cleaned tokens
        """
        s = self.urlregex.sub('', s).strip()
        s = s.translate(str.maketrans('', '', self.punctuation + string.digits \
                                      + self.emojis)).strip()
        s = ' '.join(s.split())
        s = s.lower()
        s = self.tokenizer.tokenize(s)
        s = [w for w in s if w not in self.stop_words]
        return s

    def index(self, path):
        """
        1) call the method to read the file in
        2) iterate over the original datastructure id2doc which keeps the mapping
        of the tweet ids to the actual tweets and do:
            2a) preprocessing of the tweets
            2b) create a mapping from each token to its postings list (tokens2id)
        3) iterate over the just created mapping of tokens to their respective 
        postings lists (tokens2id) and do:
            3a) calculate the size of the postingslist
            3b) sort the postings list numerically in ascending order
            3c) create a linked list for the postings list
            3d) create the Index object with the size of the postings list and
            the pointer to the postings list - add to the resulting datastructure 
        :param path: the path to the tweets.csv file
        :return:
        """
        self.initId2doc(path)
        tokens2id = {}
        for id,doc in self.id2doc.items():
            doc = self.clean(doc)
            for t in doc:
                if t in tokens2id.keys():
                    tokens2id[t].add(id)
                else:
                    #a set is used to avoid multiple entries of the same tweetID
                    tokens2id[t] = {id}

        for t,ids in tokens2id.items():
            #size of the postings list which belongs to token t
            size = len(ids)
            #sort in ascending order
            ids = sorted(ids)
            #use the first (and smallest) tweetID to be the head node of the 
            #linked list
            node = PostingNode(ids[0])
            #keep reference to the head of the linked list since node variable
            #is going to be overridden
            pointer = node
            for id in ids[1:]:
                #create further list items
                n = PostingNode(id)
                #and append to the linked list
                node.next = n
                #step further
                node = n
            #create the index object with size of the postings list 
            #and a link to the postings list itself
            i = Index(size, pointer)
            self.indices[t] = i
            
        self.engSpellCheck = self._initSpellCheck('englishdic.sec')
        self.gerSpellCheck = self._initSpellCheck('germandic-utf8.sec')
        
    def _detectLanguage(self, context):
        tokens = self.tokenizer.tokenize(context)
        stopsEN = [token for token in tokens if token in stopwords.words('english')]
        stopsDE = [token for token in tokens if token in stopwords.words('german')]
        if len(stopsEN) > len(stopsDE):
            return 'english'
        elif len(stopsDE) > len(stopsEN):
            return 'german'
        else:
            cleaned = self.clean(doc)

            wordsEN = []
            wordsDE = []
            for token in cleaned:
                if token in englishdic:
                    wordsEN.append(token)
                if token in germandic:
                    wordsDE.append(token)
            if len(wordsEN) > len(wordsDE):
                return 'english'
            elif len(wordsDE) > len(wordsEN):
                return 'german'
            else:
                # FIXME
                from random import choice 
                return choice(('english', 'german'))

    def _query(self, term):
        """
        Internal method to query for one term.
        :param: term the word which was queried for 
        :return: returns the Index object of the corresponding query term
        """
        if term in self.indices:
            return self.indices[term]
        return Index(0, PostingNode(''))
    
    def spellCheck(self, term, context):
        return {'english': self.engSpellCheck,
                'german': self.gerSpellCheck}[self._detectLanguage(context)](term)

    def query(self, *arg):
        """
        Query method which can take any number of terms as arguments.
        It uses the internal _query method to get the postings lists for the single 
        terms. It calculates the intersection of all postings lists.
        :param *arg term arguments
        :return: returns a list of tweetIDs which all contain the query terms
        """
        #at this point it's a list of Index objects
        pointers = [self._query(t) for t in arg if t not in self.stop_words]
        #here the Index objects get sorted by the size of the 
        #postings list they point to
        pointers = sorted(pointers, key=lambda i: i.size)
        #here it becomes a list of pointers to the postings lists
        pointers = [i.pointer2postingsList for i in pointers]
        #first pointer
        intersection = pointers[0]
        #step through the pointers
        for p in pointers[1:]:
            #intersection between the new postings list and the so far
            #computed intersection
            intersection = self.intersect(intersection, p)
            #if at any point the intersection is empty there is 
            #no need to continue
            if not intersection:
                return []
        #convert the resulting intersection to a normal list
        rval = []
        pointer = intersection
        while pointer:
            rval.append(pointer.val)
            pointer = pointer.next
        return rval
    
    def intersect(self, pointer1, pointer2):
        """
        Computes the intersection for two postings lists.
        :param pointer1: first postings list
        :param pointer2: second postings list
        :return: returns the intersection 
        """
        #create temporary head node
        node = PostingNode('tmp')
        #keep reference to head node
        rvalpointer = node
        while pointer1 and pointer2:
            val1 = pointer1.val
            val2 = pointer2.val
            #only append to the linked list if the values are equal
            if val1 == val2:
                n = PostingNode(val1)
                node.next = n
                node = n
                pointer1 = pointer1.next
                pointer2 = pointer2.next
            #otherwise the postings list with the smaller value 
            #at the current index moves one forward
            elif val1 > val2:
                pointer2 = pointer2.next
            elif val1 < val2:
                pointer1 = pointer1.next
        #return from the second element on since the first was the temporary one
        return rvalpointer.next



In [35]:
twitterIR = TwitterIR()
twitterIR.index('tweets.csv')

In [36]:
#NEW
#move this into the index function
# twitterIR.initLanguageDics('../assignment2/')

In [37]:
#NEW
#twitterIR.englishdic
#twitterIR.germandic
#list(map(str.lower, twitterIR.englishdic))

In [44]:
#NEW
print(twitterIR.engSpellCheck('hippi'))
print(twitterIR.gerSpellCheck('heit'))
print(twitterIR.spellCheck('hott', 'It was such a hott summer day.'))
print(twitterIR.spellCheck('hott', 'Er hott mir etwas zu trinken.'))

# sc = SpellChecker(open('englishdic.sec', 'r').read().splitlines(),
#                   fdist={term: node.size for (term, node) in twitterIR.indices.items()})
# sc.spell_check('hellr')

hippo
zeit
hot
holt


In [182]:
#NEW
#light weight language detection mainly based on stopwords
stopEN = stopwords.words('english')
stopDE = stopwords.words('german')

englishdic = list(map(str.lower, twitterIR.englishdic))
germandic = list(map(str.lower, twitterIR.germandic))

i = 0
for id,doc in twitterIR.id2doc.items():
    if i == 100:
        sys.exit()
    tokens = twitterIR.tokenizer.tokenize(doc)
    stopsEN = [token for token in tokens if token in stopEN]
    stopsDE = [token for token in tokens if token in stopDE]
    print(doc)
    if len(stopsEN) > len(stopsDE):
        print("--english--")
    elif len(stopsDE) > len(stopsEN):
        print("--deutsch--")
    else:
        #cleaned = twitterIR.clean(doc)
        cleaned = [token.lower() for token in tokens if token[0] not in ['@', '#']]
        print(cleaned)
        print("Hmmm...")
        print(stopsEN)
        print(stopsDE)
        #if (not stopsEN) and (not stopsDE):
        wordsEN = []
        wordsDE = []
        for token in cleaned:
            if token in englishdic:
                wordsEN.append(token)
            if token in germandic:
                wordsDE.append(token)
        if len(wordsEN) > len(wordsDE):
            print('--english--')
        elif len(wordsDE) > len(wordsEN):
            print('--german--')
        else:
            #we still need to handle this case...any ideas?
            print('WTF again?')
            print(wordsEN)
            print(wordsDE)
    #print(stopsEN)
    #print(stopsDE)
    print()
    i+= 1
    #sys.exit()

@knakatani @ChikonJugular @joofford @SteveBlogs11 https://t.co/WHtaRYGNSY says lifetime risk of cervical cancer in Japan is 1 in 100.  That means HPV is endemic in Japan, and screening is not working well.
--english--

@FischerKurt Lady, what´s a tumor? #KippCharts
--english--

@Kings_of_Metal Ohne Diagnoseverdacht ist es nunmal schwer, gerade für einen Hausarzt. Am Blutbild kann man meist nicht viel sehen, gerade wenn man nicht auch die Hormone überprüft. Nicht erklärbare Gewichtseinlagerungen können ja alles sein, von Wasser, Fett, Kind bis hin zum Tumor.
--deutsch--

@GermanLetsPlay @Quentin34013799 @_Lopoopl_ @LeVanni_ @igeloe @Annelle1805 Glückwunsch😄🎉❤
['glückwunsch', '😄', '🎉', '❤']
Hmmm...
[]
[]
--german--

Interesting. ⬆️ pCR rate at major centers. Authors argue with ⬆️ treatment compliance at major centers. We see the same in our database. I think it’s rather due to earlier detection, smaller tumors ➡️ more pCR. Will look deeper into this. #crcsm https://t.co/QfL5g2Z5u9
--engl

--english--

Game <b>Mode</b> Idea https://t.co/GnWBVXNWh8
['game', '<b>', 'mode', '</b>', 'idea', 'https://t.co/gnwbvxnwh8']
Hmmm...
[]
[]
--english--

Wenn eine neue Bundesstraße Lebenspläne zerstört https://t.co/9O2hJXyg4y #Hamburg
--deutsch--

Dein Engagement für Hamburg 2018! https://t.co/4WxqgZ1s9E #Hamburg
--deutsch--

Vermisster Schotte Spürhund sucht am Michel nach Liam https://t.co/45LcHSIas5 #Hamburg
--deutsch--

Digimon Story: Cyber Sleuth – Hacker's Memory – Abgedrehtes Abenteuer zwischen Cyberspace… https://t.co/NBe0hxzuLc
--deutsch--

@Tommy_Potti hey potti I am a big fan since early cs. I am always watching your yt pubg videos and I am just wondering what your pubg video settings are and your FOV cuz your image looks so crispy and clean. pls DM would be so nice :) ty
--english--

@sunchayn_ NSJJSJS ILYSM you are so cute and adorable I’m glad that i met you here 💕and I joined asd last time guess you just didn’t see it
--english--

Funk Rauchwarnmelder - ASD-10QR / Instal

SystemExit: 

In [102]:
#twitterIR.punctuation
#for l in twitterIR.indices.keys():
#    if l[0] == '#':
#        print(l)

In [103]:
#this is the proof that the positings list is sorted correctly!
for l in twitterIR.indices.values():
    
    pointer = l.pointer2postingsList

    val = '0'
    while pointer.next:
        #print(pointer.val)
        if pointer.val > val:
            val = pointer.val
        else:
            print("Got you!")
        pointer = pointer.next


In [33]:
#get top three occurances across the tweets and their corresponding 
#postings list length
#stop words are removed otherwise they would probably
#be the top words
#the result is achieved by sorting the postings list by their size and then taking the corresponding 
#term which the postings list belongs to
l = sorted(twitterIR.indices.items(), key=lambda i: i[1].size, reverse=True)[:3]
for el in l:
    print('token: ',el[0], '\tlength of the postings list: ',el[1].size)

token:  happy 	length of the postings list:  8647
token:  love 	length of the postings list:  8614
token:  mehr 	length of the postings list:  5592


In [18]:
index = twitterIR.query('nacht', 'schlafen')
print(len(index), ' tweets were found for this query')
print('---------------------------------------------')
print()
for id in index:
    print(id)
    print(twitterIR.id2doc[id])
    print()

22  tweets were found for this query
---------------------------------------------

1003063075333922821
einfach befriedigend mitten in der nacht aus dem offenen fenster zu sehen und zu wissen dass gerade alle schlafen, als wär man der einzige mensch den es gerade gibt

1009499399972642816
@the_necrosis @robin_urban Mein Betreuer wusste grob wegen #CRD und #non24h Bescheid. Wann waren die Termine mit ihm? Später vormittag. Das hat mich so gestresst, dass ich die Nacht vorher noch später erst schlafen konnte als eh schon. (War ja ohne #orphanmedi)[NEWLINE]Auch sicher keine #Inklusion.

1011758043841916930
ich kann die letzten tage wieder besser schlafen und es ist so entlastend, mal mehr als vier stunden pro nacht zu schlafen obwohl man eigentlich ausschlafen könnte

1012080647228272640
Die Idee vor dem Schlafen gehen noch einen Fitzek anzufangen, war eher nicht so eine meiner Besten. Das werden sicher richtig schöne (Alp-)Träume heute Nacht👌

960632815414075392
@despacitolea @Polwnn @So