In [76]:
import pandas as pd
import string
from math import log10
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

In [34]:
COLUMN_LABELS = ['Game Name', 'Class', 'Title', 'Review Text']
train = pd.read_csv('games-train.csv', sep='\t', names=COLUMN_LABELS)
test = pd.read_csv('games-test.csv', sep='\t', names=COLUMN_LABELS)

In [43]:
tokenize = TweetTokenizer(reduce_len=True).tokenize

In [44]:
def preprocess(doc):
    doc = str(doc)
    doc = doc.lower()
    doc = doc.translate(str.maketrans('', '', string.punctuation)).strip()
    doc = tokenize(doc)
    doc = [token for token in doc if token not in stopwords.words('german')]
    return doc

In [31]:
def estimate_parameters(docs, collection_size): # docs = docs belonging to one class
    p_y = len(docs) / collection_size
    count = Counter()
    for doc in docs:
        count.update(preprocess(doc))
        
    return (p_y, count)

In [131]:
def predict(test_doc, parameters):
    test_doc = preprocess(test_doc)
    probs = []
    for class_, params in parameters.items():
        tokens_prob = 0
        p_y = params[0]
        counter = params[1]
        for token in test_doc:
            token_rel_freq = counter[token] / sum(counter.values())
            if token_rel_freq == 0:
                continue
            tokens_prob += log10(token_rel_freq)
            
        class_prob = log10(p_y) + tokens_prob
        probs.append((class_, abs(class_prob)))
            
    return min(probs, key=lambda x: x[1])

In [45]:
good = estimate_parameters(train[train['Class'] == 'gut']['Review Text'], len(train))
bad = estimate_parameters(train[train['Class'] == 'schlecht']['Review Text'], len(train))

In [61]:
params = {'good': good, 'bad': bad}

In [59]:
print(good[0], bad[0])

0.8230904656534169 0.1769095343465831


In [101]:
good[1].most_common(10)

[('spiel', 33871),
 ('cool', 16236),
 ('macht', 13632),
 ('super', 11447),
 ('geil', 9955),
 ('gut', 9503),
 ('einfach', 8996),
 ('spaß', 8329),
 ('echt', 5912),
 ('immer', 4793)]

In [143]:
bad[1].most_common(10)

[('spiel', 9424),
 ('mehr', 6483),
 ('seit', 3275),
 ('bitte', 3263),
 ('immer', 3176),
 ('update', 3092),
 ('mal', 2639),
 ('geht', 2309),
 ('beheben', 2143),
 ('schon', 1973)]

In [134]:
predict('tolles Spiel', params)

('good', 3.763923847279274)

In [132]:
predict('das Spiel stürtzt immer ab. bitte schnell beheben', params)

('bad', 16.50217592654949)

In [138]:
test.head()

Unnamed: 0,Game Name,Class,Title,Review Text
0,Farmville 2,schlecht,,"Echt schlecht , immer wen ich versuche zu star..."
1,Die Simpsons,gut,Buchi0202136,Suche noch freunde zum hinzufuegen
2,Die Simpsons,gut,Suchtgefähr :) !!,"Ich find das Spiel gut,man muss nicht permanen..."
3,Die Simpsons,gut,Dauerhafter Spaß...,... durch immer neue Events. Schon 1 1/2 Jahre...
4,Subway Surfers,gut,Great,I like the game but near the last update it st...


In [142]:
%time result = test['Review Text'].apply(lambda x: predict(x, params))

CPU times: user 4min 27s, sys: 14.1 s, total: 4min 41s
Wall time: 4min 42s


In [148]:
result.iloc[:10]

0      (bad, 24.71177312969205)
1      (bad, 7.229119758952452)
2      (bad, 71.83599943884803)
3      (bad, 39.38009345437814)
4     (good, 45.68513157370194)
5      (bad, 8.004199608040208)
6    (good, 1.3246324623491077)
7      (good, 64.6410421311454)
8     (good, 41.80048084638117)
9    (good, 2.5266929912956395)
Name: Review Text, dtype: object