In [76]:
import pandas as pd
import string
from math import log10
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

In [34]:
COLUMN_LABELS = ['Game Name', 'Class', 'Title', 'Review Text']
train = pd.read_csv('games-train.csv', sep='\t', names=COLUMN_LABELS)
test = pd.read_csv('games-test.csv', sep='\t', names=COLUMN_LABELS)

In [43]:
tokenize = TweetTokenizer(reduce_len=True).tokenize

In [44]:
def preprocess(doc):
    doc = str(doc)
    doc = doc.lower()
    doc = doc.translate(str.maketrans('', '', string.punctuation)).strip()
    doc = tokenize(doc)
    doc = [token for token in doc if token not in stopwords.words('german')]
    return doc

In [31]:
def estimate_parameters(docs, collection_size): # docs = docs belonging to one class
    p_y = len(docs) / collection_size
    count = Counter()
    for doc in docs:
        count.update(preprocess(doc))
        
    return (p_y, count)

In [110]:
def predict(test_doc, parameters):
    test_doc = preprocess(test_doc)
    r = [None, 1]
    for class_, params in parameters.items():
        tokens_prob = 0
        p_y = params[0]
        counter = params[1]
        print(class_)
        for token in test_doc:
            token_rel_freq = counter[token] / sum(counter.values())
            if token_rel_freq == 0:
                continue
            tokens_prob -= log10(token_rel_freq)
            
        class_prob = log10(p_y) + tokens_prob
        print(class_prob)
        if r[1] > class_prob:
            r[0] = class_
            r[1] = class_prob
            
    return r

In [45]:
good = estimate_parameters(train[train['Class'] == 'gut']['Review Text'], len(train))
bad = estimate_parameters(train[train['Class'] == 'schlecht']['Review Text'], len(train))

In [61]:
params = {'good': good, 'bad': bad}

In [59]:
print(good[0], bad[0])

0.8230904656534169 0.1769095343465831


In [101]:
good[1].most_common(10)

[('spiel', 33871),
 ('cool', 16236),
 ('macht', 13632),
 ('super', 11447),
 ('geil', 9955),
 ('gut', 9503),
 ('einfach', 8996),
 ('spaß', 8329),
 ('echt', 5912),
 ('immer', 4793)]

In [60]:
bad[1].most_common(20)

[('spiel', 9424),
 ('mehr', 6483),
 ('seit', 3275),
 ('bitte', 3263),
 ('immer', 3176),
 ('update', 3092),
 ('mal', 2639),
 ('geht', 2309),
 ('beheben', 2143),
 ('schon', 1973),
 ('sterne', 1882),
 ('ab', 1755),
 ('macht', 1600),
 ('spielen', 1530),
 ('5', 1383),
 ('komme', 1367),
 ('echt', 1287),
 ('rein', 1212),
 ('kommt', 1203),
 ('stürzt', 1183)]

In [109]:
predict('das Spiel stürtzt immer ab. bitte schnell beheben', params)

good
17.614109932971658
bad
14.997678405286155


[None, 1]

In [82]:
test.head()

Unnamed: 0,Game Name,Class,Title,Review Text
0,Farmville 2,schlecht,,"Echt schlecht , immer wen ich versuche zu star..."
1,Die Simpsons,gut,Buchi0202136,Suche noch freunde zum hinzufuegen
2,Die Simpsons,gut,Suchtgefähr :) !!,"Ich find das Spiel gut,man muss nicht permanen..."
3,Die Simpsons,gut,Dauerhafter Spaß...,... durch immer neue Events. Schon 1 1/2 Jahre...
4,Subway Surfers,gut,Great,I like the game but near the last update it st...


In [84]:
test['Review Text'].head().apply(lambda x: predict(x, params))

0    [None, 0]
1    [None, 0]
2    [None, 0]
3    [None, 0]
4    [None, 0]
Name: Review Text, dtype: object