In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
from nltk.tokenize import word_tokenize as nlkt_tokenize

In [2]:
def replace_specials(text):
    
    specials = {
        '\u0026gt;' : '>',
        '\u0026lt;' : '<',
        '\u0026amp;' : '&',
        '\\n' : ' ',
        '\\r' : ' ',
        '\\"' : '"'
        
    }
    
    for special in specials.keys():
        text = text.replace(special, specials[special])
    
    return text

In [3]:
def naive_tokenize(text):
    return text.split(' ')

def tokenize_with_nlkt(text):
    
    text_clean = replace_specials(text)
    text_lowered = text_clean.lower()
    tokenized = nlkt_tokenize(text_lowered)
    
    return tokenized
    

In [4]:
class DataReader:
    
    def __init__(self, default_tokenize):
        
        self.default_tokenize = default_tokenize
    
    def read_data(self, filepath, tokenize = None):
        
        if tokenize is None:
            tokenize = self.default_tokenize
        
        data = []
        
        my_file = open(filepath, 'r')
        for line in tqdm(my_file.readlines()):
            data.append(tokenize(line))
        
        return data
    
    def read_tags(self, filepath):
        
        tags = []
        
        my_file = open(filepath, 'r')
        for line in tqdm(my_file.readlines()):
            tags.append(int(line))
        
        return tags

In [5]:
dr = DataReader(tokenize_with_nlkt)
train_1 = dr.read_data('data/train_text_1.txt')
tags_train_1 = dr.read_tags('data/train_tags_1.txt')
train_2 = dr.read_data('data/train_text_2.txt')
tags_train_2 = dr.read_tags('data/train_tags_2.txt')

test_1 = dr.read_data('data/test_text_1.txt')
tags_test_1 = dr.read_tags('data/test_tags_1.txt')
test_2 = dr.read_data('data/test_text_2.txt')
tags_test_2 = dr.read_tags('data/test_tags_2.txt')



100%|██████████████████████████████████| 10041/10041 [00:00<00:00, 11044.23it/s]
100%|████████████████████████████████| 10041/10041 [00:00<00:00, 3557010.68it/s]
100%|██████████████████████████████████| 10041/10041 [00:00<00:00, 11535.13it/s]
100%|████████████████████████████████| 10041/10041 [00:00<00:00, 4290444.83it/s]
100%|████████████████████████████████████| 1000/1000 [00:00<00:00, 10890.42it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 3572660.99it/s]
100%|████████████████████████████████████| 1000/1000 [00:00<00:00, 10979.91it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 3350083.07it/s]


In [6]:
class AbstractClassifier:
    
    def __init__(self, classifier):
        
        self.classifier = classifier
    
    def classify(self, tweet):
        
        return np.argmax(self.classifier.tweet_class_distribution(tweet))
            
    def classify_many(self, tweets):
        
        answer = [self.classify(tweet) for tweet in tweets]
        
        return answer
    
    def run_and_save(self, tweets, filename):
        
        answer = self.classify_many(tweets)
        
        f = open(filename, 'w+')
        for ans in answer:
            f.write(str(ans) + '\n')

In [7]:
class WordCounter:
    
    def __init__(self, data, classes, no_classes = 2):
    
        counter = defaultdict(int)
        class_counter = []
        for i in range(no_classes):
            class_counter.append(defaultdict(int))
        
        wordset = []
    
        for tweet, tweet_class in tqdm(zip(data, classes)):
            for word in tweet:
                counter[word] += 1
                class_counter[tweet_class][word] += 1
                wordset.append(word)
        
        self.wordset = list(set(wordset))
        
        self.no_classes = no_classes
        self.counter = counter
        self.class_counter = class_counter
        
        self.save_most_dominant()
        
    def ask(self, word):
        
        total = self.counter[word]
        classes = [self.class_counter[i][word] for i in range(self.no_classes)]
        return (total, np.array(classes))
    
    def ask_distribution(self, word):
        
        total, over_classes = self.ask(word)
        scaled_over_classes = np.array(over_classes, dtype = float)
        scaled_over_classes /= float(total)
        
        return scaled_over_classes

    def save_most_dominant(self, K = 25):
        
        most_dominant = []
        for i in range(self.no_classes):
            most_dominant.append([])
        
        for word in self.wordset:
            
            scaled_over_classes = self.ask_distribution(word)
            
            for i in range(self.no_classes):
                if scaled_over_classes[i] < 0.99 and scaled_over_classes[i] > (1 / self.no_classes):
                    most_dominant[i].append((scaled_over_classes[i], word))
        
        self.keywords = []
        
        for i in range(self.no_classes):
            most_dominant[i] = sorted(most_dominant[i], reverse = True)
            if len(most_dominant[i]) > K:
                most_dominant[i] = most_dominant[i][:K]
            print(most_dominant[i])
            
            self.keywords.append([])
            for frac, word in most_dominant[i]:
                self.keywords[i].append(word)
            
            print(self.keywords[i])
    
    def tweet_class_distribution(self, tweet):
        
        
        res = np.ones(self.no_classes)
        res[0] = 1.01
        
        my_keywords = []
        for i in range(1, self.no_classes):
            my_keywords += self.keywords[i]
        
        my_keywords = list(set(my_keywords))
        
        
        for word in tweet:
            #print(word)
            if word in my_keywords:
                
                weights = 1 + self.ask_distribution(word)
                res *= weights
        
        return res
        
            
        
    
    


In [8]:
wc1 = WordCounter(train_1, tags_train_1, no_classes = 2)
wc2 = WordCounter(train_2, tags_train_2, no_classes = 3)

10041it [00:00, 249743.56it/s]


[(0.9893617021276596, 'chodzi'), (0.9886363636363636, 'p'), (0.9876543209876543, 'jeśli'), (0.9864864864864865, 'wisły'), (0.9857142857142858, 'wtedy'), (0.9857142857142858, 'mamy'), (0.9856115107913669, 'jestem'), (0.9847328244274809, 'mam'), (0.9831932773109243, 'było'), (0.9818181818181818, 'jednak'), (0.9807692307692307, 'zaraz'), (0.9807692307692307, 'temu'), (0.9807692307692307, 'oczywiście'), (0.9803921568627451, 'sumie'), (0.9803921568627451, 'będę'), (0.9803779069767442, ':'), (0.98, 'jakie'), (0.98, 'dobry'), (0.9795918367346939, 'xd'), (0.9795918367346939, 'pracy'), (0.9795918367346939, 'grał'), (0.9787234042553191, 'skoro'), (0.9787234042553191, 'razie'), (0.9777777777777777, 'wszystkich'), (0.9777777777777777, 'mln')]
['chodzi', 'p', 'jeśli', 'wisły', 'wtedy', 'mamy', 'jestem', 'mam', 'było', 'jednak', 'zaraz', 'temu', 'oczywiście', 'sumie', 'będę', ':', 'jakie', 'dobry', 'xd', 'pracy', 'grał', 'skoro', 'razie', 'wszystkich', 'mln']
[(0.8571428571428571, 'pisowska'), (0.85

10041it [00:00, 251724.42it/s]

[(0.9893617021276596, 'chodzi'), (0.9886363636363636, 'p'), (0.9876543209876543, 'jeśli'), (0.9864864864864865, 'wisły'), (0.9857142857142858, 'wtedy'), (0.9857142857142858, 'mamy'), (0.9856115107913669, 'jestem'), (0.9847328244274809, 'mam'), (0.9831932773109243, 'było'), (0.9818181818181818, 'jednak'), (0.9807692307692307, 'zaraz'), (0.9807692307692307, 'temu'), (0.9807692307692307, 'oczywiście'), (0.9803921568627451, 'sumie'), (0.9803921568627451, 'będę'), (0.9803779069767442, ':'), (0.98, 'jakie'), (0.98, 'dobry'), (0.9795918367346939, 'xd'), (0.9795918367346939, 'pracy'), (0.9795918367346939, 'grał'), (0.9787234042553191, 'skoro'), (0.9787234042553191, 'razie'), (0.9777777777777777, 'wszystkich'), (0.9777777777777777, 'mln')]
['chodzi', 'p', 'jeśli', 'wisły', 'wtedy', 'mamy', 'jestem', 'mam', 'było', 'jednak', 'zaraz', 'temu', 'oczywiście', 'sumie', 'będę', ':', 'jakie', 'dobry', 'xd', 'pracy', 'grał', 'skoro', 'razie', 'wszystkich', 'mln']
[(0.75, 'wzór'), (0.6666666666666666, 'z




In [9]:
wc_classifier_1 = AbstractClassifier(wc1)
wc_classifier_2 = AbstractClassifier(wc2)

In [10]:
wc_classifier_1.run_and_save(test_1, 'data/answers1wc.txt')
wc_classifier_2.run_and_save(test_2, 'data/answers2wc.txt')

In [11]:
class NaiveBayes:
    
    def __init__(self, word_counter):

        
        self.no_classes = word_counter.no_classes
        self.wordset = word_counter.wordset
        
        counts = []
        
        for word in word_counter.wordset:
            counts.append(word_counter.ask(word)[1])
        
        counts = np.array(counts)
        
        self.df = pd.DataFrame(index = word_counter.wordset,
                          columns = np.arange(word_counter.no_classes), data = counts)
        
        self.df /= self.df.sum(0)
        
        self.log_df = np.log(1e-100 + self.df)
    
    def tweet_class_distribution(self, tweet):
        
        log_probs = np.zeros(self.no_classes)
    
        apriori_prob = -np.log(self.no_classes)
        prob_d = 0

        for cur_class in range(self.no_classes):
            cur_prob = apriori_prob
            for word in tweet:
                if word not in self.wordset:
                    continue
                cur_prob += (self.log_df.loc[word, cur_class])
            log_probs[cur_class] = cur_prob
            prob_d += np.exp(cur_prob)
    
        prob_d = np.log(prob_d)
    
        probs = np.zeros(self.no_classes)
        
        for cur_class in range(self.no_classes):
            p = log_probs[cur_class] - prob_d
            probs[cur_class] = np.exp(p)
        
        return probs

In [12]:
nb1 = NaiveBayes(wc1)
nb2 = NaiveBayes(wc2)

In [13]:
nb_classifier_1 = AbstractClassifier(nb1)
nb_classifier_2 = AbstractClassifier(nb2)

In [14]:
nb_classifier_1.run_and_save(test_1, 'data/answers1nb.txt')
nb_classifier_2.run_and_save(test_2, 'data/answers2nb.txt')

In [18]:
! perl data/evaluate1.pl answers1wc.txt

Cannot open! at data/evaluate1.pl line 13.
