In [None]:
from data_reader import *
from abstract_classifier import AbstractClassifier

In [None]:
# One reader instance is shared by every load below.
# NOTE(review): `tokenize_with_nlkt` and `DataReader` are presumably provided by
# the star import from data_reader — confirm the spelling against that module.
dr = DataReader(tokenize_with_nlkt)

# Training data: each text file is paired with the tags file of the same task.
train_1, tags_train_1 = (
    dr.read_data('data/train_text_1.txt'),
    dr.read_tags('data/train_tags_1.txt'),
)
train_2, tags_train_2 = (
    dr.read_data('data/train_text_2.txt'),
    dr.read_tags('data/train_tags_2.txt'),
)

# Test data, loaded in the same text-then-tags order.
test_1, tags_test_1 = (
    dr.read_data('data/test_text_1.txt'),
    dr.read_tags('data/test_tags_1.txt'),
)
test_2, tags_test_2 = (
    dr.read_data('data/test_text_2.txt'),
    dr.read_tags('data/test_tags_2.txt'),
)



In [None]:
class WordCounter:
    """Word-frequency statistics over a labelled corpus of tokenized tweets.

    Attributes set by the constructor:
        wordset: list of the distinct tokens seen in the corpus.
        no_classes: number of classes the counters are split into.
        counter: mapping token -> total number of occurrences.
        class_counter: list (one entry per class) of mappings
            token -> number of occurrences inside that class.
        keywords: list (one entry per class) of the tokens selected by
            ``save_most_dominant``.
    """

    def __init__(self, data, classes, no_classes = 2):
        """Count every token overall and per class, then select keywords.

        Args:
            data: iterable of tweets; each tweet is an iterable of hashable
                tokens.
            classes: iterable of labels aligned with ``data``; each label is
                used as an index into a list of ``no_classes`` counters.
            no_classes: number of per-class counters to keep.
        """
        counter = defaultdict(int)
        class_counter = [defaultdict(int) for _ in range(no_classes)]

        for tweet, tweet_class in tqdm(zip(data, classes)):
            for word in tweet:
                counter[word] += 1
                class_counter[tweet_class][word] += 1

        # The vocabulary is exactly the key set of the global counter, so there
        # is no need to accumulate every token occurrence in a separate list.
        self.wordset = list(counter)

        self.no_classes = no_classes
        self.counter = counter
        self.class_counter = class_counter

        self.save_most_dominant()

    def ask(self, word):
        """Return ``(total, per_class)`` occurrence counts for ``word``.

        ``per_class`` is a numpy array with one count per class. Unseen words
        yield zeros; ``.get`` is used so that a lookup never inserts a new key
        into the underlying counters.
        """
        total = self.counter.get(word, 0)
        per_class = [self.class_counter[i].get(word, 0) for i in range(self.no_classes)]
        return (total, np.array(per_class))

    def ask_distribution(self, word):
        """Return the fraction of ``word`` occurrences falling in each class.

        For a word that was never counted the result is all zeros instead of
        the NaNs a division by zero would produce.
        """
        total, over_classes = self.ask(word)
        scaled_over_classes = np.array(over_classes, dtype = float)
        if total:
            scaled_over_classes /= float(total)

        return scaled_over_classes

    def save_most_dominant(self, K = 25, verbose = True):
        """Select up to ``K`` keywords per class and store them in ``self.keywords``.

        A word is a candidate for class ``i`` when its share of occurrences in
        that class is strictly above the uniform share ``1 / no_classes`` and
        strictly below 0.99. Candidates are ranked by share (ties broken by the
        word itself), highest first.

        Args:
            K: maximum number of keywords kept per class.
            verbose: when true (the default), print the ranked
                ``(share, word)`` pairs and the resulting keyword list for
                every class.
        """
        # Float literal keeps this a true division under Python 2 as well.
        uniform_share = 1.0 / self.no_classes if self.no_classes else 0.0

        candidates = [[] for _ in range(self.no_classes)]
        for word in self.wordset:
            shares = self.ask_distribution(word)
            for i in range(self.no_classes):
                if uniform_share < shares[i] < 0.99:
                    candidates[i].append((shares[i], word))

        self.keywords = []
        for i in range(self.no_classes):
            top = sorted(candidates[i], reverse = True)[:K]
            if verbose:
                print(top)

            self.keywords.append([word for _, word in top])
            if verbose:
                print(self.keywords[i])

    def tweet_class_distribution(self, tweet):
        """Score ``tweet`` against every class using the stored keywords.

        Every class starts at 1, except class 0 which starts at 1.01, so class 0
        has the highest score when the tweet contains no keyword. For each token
        that is a keyword of any class other than 0, all scores are multiplied
        by ``1 + share`` of that token in the respective class.

        Returns:
            numpy array of ``no_classes`` unnormalised scores.
        """
        res = np.ones(self.no_classes)
        res[0] = 1.01

        # Set membership keeps the per-token test constant time.
        my_keywords = set()
        for i in range(1, self.no_classes):
            my_keywords.update(self.keywords[i])

        for word in tweet:
            if word in my_keywords:
                res *= 1 + self.ask_distribution(word)

        return res
        
            
        
    
    


In [None]:
# One WordCounter per task: the first corpus is counted over 2 classes,
# the second over 3.
wc1 = WordCounter(data = train_1, classes = tags_train_1, no_classes = 2)
wc2 = WordCounter(data = train_2, classes = tags_train_2, no_classes = 3)

In [None]:
# Wrap each fitted WordCounter in an AbstractClassifier, in task order.
wc_classifier_1, wc_classifier_2 = map(AbstractClassifier, (wc1, wc2))

In [None]:
# Run the keyword-based classifiers on the test tweets of each task.
# NOTE(review): run_and_save is defined in abstract_classifier (not visible
# here); presumably it classifies every tweet and writes the answers to the
# given path — confirm.
wc_classifier_1.run_and_save(test_1, 'data/answers1wc.txt')
wc_classifier_2.run_and_save(test_2, 'data/answers2wc.txt')

In [None]:
class NaiveBayes:
    """Naive Bayes scorer built from the per-class word counts of a word counter.

    Attributes:
        no_classes: number of classes, copied from the word counter.
        wordset: vocabulary, copied from the word counter.
        df: DataFrame (rows = words, columns = class indices) holding each
            word's count divided by the total count of its class column.
        log_df: element-wise ``log(1e-100 + df)``; the 1e-100 floor keeps the
            logarithm finite for words that never occur in a class.
    """

    def __init__(self, word_counter):
        """Build the per-class word probability tables.

        Args:
            word_counter: object exposing ``no_classes``, ``wordset`` and
                ``ask(word)`` whose second return value holds the per-class
                counts of ``word``.
        """
        self.no_classes = word_counter.no_classes
        self.wordset = word_counter.wordset

        # reshape keeps the table two-dimensional even for an empty vocabulary.
        counts = np.array(
            [word_counter.ask(word)[1] for word in self.wordset], dtype = float
        ).reshape(-1, self.no_classes)

        counts_df = pd.DataFrame(index = self.wordset,
                                 columns = np.arange(self.no_classes), data = counts)

        # A class without any counted word would otherwise turn its whole
        # column into NaN (0 / 0); dividing by 1 leaves that column at zero.
        class_totals = counts_df.sum(0)
        class_totals = class_totals.where(class_totals != 0, 1.0)

        self.df = counts_df / class_totals

        self.log_df = np.log(1e-100 + self.df)

    def tweet_class_distribution(self, tweet):
        """Return the normalised class probabilities for ``tweet``.

        Every class starts from the same uniform prior ``1 / no_classes``.
        Tokens missing from the vocabulary are ignored; repeated tokens
        contribute once per occurrence.

        Args:
            tweet: iterable of tokens.

        Returns:
            numpy array of ``no_classes`` probabilities summing to 1.
        """
        # Index membership is a hash lookup, unlike scanning the wordset list.
        vocabulary = self.log_df.index
        known_words = [word for word in tweet if word in vocabulary]

        log_probs = np.full(self.no_classes, -np.log(self.no_classes))
        if known_words:
            # One vectorised row selection instead of a scalar lookup per
            # (word, class) pair.
            log_probs = log_probs + self.log_df.loc[known_words].values.sum(axis = 0)

        # Normalise in log space: subtracting the maximum before exponentiating
        # keeps the largest term at exp(0) == 1, so the denominator cannot
        # underflow to zero for long tweets.
        shifted = np.exp(log_probs - log_probs.max())
        return shifted / shifted.sum()

In [None]:
# Build the naive Bayes tables from the word counters already fitted per task.
nb1 = NaiveBayes(word_counter = wc1)
nb2 = NaiveBayes(word_counter = wc2)

In [None]:
# Wrap each naive Bayes model in an AbstractClassifier, in task order.
nb_classifier_1, nb_classifier_2 = map(AbstractClassifier, (nb1, nb2))

In [None]:
# Run the naive Bayes classifiers on the test tweets of each task.
# NOTE(review): run_and_save is defined in abstract_classifier (not visible
# here); presumably it classifies every tweet and writes the answers to the
# given path — confirm.
nb_classifier_1.run_and_save(test_1, 'data/answers1nb.txt')
nb_classifier_2.run_and_save(test_2, 'data/answers2nb.txt')