**Задача: определение тональности отзывов при помощи наивного байесовского классификатора.**

In [1]:
import numpy as np
import sklearn
import os
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [8]:
pos_path = "./txt_sentoken/pos/"
neg_path = "./txt_sentoken/neg/"

def convert_text(s):
    # Removes all characters from string except letters and digits and convert letters to lowercase
    return re.sub("[^a-zA-Z0-9]", " ", s.lower())

def read_txts(dir_path="./txt_sentoken/pos/"):
    # Reads all files from directory
    if dir_path[-1] != "/":
        dir_path = dir_path + "/"
    txt_list = []
    for file in os.listdir(dir_path):
        file = dir_path + file
        fin = open(file, 'r')
        txt = " ".join(fin.readlines())
        txt = convert_text(txt)
        txt_list.append(txt)
    return txt_list

def read_datasets(pos_path, neg_path):
    positive = read_txts(pos_path)
    negative = read_txts(neg_path)
    return positive, negative


def split_data(pos, neg):
    positive_train, negative_train = pos[:700], neg[:700]
    positive_test, negative_test = pos[700:],neg[700:]
    return positive_train, negative_train, positive_test, negative_test

def make_labels(data, is_positive = True):
        if is_positive:
            return np.array([1 for x in data])
        else:
            return np.array([0 for x in data])

def score_accuracy(test_Y, predicted):
    return print ("Accuracy is", np.mean(test_Y == predicted))

#### №7 A method to determine review type

In [3]:
def determine_type(review, classifier, vectorizer):
    txt_list = []
    review = convert_text(review)
    txt_list.append(review)
    X = vectorizer.transform(txt_list).toarray()
    prediction = classifier.predict(X)
    if prediction[0] == 1:
        return print('The review is positive')
    else:
        return print ('The review is negative')

#### Preparing data

In [4]:
# Reading data
positive, negative = read_datasets(pos_path, neg_path)

# Splitting data 
positive_train, negative_train, positive_test, negative_test = split_data(positive, negative)

# Preparing labels
pos_train_labels, neg_train_labels = make_labels(positive_train), make_labels(negative_train, False)
pos_test_lables, neg_test_labels = make_labels(positive_test), make_labels(negative_test, False)

# Training data
train_X = positive_train + negative_train
train_Y = np.append(pos_train_labels, neg_train_labels)

# Testing data
test_X = positive_test + negative_test
test_Y = np.append(pos_test_lables, neg_test_labels)

# Training vectorizer
vectorizer = CountVectorizer().fit(train_X)

# Applying vectorization
train_X =vectorizer.transform(train_X).toarray()
test_X = vectorizer.transform(test_X).toarray()

In [5]:
class PoissonNB:
    def __init__(self, class_prior=None):
        """
        class_prior : np.array, size (n_classes,)
        Prior probabilities of the classes. If specified the priors are not
        adjusted according to the data.
        """
        self.probabilities = class_prior
        
    
    def fit(self, X, y, epsilon=1e-9):
        """
        Fit Poisson Naive Bayes according to X, y
        
        Parameters
        ----------
        X : np.array, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : np.array, shape (n_samples,)
            Target values.
        """
        self.epsilon = epsilon
        
        n_samples = X.shape[0]
        n_features = X.shape[1]
        
        classes = set(y)
        self.classes = classes
        splitted_samples = {}
        
        for y_class in classes:
            splitted_samples[y_class] = np.array([])
            
        for i in range(0, n_samples):
            splitted_samples[y[i]] = np.append(splitted_samples[y[i]], X[i])
            
        self.num_classes = len(classes)
        self.lambdas = np.zeros((self.num_classes, n_features))
        
        for y_class in classes:
            # Taking all samples of a single class
            class_samples = splitted_samples[y_class]
            num_class_samples = len(class_samples)

            # Going through each feature of class samples
            self.lambdas[y_class] = np.sum(class_samples, axis=0)

        print('Classifier is trained')

    def calculate_arg_max(self, x):
        maxargs = []
        total_sum = 0.0
        for i in range(0, len(self.classes)):
            for j in range(0, x.shape[0]):
                if self.probabilities is None:
                    probability = np.random.uniform()
                else:
                    probability = self.probabilities[i]

                log = x[j] * np.log(self.lambdas[i][j] + self.epsilon)
                total_sum += log - self.lambdas[i][j] + np.log(probability + self.epsilon)

            maxargs.append(total_sum)

        return maxargs.index(max(maxargs))    

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.
        
        Parameters
        ----------
        X : np.array, shape = [n_samples, n_features]
        
        Returns
        -------
        C : np.array, shape = [n_samples]
            Predicted target values for X
        """
        return np.array([self.calculate_arg_max(x) for x in X])

#### Gaussian classifier

In [9]:
gnb = GaussianNB()
gnb.fit(train_X, train_Y)
y_predicted_GNB = gnb.predict(test_X)
score_accuracy(test_Y, y_predicted_GNB)

Accuracy is 0.621666666667


#### Multinomial classifier

In [10]:
mnb = MultinomialNB()
mnb.fit(train_X, train_Y)
y_predicted_MNB = mnb.predict(test_X)
score_accuracy(test_Y, y_predicted_MNB)

Accuracy is 0.816666666667


#### Poisson classifier

In [59]:
pnb = PoissonNB()
pnb.fit(train_X, train_Y)
y_predicted_PNB = pnb.predict(test_X)
score_accuracy(test_Y, y_predicted_PNB)

Classifier is trained
Accuracy is 0.5


#### Testing method from №7

In [11]:
determine_type("""films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . 
for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . 
to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . 
the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . 
in other words , don't dismiss this film because of its source . 
if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . 
getting the hughes brothers to direct this seems almost as ludicrous as casting carrot top in , well , anything , but riddle me this : who better to direct a film that's set in the ghetto and features really violent street crime than the mad geniuses behind menace ii society ? 
the ghetto in question is , of course , whitechapel in 1888 london's east end . 
it's a filthy , sooty place where the whores ( called " unfortunates " ) are starting to get a little nervous about this mysterious psychopath who has been carving through their profession with surgical precision . 
when the first stiff turns up , copper peter godley ( robbie coltrane , the world is not enough ) calls in inspector frederick abberline ( johnny depp , blow ) to crack the case . 
abberline , a widower , has prophetic dreams he unsuccessfully tries to quell with copious amounts of absinthe and opium . 
upon arriving in whitechapel , he befriends an unfortunate named mary kelly ( heather graham , say it isn't so ) and proceeds to investigate the horribly gruesome crimes that even the police surgeon can't stomach . 
i don't think anyone needs to be briefed on jack the ripper , so i won't go into the particulars here , other than to say moore and campbell have a unique and interesting theory about both the identity of the killer and the reasons he chooses to slay . 
in the comic , they don't bother cloaking the identity of the ripper , but screenwriters terry hayes ( vertical limit ) and rafael yglesias ( les mis ? rables ) do a good job of keeping him hidden from viewers until the very end . 
it's funny to watch the locals blindly point the finger of blame at jews and indians because , after all , an englishman could never be capable of committing such ghastly acts . 
and from hell's ending had me whistling the stonecutters song from the simpsons for days ( " who holds back the electric car/who made steve guttenberg a star ? " ) . 
don't worry - it'll all make sense when you see it . 
now onto from hell's appearance : it's certainly dark and bleak enough , and it's surprising to see how much more it looks like a tim burton film than planet of the apes did ( at times , it seems like sleepy hollow 2 ) . 
the print i saw wasn't completely finished ( both color and music had not been finalized , so no comments about marilyn manson ) , but cinematographer peter deming ( don't say a word ) ably captures the dreariness of victorian-era london and helped make the flashy killing scenes remind me of the crazy flashbacks in twin peaks , even though the violence in the film pales in comparison to that in the black-and-white comic . 
oscar winner martin childs' ( shakespeare in love ) production design turns the original prague surroundings into one creepy place . 
even the acting in from hell is solid , with the dreamy depp turning in a typically strong performance and deftly handling a british accent . 
ians holm ( joe gould's secret ) and richardson ( 102 dalmatians ) log in great supporting roles , but the big surprise here is graham . 
i cringed the first time she opened her mouth , imagining her attempt at an irish accent , but it actually wasn't half bad . 
the film , however , is all good . 
2 : 00 - r for strong violence/gore , sexuality , language and drug content 
 
""", mnb, vectorizer)


The review is positive


#### №8 Сделайте выводы, почему наивный байесовский классификатор плохо или хорошо работает для данной задачи
Вывод: наивный байесовский классификатор является <u>эффективным</u> алгоритмом для классификации текстов, потому что показывает высокую точность классификации, хорошо работает даже с небольшим количеством данных для обучения, относительно прост в реализации, быстр и стабилен, даже не смотря на допущение о независимости признаков.

### Bonus 2
Applying 3 kind of classificators to <b>polarity dataset v0.9</b> (downloaded from http://www.cs.cornell.edu/People/pabo/movie-review-data/)

download link: http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens.zip

#### Preparing data

In [12]:
new_pos_path = "./tokens/pos/"
new_neg_path = "./tokens/neg/"

# Reading data
n_positive, n_negative = read_datasets(new_pos_path, new_neg_path)

# Splitting data
n_positive_train, n_negative_train = n_positive[:500], n_negative[:500]
n_positive_test, n_negative_test = n_positive[500:],n_negative[500:]

# Making labels
n_pos_train_labels, n_neg_train_labels = make_labels(n_positive_train), make_labels(n_negative_train, False)
n_pos_test_lables, n_neg_test_labels = make_labels(n_positive_test), make_labels(n_negative_test, False)

# Training data
new_train_X = n_positive_train + n_negative_train
new_train_Y = np.append(n_pos_train_labels, n_neg_train_labels)

# Testing data
new_test_X = n_positive_test + n_negative_test
new_test_Y = np.append(n_pos_test_lables, n_neg_test_labels)

# Training vectorizer
new_vectorizer = CountVectorizer().fit(new_train_X)

# Applying vectorization
new_train_X = vectorizer.transform(new_train_X).toarray()
new_test_X = vectorizer.transform(new_test_X).toarray()

#### Gaussian Classification

In [13]:
gnb.fit(new_train_X, new_train_Y)
new_y_predicted_GNB = gnb.predict(new_test_X)
score_accuracy(new_test_Y, new_y_predicted_GNB)

Accuracy is 0.635


#### Multinomial classification

In [15]:
mnb.fit(new_train_X, new_train_Y)
new_y_predicted_MNB = mnb.predict(new_test_X)
score_accuracy(new_test_Y, new_y_predicted_MNB)

Accuracy is 0.835


#### Poisson classification

In [17]:
pnb.fit(new_train_X, new_train_Y)
new_y_predicted_PNB = pnb.predict(new_test_X)
score_accuracy(new_test_Y, new_y_predicted_PNB)

Classifier is trained
Accuracy is 0.5
