Análise de sentimentos com validação 

In [1]:
import pandas as pd 
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import RSLPStemmer
import re

In [3]:
df1 = pd.read_table('/home/leticia/classify_texts/sentimentanalysis/imdb_labelled.txt',names = ['text', 'labels'],sep = ' \t')

df2 = pd.read_table('/home/leticia/classify_texts/sentimentanalysis/amazon_cells_labelled.txt',sep ='\t',names = ['text', 'labels'])

df3 = pd.read_table('/home/leticia/classify_texts/sentimentanalysis/yelp_labelled.txt',sep ='\t',names = ['text', 'labels'])

df = pd.concat([df1,df2,df3])

print (df)

                                                  text  labels
0    A very, very, very slow-moving, aimless movie ...       0
1    Not sure who was more lost - the flat characte...       0
2    Attempting artiness with black & white and cle...       0
3          Very little music or anything to speak of.        0
4    The best scene in the movie was when Gerardo i...       1
5    The rest of the movie lacks art, charm, meanin...       0
6                                   Wasted two hours.        0
7    Saw the movie today and thought it was a good ...       1
8                                  A bit predictable.        0
9    Loved the casting of Jimmy Buffet as the scien...       1
10                 And those baby owls were adorable.        1
11   The movie showed a lot of Florida at it's best...       1
12   The Songs Were The Best And The Muppets Were S...       1
13                                    It Was So Cool.        1
14   This is a very "right on case" movie that deli... 

  """Entry point for launching an IPython kernel.


In [4]:
def Tokenize(sentence):
    sentence = sentence.lower()
    sentence = nltk.word_tokenize(sentence)
    return sentence

In [5]:
def Stemming(sentence):
    stemmer = RSLPStemmer()
    phrase = []
    for word in sentence:
        phrase.append(stemmer.stem(word.lower()))
    return phrase

In [7]:
train_size = 0.8

logreg = LogisticRegression()
vectorizer = CountVectorizer(analyzer="word",tokenizer=None)    
pipe = Pipeline([ ('vect', vectorizer),('logreg', logreg)])

X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
df.text, df.labels, train_size=train_size)

df_train = pd.DataFrame(X_train)
df_train['labels']=Y_train
df_train =df_train.reset_index(drop=True)

df_test = pd.DataFrame(X_test,index=None)
df_test['labels']=Y_test
df_test=df_test.reset_index(drop=True)



In [19]:
df_train

Unnamed: 0,text,labels
0,Someone shouldve invented this sooner.,1
1,We loved the biscuits!!!,1
2,"The service was a little slow , considering th...",0
3,On three different occasions I asked for well ...,0
4,They brought a fresh batch of fries and I was ...,0
5,"I had heard good things about this place, but ...",1
6,The selection of food was not the best.,0
7,I have 2-3 bars on my cell phone when I am hom...,0
8,VERY funny!,1
9,At a time when it seems that film animation ha...,1


In [18]:
df_test

Unnamed: 0,text,labels
0,I'll be looking for a new earpiece.,0
1,The restaurant atmosphere was exquisite.,1
2,"I probably won't be back, to be honest.",0
3,If you stay in Vegas you must get breakfast he...,1
4,Poorly contstruct hinge.,0
5,"There is really nothing for me at postinos, ho...",0
6,Its well-designed and very sharp -- the blue i...,1
7,I have this phone and it is a thorn in my side...,0
8,"The camera, although rated at an impressive 1....",0
9,The sangria was about half of a glass wine ful...,0


In [8]:
def Learning(df_train):
    corpus_words = {}
    
    
    for index, row in df.iterrows():
        phrase = row['text']
        phrase = str(phrase)
        phrase = re.sub(r'[.|,|?|!]','', phrase)
        phrase = Tokenize(phrase)
        phrase = Stemming(phrase)
        class_name = row['labels']
        if class_name not in list(corpus_words.keys()):
            corpus_words[class_name] = {}
        for word in phrase:
            if word not in list(corpus_words[class_name].keys()):
                corpus_words[class_name][word] = 1
            else:
                corpus_words[class_name][word] += 1
    
    return corpus_words

In [9]:
data = Learning(df_train)


In [10]:
data


{0: {'a': 420,
  'very': 97,
  'slow-moving': 1,
  'aimles': 1,
  'movi': 107,
  'about': 46,
  'distressed': 1,
  'drifting': 1,
  'young': 1,
  'man': 5,
  'not': 260,
  'sur': 11,
  'who': 23,
  'wa': 332,
  'mor': 33,
  'lost': 7,
  '-': 42,
  'the': 970,
  'flat': 4,
  'charact': 22,
  'or': 52,
  'audienc': 2,
  'nearly': 2,
  'half': 8,
  'of': 314,
  'whom': 2,
  'walked': 5,
  'out': 48,
  'attempting': 1,
  'artines': 1,
  'with': 116,
  'black': 8,
  '&': 14,
  'whit': 3,
  'and': 460,
  'clev': 2,
  'cam': 22,
  'angl': 2,
  'disappointed': 29,
  'becam': 2,
  'even': 48,
  'ridicul': 6,
  'as': 90,
  'acting': 27,
  'po': 27,
  'plot': 23,
  'lin': 16,
  'almost': 10,
  'non-existent': 1,
  'littl': 20,
  'music': 7,
  'anything': 11,
  'to': 361,
  'speak': 3,
  'rest': 4,
  'lack': 7,
  'art': 2,
  'charm': 2,
  'meaning': 2,
  'if': 64,
  'it': 431,
  "'s": 102,
  'emptines': 1,
  'work': 44,
  'i': 546,
  'gues': 8,
  'becaus': 33,
  'empty': 4,
  'wasted': 9,
  'two':

In [11]:
def calculate_class_score(sentence,class_name):
    score = 0 
    sentence = Tokenize(sentence)
    sentence = Stemming(sentence)
    for word in sentence:
        if word in data[class_name]:
            score += data[class_name][word]
    return score

In [21]:
def calculate_score(sentence):

    high_score = 0
    classname = 'default'
    
    
    for index, row in df_test.iterrows():    
        pontos = 0
        classe = row['labels']
        sentence = row['text']
        pontos = calculate_class_score(sentence,classe)
        high_score = 0
        if pontos > high_score:
            high_score = pontos
            
            if classe == 0:
                classname = 'Negative'
                print('\n' + sentence +' | '+ classname)
            else:
                classname = 'Positive'
                print('\n' + sentence +' | '+ classname)
    return classname,high_score

In [22]:
print(calculate_score(df_test))  



I'll be looking for a new earpiece. | Negative

The restaurant atmosphere was exquisite. | Positive

I probably won't be back, to be honest. | Negative

If you stay in Vegas you must get breakfast here at least once. | Positive

Poorly contstruct hinge. | Negative

There is really nothing for me at postinos, hope your experience is better | Negative

Its well-designed and very sharp -- the blue is a very nice color. | Positive

I have this phone and it is a thorn in my side, I really abhor it. | Negative

The camera, although rated at an impressive 1.3 megapixels, renders images that fall well below expectations of such a relatively high resolution. | Negative

The sangria was about half of a glass wine full and was $12, ridiculous. | Negative

The commercials are the most misleading. | Negative

This place is hands-down one of the best places to eat in the Phoenix metro area. | Positive

I don't think I've ever gone to a movie and disliked it as much.  | Negative

This movie is well-

Anyway, I do not think i will go back there. | Negative

Does not fit. | Negative

I had the opportunity today to sample your amazing pizzas! | Positive

I went to Bachi Burger on a friend's recommendation and was not disappointed. | Positive

it did not work in my cell phone plug i am very up set with the charger!. | Negative

This allows the possibility of double booking for the same date and time after the first. | Negative

They know how to make them here. | Positive

Plus, I seriously do not believe it is worth its steep price point. | Negative

But the service was beyond bad. | Negative

Also the music by Mark Snow is possibly the best score I've ever heard.  | Positive

I find this inexcusable and so will probably be returning this phone and perhaps changing carriers. | Negative

I seriously cannot believe that the owner has so many unexperienced employees that all are running around like chickens with their heads cut off. | Negative

I never come again. | Negative

This is a gr

The case is great and works fine with the 680. | Positive

Pros:-Good camera - very nice pictures , also has cool styles like black and white, and more. | Positive

I highly recommend this case. | Positive

The budget was evidently very limited.  | Negative

The only thing that disappoint me is the infra red port (irda). | Negative

I was proven dead wrong by this sushi bar, not only because the quality is great, but the service is fast and the food, impeccable. | Positive

Needless to say, I won't be going back anytime soon. | Negative

The sergeant pepper beef sandwich with auju sauce is an excellent sandwich as well. | Positive

But "Tiny Toons" kept the 90's vibe and delivered one of the most popular, funny, and underrated cartoons ever created.  | Positive

The show would begin with smart ass ed comments to each other that would be totally off the wall and uncalled for.  | Negative

All in all a beautiful directed film from Nicola's roeg wih a sublime cast.  | Positive

Tasted lik

In [23]:
pipe.fit(X_train, Y_train)
accuracy = pipe.score(X_test, Y_test)
msg = "\nAccuracy with {:.0%} of training data: {:.1%}\n".format(train_size, accuracy)
print (msg)
pipe.fit(df.text, df.labels)


Accuracy with 80% of training data: 83.2%



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])