
# Sentiment - Analyser #

The "score" attribute we stored in every comment entry inside our database is already
a ruff estimation of the sentiment of a comment.<br>
However, this score can currently be any number which can vary substantially depending on the number
of likes a comment received... <br>
Besides that there is also the observation that a high like to dislike ratio on
youtube videos is the defacto standard, this is because human being mostly tend
to like positivity. :) <br>
Therefore we need to to the two following things in order to get a "true" sentiment: <br>

- normalize so that the sentiment is between 0 and 1 (for all comments of a video)

- shift the score a little so that a sentiment of 0.7 is more like 0.5

In [1]:
def sentiment_from_score(video_likes, video_dislikes, min_score, max_score, score):
        ratio = float(video_likes)/(float(video_likes)+float(video_dislikes))
        score, min_score, max_score = float(score), float(min_score), float(max_score)
        scaler = max_score - min_score
        if scaler == 0.0 : scaler = 1
        normalized_score = (( score - min_score ) / scaler) / 2
        assert normalized_score <= 0.5 and normalized_score >= 0.0
        capper = min(1, ratio/0.75)**2 
        normalized_score =  capper * 0.5 + (( score - min_score ) / scaler) / 2
        assert normalized_score <= 1 and normalized_score >= 0.0 
        return normalized_score

## Loading Data ##

Before starting with the machine learning and sentiment analysis
we first have to load the data and clean it! 

In [3]:

from neo4j import GraphDatabase

uri, user, password = 'bolt://localhost:11005', 'neo4j', 'neo4j_'

driver = GraphDatabase.driver(uri, auth=(user, password))
 
words = []
video_max_comment_score = dict()
video_min_comment_score = dict()
comments = []
sentiments = []

with driver.session() as session:
    def _q(query) : return session.run(query)
    #---------------------------------------

    result = _q("MATCH (n:Author)-[c:COMMENTED]->(v:Video) RETURN n, c, v") # remove all graphs and nodes! BE CAREFUL!
 
    records = []

    # First! : We need to do some normalization!
    # Let's find the min/max scores for every video
    for record in result:
        if record['v']['title'] not in video_max_comment_score:
            video_max_comment_score[record['v']['title']] = float(record['c']['score'])
        elif float(video_max_comment_score[record['v']['title']]) > float(record['c']['score']):
            video_max_comment_score[record['v']['title']] = float(record['c']['score'])

        if record['v']['title'] not in video_min_comment_score:
            video_min_comment_score[record['v']['title']] = float(record['c']['score'])
        elif float(video_min_comment_score[record['v']['title']]) < float(record['c']['score']):
            video_min_comment_score[record['v']['title']] = float(record['c']['score'])
            
        if float(record['c']['likes']) > 0 :
            records.append(record)

    for record in records:
        normalized_score = sentiment_from_score(
            record['v']['likes'], 
            record['v']['dislikes'], 
            video_min_comment_score[record['v']['title']], 
            video_max_comment_score[record['v']['title']], 
            record['c']['score']
        )
        sentiments.append(normalized_score)
        
        newComment = (("%s | %s" % (record['v']['title'],record['c']['text'])).lower()
            .replace('❤️', ' heart ').replace('💯', ' 100 ').replace('❤', ' heart ')
            .replace('🙏', ' pray ').replace('😘', ' kiss ').replace('🤗', ' happy ')
            .replace('💥', ' boom ').replace('✔️', ' like ').replace('😍', ' love ')
            .replace('🐱', ' cat ').replace('💔', ' broken heart ').replace('😵', ' confused ') 
            .replace('😄', ' awesome ').replace('👍', ' thumbs up ').replace('😎', ' cool ')
            .replace('🐷', ' pig ').replace('🤘', " rock'n roll ").replace('🤣', ' laughing hard ')
            .replace('😩', ' oh no ').replace('💎', ' diamond ').replace('😊', ' nice ')
            .replace('☺️', ' very nice ').replace('🙃', ' upside down smile ').replace('🤔', ' not sure ')
            .replace('😂', ' laughing ')
            .replace('!!!!', '!').replace('!!!', '!').replace('!!', '!')
            .replace('????', '?').replace('???', '?').replace('??', '?')
            .replace('oooo','o').replace('oooo','o').replace('ooo','o').replace('oo','o')
            .replace('..','...').replace('......','...').replace('....','...').replace('....','...').replace('...',' ... ')
            .replace('    ', ' ').replace('   ', ' ').replace('  ', ' ')
            .split() 
        )
        print(newComment, "-|-", float(record['c']['score']))
        words.extend(newComment)
        comments.append(newComment)
    
    #---------------------------------------
driver.close()
 
#from pprint import pprint
#pprint(comments)
    

['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'great', 'series'] -|- 1.9347826086956523
['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'please', 'upload', '2', '3', 'videos', 'weekly'] -|- 0.9673913043478262
['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'your', 'space', 'invader', 'game', 'tutorial', 'is', 'amazing!', 'hope', 'to', 'see', 'more', 'videos', 'on', 'python', 'during', 'quarantine!'] -|- 0.9673913043478262
['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'love', 'the', 'hashtags', 'lol'] -|- 0.9673913043478262
['sentiment', 'analysis', 'python', '-', '3', '-', 'cleaning', 'text', 'for', 'natural', 'language', 'processing', '(nlp)', '|', 'hey', 'man', 'long', 'time', '!'] -|- 0.9669421487603307
['sentiment', 'analysis', 'pytho

In [4]:
print(comments[56])
print(sentiments[56])

['grovy', 'tutorial:', 'learn', 'grovy', 'dsl', 'by', 'example', 'in', '20', 'minutes', '|', 'thanks', 'for', 'sharing', 'your', 'knowledge', '-', 'much', 'appreciated.']
0.9285714285714286


In [5]:
print(comments[79])
print(sentiments[79])

['if', 'programming', 'was', 'an', 'anime', '|', "'time", 'to', 'bring', 'out', 'my', 'secret', "weapon'", 'stackoverflow', 'has', 'entered', 'the', 'chat']
0.9971862391165853


In [8]:
worst_i = -1
worst_score = 1
for i, score in enumerate( sentiments ):
    #print(i,score)
    if score < worst_score : 
        worst_score = score
        worst_i = i
        
print(comments[worst_i])
print(sentiments[worst_i])

['asmr*', 'stewed', "dog's", 'thighs.', 'extremely', 'delicious', 'and', 'nutritious(eating', 'sounds)', 'no', 'talking', '|', 'vi', 'asmr', '|', 'coi', 'chừng', 'bị', 'quật', 'á', 'chị,', 'gì', 'mà', 'ăn', 'chó', 'mèo', 'không', 'vậy?', 'chị', 'có', 'còn', 'là', 'con', 'người', 'ko', 'vậy?']
0.018602992696261058


## Let's clean the text with spark ##

The comments are currently still full of redundancies and
many uncommon tokens / word.
In order to be able to filter the comments we use spark to count their occurrences.

In [9]:
from pyspark import SparkContext
 
sc = SparkContext("local", "First App")
 
wordsRDD = sc.parallelize(words, 6)

zipped = (wordsRDD
            .map(lambda word : (word,1))
            .reduceByKey( lambda a, b : a+b )
            .collect())
sc.stop()                  

occurrences = {}

for word, count in zipped:
    occurrences[word] = count


...and now let's remove rare tokens / words in order to cap the maximum amount of possible words a bit...

In [10]:

def more_than_three(word):
    return occurrences[word] > 3

for i, comment in enumerate(comments): 
    comments[i] = list(filter(more_than_three, comment))

We also filter all comments which are under and over a certain size
in order to make training easier.

In [15]:

def removed_short_comments(all_comments, all_sentiments):
    new_comments, new_sentiments = [], []
    
    for i, comment in enumerate(all_comments): 
        if len(comment) > 12 :
            new_comments.append(comment)
            new_sentiments.append(all_sentiments[i])
        
    return new_comments, new_sentiments

print(len(comments))
print(len(sentiments))

clean_comments, clean_sentiments = removed_short_comments(comments, sentiments)

print(len(clean_comments))
print(len(clean_sentiments))


685
685
575
575


In [16]:
print(clean_comments)

[['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'great', 'series'], ['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'please', '2', '3', 'videos'], ['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'your', 'game', 'tutorial', 'is', 'hope', 'to', 'see', 'more', 'videos', 'on', 'python', 'during'], ['sentiment', 'analysis', 'python', '-', '1', '-', 'introduction', 'to', 'emotion', 'analysis', '(nlp)', '|', 'love', 'the', 'lol'], ['sentiment', 'analysis', 'python', '-', '3', '-', 'for', 'language', '(nlp)', '|', 'hey', 'man', 'long', 'time', '!'], ['sentiment', 'analysis', 'python', '-', '3', '-', 'for', 'language', '(nlp)', '|', 'thank', 'you', '...'], ['sentiment', 'analysis', 'python', '-', '3', '-', 'for', 'language', '(nlp)', '|', 'keep', 'up', 'the', 'work', 'man', 'we', 'love', 'it.', 'at', 'the', 'en

---

# Sentiment Analysis with Pytorch #

Let's train a neural network on our data!

In [17]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(42)

<torch._C.Generator at 0x21769846310>

In [70]:
# src/inspiration: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
class LSTMScorePredictor(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim, actifun='softmax'):
        super(LSTMScorePredictor, self).__init__() 
        self.hidden_dim = hidden_dim
        self.actifun = actifun

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
            
        # The linear layer that maps from hidden state space to tag space
        self.lin = nn.Linear(hidden_dim, output_dim)

    def forward(self, sentence): 
        #print('sentence: ', sentence)
        embeds = self.word_embeddings(sentence)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        out = self.lin(lstm_out.view(len(sentence), -1))
        if self.actifun == 'softmax' : out = F.log_softmax(out, dim=1)
        elif self.actifun == 'sigmoid' : out = F.sigmoid(out) 
        
        return out

## Working with the model: ##

In [153]:

# Word to index encoding...
class Encoder:
    
    def __init__(self, training_data):
        self.word_to_ix = {}
        # For each words-list (sentence) and tags-list in each tuple of training_data
        for sent, label in training_data:
            for word in sent:
                if word not in self.word_to_ix:  # word has not been assigned an index yet
                    self.word_to_ix[word] = len(self.word_to_ix)  # Assign each word with a unique index
        #print(self.word_to_ix)
        self.label_to_ix = {"DET": 0, "NN": 1, "V": 2}  # Assign each tag with a unique index

    def sequence_words(self, seq):
        idxs = [self.word_to_ix[w] for w in seq]
        return torch.tensor(idxs, dtype=torch.long)
    
    
    def sequence_labels(self, seq):
        idxs = [self.label_to_ix[w] for w in seq]
        return torch.tensor(idxs, dtype=torch.long)

In [157]:

def exec_trial(
    training_data, 
    loss_function=None, 
    optimizer=None,
    epochs=300,
    do_label_encode=True,
    actifun='softmax', 
    embedding_dim=6, # These will usually be more like 32 or 64 dimensional.
    hidden_dim=6,# We will keep them small, so we can see how the weights change as we train.
    enc=None
):
    torch.manual_seed(42) 
    if enc == None: enc = Encoder(training_data)
        
    if not do_label_encode: output_dim = 1
    else: output_dim = len(enc.label_to_ix)
    
    model = LSTMScorePredictor(
        embedding_dim=embedding_dim, 
        hidden_dim=hidden_dim, 
        vocab_size=len(enc.word_to_ix), 
        output_dim=output_dim,
        actifun=actifun
    )
    if loss_function == 'mse': loss_function = nn.MSELoss()
    elif loss_function == None: loss_function = nn.NLLLoss()
    
    if optimizer == 'adam': optimizer = optim.ADAM(model.parameters(), lr=0.0001)
    elif optimizer == None: optimizer = optim.SGD(model.parameters(), lr=0.1)
  
    for epoch in range(epochs):  
        print('Epoch',epoch)
        for sentence, tags in training_data: 
            model.zero_grad() 
            sentence_in = enc.sequence_words(sentence)
            if do_label_encode: targets = enc.sequence_labels(tags)
            else: targets = tags 
            tag_scores = model(sentence_in) 
            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()
        print('loss =',loss.item())
            
    return model

## Unit-Testing the Model: ##

In [158]:
 
# Example training data:
training_data = [ 
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

model = exec_trial(training_data)

# See what the scores are after training
with torch.no_grad():
    inputs = enc.sequence_words(training_data[0][0])
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print(inputs)
    print(tag_scores)
    
assert str(inputs) == """tensor([0, 1, 2, 3, 4])"""
assert str(tag_scores) == """tensor([[-0.0259, -4.5027, -4.2358],
        [-4.5316, -0.0617, -3.0154],
        [-2.6551, -2.9622, -0.1301],
        [-0.1583, -3.9917, -2.0562],
        [-4.2915, -0.0241, -4.5954]])"""


print('Unit test successful! :)')

Epoch 0
loss = 1.1654255390167236
Epoch 1
loss = 1.1565314531326294
Epoch 2
loss = 1.148512840270996
Epoch 3
loss = 1.141252040863037
Epoch 4
loss = 1.134647011756897
Epoch 5
loss = 1.1286096572875977
Epoch 6
loss = 1.1230638027191162
Epoch 7
loss = 1.1179441213607788
Epoch 8
loss = 1.1131938695907593
Epoch 9
loss = 1.1087646484375
Epoch 10
loss = 1.1046143770217896
Epoch 11
loss = 1.1007071733474731
Epoch 12
loss = 1.0970114469528198
Epoch 13
loss = 1.0935003757476807
Epoch 14
loss = 1.0901503562927246
Epoch 15
loss = 1.086941123008728
Epoch 16
loss = 1.0838547945022583
Epoch 17
loss = 1.0808758735656738
Epoch 18
loss = 1.0779906511306763
Epoch 19
loss = 1.0751869678497314
Epoch 20
loss = 1.0724540948867798
Epoch 21
loss = 1.0697824954986572
Epoch 22
loss = 1.0671638250350952
Epoch 23
loss = 1.0645900964736938
Epoch 24
loss = 1.0620546340942383
Epoch 25
loss = 1.0595508813858032
Epoch 26
loss = 1.0570732355117798
Epoch 27
loss = 1.0546163320541382
Epoch 28
loss = 1.0521754026412964
Ep

loss = 0.07790349423885345
Epoch 246
loss = 0.07686430960893631
Epoch 247
loss = 0.07584697008132935
Epoch 248
loss = 0.07485097646713257
Epoch 249
loss = 0.07387590408325195
Epoch 250
loss = 0.07292123138904572
Epoch 251
loss = 0.07198630273342133
Epoch 252
loss = 0.07107098400592804
Epoch 253
loss = 0.07017450034618378
Epoch 254
loss = 0.06929653882980347
Epoch 255
loss = 0.06843660026788712
Epoch 256
loss = 0.06759430468082428
Epoch 257
loss = 0.06676913052797318
Epoch 258
loss = 0.0659608244895935
Epoch 259
loss = 0.06516876816749573
Epoch 260
loss = 0.06439273059368134
Epoch 261
loss = 0.06363218277692795
Epoch 262
loss = 0.06288691610097885
Epoch 263
loss = 0.06215636804699898
Epoch 264
loss = 0.06144030764698982
Epoch 265
loss = 0.06073833256959915
Epoch 266
loss = 0.06005008891224861
Epoch 267
loss = 0.059375207871198654
Epoch 268
loss = 0.05871344357728958
Epoch 269
loss = 0.058064401149749756
Epoch 270
loss = 0.057427823543548584
Epoch 271
loss = 0.05680333822965622
Epoch 272

In [151]:
# Test training data: 
training_data = [ 
    ("The dog ate the apple".split(), torch.tensor([[1], [0], [0.5], [0.3], [1]])),
    ("Everybody read that book".split(), torch.tensor([[[0.2], [0], [1], [0]]]))
]
model = exec_trial(
    training_data, 
    loss_function='mse', 
    do_label_encode=False,
    actifun='sigmoid',
    epochs=500
)
      
# See what the scores are after training
with torch.no_grad():
    inputs = enc.sequence_words(training_data[0][0])
    pred1 = model(inputs) 
    inputs = enc.sequence_words(training_data[1][0])
    pred2 = model(inputs)
          
assert hash(str(pred1.cpu().detach().numpy())) == 8299978301676431484
assert hash(str(pred2.cpu().detach().numpy())) == 4865758014180741225

print('Unit test successful! :)')

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
Epoch 0
loss = 0.26555222272872925
Epoch 1
loss = 0.26313287019729614
Epoch 2
loss = 0.2607865333557129
Epoch 3
loss = 0.2585119605064392
Epoch 4
loss = 0.25630760192871094
Epoch 5
loss = 0.25417202711105347
Epoch 6
loss = 0.2521035671234131
Epoch 7
loss = 0.25010067224502563
Epoch 8
loss = 0.24816171824932098
Epoch 9
loss = 0.24628493189811707
Epoch 10
loss = 0.24446865916252136
Epoch 11
loss = 0.2427111566066742
Epoch 12
loss = 0.24101075530052185
Epoch 13
loss = 0.23936571180820465
Epoch 14
loss = 0.2377743124961853
Epoch 15
loss = 0.2362349033355713
Epoch 16
loss = 0.23474577069282532
Epoch 17
loss = 0.23330527544021606
Epoch 18
loss = 0.2319118082523346
Epoch 19
loss = 0.23056377470493317
Epoch 20
loss = 0.22925961017608643
Epoch 21
loss = 0.2279978096485138
Epoch 22
loss = 0.2267768681049347
Epoch 23
loss = 0.22559532523155212
Epoch 24
loss = 0.22445175051689148
Epoch 25
loss = 0

loss = 0.12050718069076538
Epoch 246
loss = 0.11978715658187866
Epoch 247
loss = 0.11906775087118149
Epoch 248
loss = 0.11834906041622162
Epoch 249
loss = 0.11763130128383636
Epoch 250
loss = 0.11691459268331528
Epoch 251
loss = 0.1161990761756897
Epoch 252
loss = 0.11548492312431335
Epoch 253
loss = 0.11477230489253998
Epoch 254
loss = 0.11406134814023972
Epoch 255
loss = 0.11335219442844391
Epoch 256
loss = 0.11264500021934509
Epoch 257
loss = 0.11193986237049103
Epoch 258
loss = 0.11123697459697723
Epoch 259
loss = 0.11053644120693207
Epoch 260
loss = 0.10983836650848389
Epoch 261
loss = 0.10914289206266403
Epoch 262
loss = 0.10845017433166504
Epoch 263
loss = 0.10776026546955109
Epoch 264
loss = 0.10707330703735352
Epoch 265
loss = 0.10638943314552307
Epoch 266
loss = 0.10570873320102692
Epoch 267
loss = 0.10503127425909042
Epoch 268
loss = 0.10435719788074493
Epoch 269
loss = 0.10368658602237701
Epoch 270
loss = 0.10301952064037323
Epoch 271
loss = 0.10235609114170074
Epoch 272
lo

loss = 0.034644708037376404
Epoch 487
loss = 0.03452007472515106
Epoch 488
loss = 0.034396253526210785
Epoch 489
loss = 0.03427322208881378
Epoch 490
loss = 0.034150995314121246
Epoch 491
loss = 0.03402954339981079
Epoch 492
loss = 0.03390888124704361
Epoch 493
loss = 0.03378898650407791
Epoch 494
loss = 0.0336698442697525
Epoch 495
loss = 0.03355148434638977
Epoch 496
loss = 0.033433858305215836
Epoch 497
loss = 0.033316999673843384
Epoch 498
loss = 0.03320087492465973
Epoch 499
loss = 0.03308549150824547
Unit test successful! :)


## Predicting the Sentiment Score ##

In [177]:
import random 

def list_splitter(list_to_split, ratio):
    elements = len(list_to_split)
    middle = int(elements * ratio)
    return [list_to_split[:middle], list_to_split[middle:]]


clean_training_data = []

for i, comment in enumerate(clean_comments):
    clean_training_data.append(
        (comment, torch.tensor([[clean_sentiments[i]]]*len(comment)))
    )
    
random.Random(66642999).shuffle(clean_training_data)
    
train, test = list_splitter(clean_training_data, 0.8)

print(len(train),'/',len(test))

460 / 115


In [172]:

    
enc = Encoder(clean_training_data)
    
model = exec_trial(
    training_data=train, 
    loss_function='mse', 
    do_label_encode=False,
    actifun='sigmoid',
    epochs=5,
    enc=enc
)

#result = model.eval(test)
#print(result)


460 / 115
Epoch 0
loss = 0.019581208005547523
Epoch 1
loss = 0.016592737287282944
Epoch 2
loss = 0.016085302457213402
Epoch 3
loss = 0.0158854890614748
Epoch 4
loss = 0.015727277845144272


In [173]:
val_losses = []
loss_function = nn.MSELoss()

with torch.no_grad():
    for x_val, y_val in test: 
        model.eval() 
        sentence_in = enc.sequence_words(x_val)
        yhat = model(sentence_in)
        val_loss = loss_function(y_val, yhat)
        val_losses.append(val_loss.item())
        print(x_val," : ",yhat.mean().item())

['every', 'programming', 'tutorial', '|', 'makes', 'your', 'computer', 'do', 'lol', 'that', 'really', 'me', 'lmao']  :  0.850274384021759
['flea', 'market', 'montgomery', '-', 'long', 'version', '|', 'that', 'lok', 'give', 'you', 'to', 'something', 'back']  :  0.8529284596443176
['january', 'sales', 'bean!', '|', 'mr', 'bean', 'funny', 'clips', '|', 'mr', 'bean', 'official', '|', "it's", 'really', 'how', 'mr', 'bean', 'out', 'of', 'his', 'way', 'to', 'that', 'very', 'nice']  :  0.856817901134491
['the', 'dirty', 'details', 'of', 'the', 'new', '$900b', 'stimulus', 'bill', '|', 'i', 'wish', 'the', 'and', 'would', 'go', 'to']  :  0.8490017056465149
['teho', 'teardo', '&', 'blixa', 'bargeld', '-', 'a', 'quiet', 'life', '(not', 'the', 'video)', '|', 'this', 'song', 'has', 'a', 'trying', 'to', 'something', 'but', 'never', 'to', 'from', 'what', 'really', 'is']  :  0.852142870426178
['trump', 'shortens', 'vacation,', 'returns', 'to', 'white', 'house', '|', 'is', 'so', 'he', "isn't", 'a', 'or',

 ---
 
 # Doing some classical Regression #

In [189]:
train_X, train_Y = [], []
test_X, test_Y = [], []

for x, y in train: 
    train_X.append(' '.join(x))
    train_Y.append(y.mean().item())
    
for x, y in test:
    test_X.append(' '.join(x))
    test_Y.append(y.mean().item())
     

In [192]:

import sklearn
import sklearn.feature_extraction
import sklearn.linear_model

vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))


tf_features_train = vectorizer.fit_transform(train_X)
tf_features_test = vectorizer.transform(test_X)
print (tf_features_train.shape, tf_features_test.shape)

train_labels = [1 if sentiment>0.5 else 0 for sentiment in train_Y]
test_labels = [1 if sentiment>0.5 else 0 for sentiment in test_Y]
print (len(train_labels), len(test_labels))

clf = sklearn.linear_model.LogisticRegression()
clf.fit(tf_features_train, train_labels)
print (clf)

(460, 550) (115, 550)
460 115
LogisticRegression()


In [193]:
predictions = clf.predict(tf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

              precision    recall  f1-score   support

    Negative       1.00      0.43      0.60        14
    Positive       0.93      1.00      0.96       101

    accuracy                           0.93       115
   macro avg       0.96      0.71      0.78       115
weighted avg       0.94      0.93      0.92       115

[[  6   8]
 [  0 101]]
