In [None]:
# Varaible  for which dataset to examine
dataset_to_analyze = "./data/processed/dataset_50.0%.csv"

In [1]:
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import random
import pandas as pd
from os import listdir

nltk.download('stopwords')
nltk.download('punkt')

exclude = set(string.punctuation)
negation = set(["aren't", "isn't", "wasn't", "weren't", "can't", "couldn't", 
    "mustn't", "shouldn't", "won't", "wouldn't", "didn't", "doesn't", "don't", 
    "hasn't", "haven't", "hadn't", "not"])
stop_words = set(stopwords.words('english'))

def remove_punctuation(str_to_alter):
    global exclude
    new_string = ''.join(ch for ch in str_to_alter if ch not in exclude)

    return new_string

def has_digits(s):
    return any(ch.isdigit() for ch in s)

def is_negation(s):
    global negation

    return s in negation

def is_stopword(s):
    global stop_words

    return s in stop_words

class Sample:
    def __init__(self, body, target_class):
        self.body = body.lower()
        self.length = len(body)
        self.target_class = int(target_class)
        self.exclamation_point_count = self.body.count('!')
        self.question_mark_count = self.body.count('?')
        word_tokens = word_tokenize(self.body) 
        self.tokens = list()

        word_tokens = word_tokenize(self.body) 
            
        filtered_sentence = [] 
        
        for w in word_tokens: 
            if not is_stopword(w): 
                filtered_sentence.append(w) 

        ps =PorterStemmer()
        negation = False
        for w in filtered_sentence:
            root_word=w
            if root_word == "n't" or root_word == 'not':
                negation = not negation
                continue
            root_word = remove_punctuation(root_word)
            if root_word == 'br':
                negation = False
                continue
            elif len(root_word) < 2:
                negation = False
                continue
            elif has_digits(root_word):
                negation = False
                continue
            elif is_stopword(root_word):
                negation = False
                continue
            if negation: 
                root_word = "".join(('!',root_word))
                negation = False
            self.tokens.append(root_word)

    def __str__(self) -> str:
        return self.body if len(self.body) < 100 else self.body[:100]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\isacm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\isacm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
pos_dataset = list()
neg_dataset = list()
test_dataset = list()

with open(dataset_to_analyze, 'r', encoding='utf8') as file:
    file_content = file.readlines()

file_content = file_content[1:]

df = pd.read_csv(f"./data/tweets.csv")

pos_tweets = []
neg_tweets = []

random.seed(0)
for b, s in zip(df['text'], df['sentiment']):
    if not isinstance(b, str):
        break
    if s == 'neutral':
        continue
    x = Sample(b, 1 if s == 'positive' else 0)
    if x.target_class == 1:
        pos_tweets.append(x)
    else:
        neg_tweets.append(x)

for row in file_content:
    raw_sample = row.rstrip().split('\t')
    if len(raw_sample[1]) > 5000:
        continue
    if random.random() < 0.25:
        test_dataset.append(Sample(raw_sample[1], 0 if raw_sample[0] == '5.0' else 1))
    elif raw_sample[0] == '1.0':
        neg_dataset.append(Sample(raw_sample[1], 0))
    else:
        pos_dataset.append(Sample(raw_sample[1], 1))

print(len(pos_dataset), len(neg_dataset), len(test_dataset))

5530 5514 3677


In [3]:
from collections import Counter
pos_counter_dict = Counter(token for sample in pos_dataset for token in set(sample.tokens))
neg_counter_dict = Counter(token for sample in neg_dataset for token in set(sample.tokens))

for dataset in [pos_dataset, neg_dataset]:
    ex, ques = 0,0
    for sample in dataset:
        ex += sample.exclamation_point_count
        ques += sample.question_mark_count

    print(ex, ques)

dif_list = []
seen = set()

for counter_dict in [pos_counter_dict, neg_counter_dict]:
    for k in sorted(counter_dict, key=counter_dict.get, reverse=True):
        if k in seen:
            continue
        seen.add(k)
        dif_list.append((k, pos_counter_dict[k] - neg_counter_dict[k]))

dif_list = sorted(dif_list, key=lambda x: x[1], reverse=True)




4095 212
3284 662


In [4]:
print(f"Avg length for positive reviews: {sum(s.length for s in pos_dataset)/len(pos_dataset)}")
print(f"Avg length for negative reviews: {sum(s.length for s in neg_dataset)/len(neg_dataset)}")
print(f"Avg no '!'/positive reviews: {sum(s.exclamation_point_count for s in pos_dataset)/len(pos_dataset)}")
print(f"Avg no '!'/negative reviews: {sum(s.exclamation_point_count for s in neg_dataset)/len(neg_dataset)}")
print(f"Avg no '?'/positive reviews: {sum(s.question_mark_count for s in pos_dataset)/len(pos_dataset)}")
print(f"Avg no '?'/negative reviews: {sum(s.question_mark_count for s in neg_dataset)/len(neg_dataset)}")

print("\n" + dataset_to_analyze)

print('+' +'-'*23+'+'+'-'*23+ '+')
print("| Best (pos-neg)".ljust(23)+' |'+' Worst (neg-pos)'.ljust(23) + '|')
print('|' +'-'*23+'+'+'-'*23+ '|')
for best, worst in zip(dif_list[:10], dif_list[:-10:-1]):
    print(f'| {best[0].ljust(15)}{str(best[1]).rjust(6)} ', end="|")
    print(f' {worst[0].ljust(15)}{str(-worst[1]).rjust(6)} ', end="|\n")
print('+' +'-'*23+'+'+'-'*23+ '+')

Avg length for positive reviews: 268.0598553345389
Avg length for negative reviews: 326.9461371055495
Avg no '!'/positive reviews: 0.740506329113924
Avg no '!'/negative reviews: 0.5955749002538991
Avg no '?'/positive reviews: 0.03833634719710669
Avg no '?'/negative reviews: 0.12005803409503082

+-----------------------+-----------------------+
| Best (pos-neg)        | Worst (neg-pos)       |
|-----------------------+-----------------------|
| great            1137 | would             613 |
| love              810 | money             552 |
| loves             470 | disappointed      490 |
| easy              401 | one               350 |
| perfect           380 | product           336 |
| well              304 | return            310 |
| little            266 | even              297 |
| best              256 | back              296 |
| nice              255 | broke             291 |
+-----------------------+-----------------------+


In [5]:
from collections import Counter
pos_counter_dict = Counter(token for sample in pos_tweets for token in set(sample.tokens))
neg_counter_dict = Counter(token for sample in neg_tweets for token in set(sample.tokens))

for dataset in [pos_tweets, neg_tweets]:
    ex, ques = 0,0
    for sample in dataset:
        ex += sample.exclamation_point_count
        ques += sample.question_mark_count

    print(ex, ques)

dif_list = []
seen = set()

for counter_dict in [pos_counter_dict, neg_counter_dict]:
    for k in sorted(counter_dict, key=counter_dict.get, reverse=True):
        if k in seen:
            continue
        seen.add(k)
        dif_list.append((k, pos_counter_dict[k] - neg_counter_dict[k]))

dif_list = sorted(dif_list, key=lambda x: x[1], reverse=True)


888 94
452 139


In [6]:
print(f"Avg length for positive reviews: {sum(s.length for s in pos_tweets)/len(pos_tweets)}")
print(f"Avg length for negative reviews: {sum(s.length for s in neg_tweets)/len(neg_tweets)}")
print(f"Avg no '!'/positive reviews: {sum(s.exclamation_point_count for s in pos_tweets)/len(pos_tweets)}")
print(f"Avg no '!'/negative reviews: {sum(s.exclamation_point_count for s in neg_tweets)/len(neg_tweets)}")
print(f"Avg no '?'/positive reviews: {sum(s.question_mark_count for s in pos_tweets)/len(pos_tweets)}")
print(f"Avg no '?'/negative reviews: {sum(s.question_mark_count for s in neg_tweets)/len(neg_tweets)}")

print("\nTweets:")

print('+' +'-'*23+'+'+'-'*23+ '+')
print("| Best (pos-neg)".ljust(23)+' |'+' Worst (neg-pos)'.ljust(23) + '|')
print('|' +'-'*23+'+'+'-'*23+ '|')
for best, worst in zip(dif_list[:10], dif_list[:-10:-1]):
    print(f'| {best[0].ljust(15)}{str(best[1]).rjust(6)} ', end="|")
    print(f' {worst[0].ljust(15)}{str(-worst[1]).rjust(6)} ', end="|\n")
print('+' +'-'*23+'+'+'-'*23+ '+')

Avg length for positive reviews: 68.71260199456029
Avg length for negative reviews: 69.62737262737262
Avg no '!'/positive reviews: 0.8050770625566637
Avg no '!'/negative reviews: 0.4515484515484515
Avg no '?'/positive reviews: 0.08522212148685404
Avg no '?'/negative reviews: 0.13886113886113885

+-----------------------+-----------------------+
| Best (pos-neg)        | Worst (neg-pos)       |
|-----------------------+-----------------------|
| love              124 | sad                54 |
| good              113 | miss               53 |
| day               100 | sorry              38 |
| happy              88 | bad                34 |
| thanks             73 | hate               33 |
| great              52 | feel               30 |
| nice               45 | im                 30 |
| mother             36 | work               30 |
| http               35 | sucks              24 |
+-----------------------+-----------------------+


In [7]:
all_words = [x[0] for x in dif_list]
tokwords = []

for word in neg_dataset[191].tokens:
    tokwords.append((word, all_words.index(word)))

print(sorted(tokwords, key=lambda x: x[1], reverse=True))

ValueError: '!usually' is not in list

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from itertools import combinations

all_samples = pos_dataset + neg_dataset
x_train = []
y_train = []

random.shuffle(all_samples)

none_value = int(len(all_words)/2)

for sample in all_samples:
    feat = []
    feat.append(float(sample.length))
    feat.append(float(sample.exclamation_point_count))
    feat.append(float(sample.question_mark_count))
    for token in sample.tokens:
        feat.append(all_words.index(token))

    for i in range(500-len(feat)):
        feat.append(none_value)

    x_train.append(np.array(feat))
    y_train.append(sample.target_class)

x_train = pd.DataFrame(x_train, columns=None)

In [None]:
print(x_train)


In [None]:
x_test = []
y_test = []

for sample in test_dataset:
    feat = []
    feat.append(float(sample.length))
    feat.append(float(sample.exclamation_point_count))
    feat.append(float(sample.question_mark_count))
    for token in sample.tokens:
        try:
            feat.append(all_words.index(token))
        except ValueError:
            feat.append(none_value)

    for i in range(500-len(feat)):
        feat.append(none_value)

    x_test.append(np.array(feat))
    y_test.append(sample.target_class)

x_test = pd.DataFrame(x_test, columns=None)

In [None]:
# Random forest classification

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(stop_words='english')

x_train_raw = [s.body.strip().lower() for s in all_samples]
x_test_raw = [s.body.strip().lower() for s in test_dataset]

from sklearn.feature_extraction.text import TfidfVectorizer# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
x_train = vectorizer.fit_transform(x_train_raw)
x_test = vectorizer.transform(x_test_raw)



#x_train = vec.fit_transform(x_train_raw).toarray()
#x_test = vec.transform(x_test_raw).toarray()

model = MultinomialNB()
model.fit(x_train, y_train)
#model.score(np.array(x_test), np.array(y_test))

rf = RandomForestClassifier(n_estimators=500 ,criterion='entropy', random_state=0, verbose=2, n_jobs=8)
mlp = MLPClassifier(hidden_layer_sizes=200, batch_size=64, max_iter=1000000, random_state=0, verbose=True)

#classifier = SVC(kernel='poly', C=1,gamma=0.001)

# Metrics for evaluation 
scoring = ['accuracy', 'f1', 'estimator']
# Cross validation with Random forest
#estimator = rf.fit(x_train, y_train)
 
rf = rf.fit(x_train, y_train)
mlp = mlp.fit(x_train, y_train)

# Mean values saved into variabels

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500building tree 4 of 500
building tree 5 of 500

building tree 6 of 500building tree 7 of 500

building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500building tree 35 of 500

building tree 36 of 500
building tree 37 of 500
building tree 38 of 500

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.5s



building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500
building tree 49 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 58 of 500
building tree 59 of 500
building tree 60 of 500
building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65 of 500
building tree 66 of 500
building tree 67 of 500
building tree 68 of 500
building tree 69 of 500
building tree 70 of 500
building tree 71 of 500
building tree 72 of 500
building tree 73 of 500
building tree 74 of 500
building tree 75 of 500
building tree 76 of 500
building tree 77 of 500building tree 78 of 500

building tree 79 of 500
building tree 8

[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    2.9s


building tree 161 of 500
building tree 162 of 500
building tree 163 of 500
building tree 164 of 500
building tree 165 of 500
building tree 166 of 500
building tree 167 of 500
building tree 168 of 500
building tree 169 of 500
building tree 170 of 500
building tree 171 of 500
building tree 172 of 500
building tree 173 of 500
building tree 174 of 500
building tree 175 of 500
building tree 176 of 500
building tree 177 of 500
building tree 178 of 500
building tree 179 of 500
building tree 180 of 500
building tree 181 of 500
building tree 182 of 500
building tree 183 of 500
building tree 184 of 500
building tree 185 of 500
building tree 186 of 500
building tree 187 of 500
building tree 188 of 500
building tree 189 of 500
building tree 190 of 500
building tree 191 of 500
building tree 192 of 500
building tree 193 of 500
building tree 194 of 500
building tree 195 of 500
building tree 196 of 500
building tree 197 of 500
building tree 198 of 500
building tree 199 of 500
building tree 200 of 500


[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    7.3s



building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500
building tree 382 of 500
building tree 383 of 500
building tree 384 of 500building tree 385 of 500

building tree 386 of 500
building tree 387 of 500
building tree 388 of 500
building tree 389 of 500
building tree 390 of 500
building tree 391 of 500
building tree 392 of 500
building tree 393 of 500
building tree 394 of 500
building tree 395 of 500
building tree 396 of 500
building tree 397 of 500
building tree 398 of 500
building tree 399 of 500
building tree 400 of 500
building tree 401 of 500
building tree 402 of 500
building tree 403 of 500

[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   10.4s finished


Iteration 1, loss = 0.42339344
Iteration 2, loss = 0.17082055
Iteration 3, loss = 0.10996346
Iteration 4, loss = 0.07787020
Iteration 5, loss = 0.05759813
Iteration 6, loss = 0.04272008
Iteration 7, loss = 0.03237276
Iteration 8, loss = 0.02503332
Iteration 9, loss = 0.01931230
Iteration 10, loss = 0.01533860
Iteration 11, loss = 0.01243167
Iteration 12, loss = 0.01040594
Iteration 13, loss = 0.00900792
Iteration 14, loss = 0.00787950
Iteration 15, loss = 0.00705893
Iteration 16, loss = 0.00644181
Iteration 17, loss = 0.00598512
Iteration 18, loss = 0.00563271
Iteration 19, loss = 0.00534189
Iteration 20, loss = 0.00509313
Iteration 21, loss = 0.00488912
Iteration 22, loss = 0.00470208
Iteration 23, loss = 0.00455330
Iteration 24, loss = 0.00440325
Iteration 25, loss = 0.00427390
Iteration 26, loss = 0.00414444
Iteration 27, loss = 0.00403350
Iteration 28, loss = 0.00392746
Iteration 29, loss = 0.00383388
Iteration 30, loss = 0.00372773
Iteration 31, loss = 0.00363631
Iteration 32, los

In [None]:
res = rf.predict(x_test)

tp, tn, fp, fn = 0,0,0,0

for x,y in zip(res, y_test):
    if x == 1:
        if y == 1:
            tp +=1
        else:
            fp += 1
    else:
        if y == 1:
            fn += 1
        else:
            tn +=1

print(tp, tn, fp, fn)

res = mlp.predict(x_test)

tp, tn, fp, fn = 0,0,0,0

for x,y in zip(res, y_test):
    if x == 1:
        if y == 1:
            tp +=1
        else:
            fp += 1
    else:
        if y == 1:
            fn += 1
        else:
            tn +=1

print(tp, tn, fp, fn)

res = model.predict(x_test)

tp, tn, fp, fn = 0,0,0,0

for x,y in zip(res, y_test):
    if x == 1:
        if y == 1:
            tp +=1
        else:
            fp += 1
    else:
        if y == 1:
            fn += 1
        else:
            tn +=1

print(tp, tn, fp, fn)

print(rf.score(x_test,y_test))
print(mlp.score(x_test,y_test))
print(model.score(x_test, y_test))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished


336 59 2390 892
181 215 2234 1047
465 34 2415 763


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished


0.10742453086755507
0.10769649170519445
0.13570845798205058
