<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/Baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Baseline #1:
The baseline is obtained using a Multinomial Naive Bayes classifier. 
The code is partly taken from the laboratory dedicated to SA, since I consider this part to be of relative importance.

In [223]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [224]:
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [225]:
nltk.download('subjectivity')

[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


True

In [226]:
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity

# polarity corpus, read paragraph by paragraph for each class
mr = movie_reviews
neg, pos = (mr.paras(categories=polarity) for polarity in ('neg', 'pos'))

# subjectivity corpus: materialise the tokenized sentences of each class as plain lists
sub = subjectivity
subj_docs = list(sub.sents(categories='subj'))
obj_docs = list(sub.sents(categories='obj'))
len(neg), len(pos)

(1000, 1000)

In [227]:
import numpy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report

vectorizer = CountVectorizer()
classifier = MultinomialNB()

In [228]:
# one whitespace-joined string per review: negative reviews first, positive ones after
pol_corpus = []
for reviews in (neg, pos):
    for review in reviews:
        pol_corpus.append(" ".join(token for sentence in review for token in sentence))

# one string per sentence: subjective sentences first, objective ones after
subj_corpus = [" ".join(tokens) for docs in (subj_docs, obj_docs) for tokens in docs]

# bag-of-words counts of the subjectivity sentences (sparse matrix)
vectors = vectorizer.fit_transform(subj_corpus)

# labels follow the order used above: 0 = subjective / negative, 1 = objective / positive
subj_labels = numpy.array(len(subj_docs) * [0] + len(obj_docs) * [1])
pol_labels = numpy.array(len(neg) * [0] + len(pos) * [1])


In [229]:
# clf trained on subjectivity dataset
classifier.fit(vectors, subj_labels)

MultinomialNB()

In [230]:
pol_vectors = vectorizer.transform(pol_corpus)

In [231]:
# predicting subjective and objective sentences in polarity dataset
predictions = classifier.predict(pol_vectors)

In [245]:
import pandas as pd

df_pol_corpus = pd.DataFrame(pol_corpus)
df_pol_labels = pd.DataFrame(pol_labels)
df_pol_pred = pd.DataFrame(predictions)

In [233]:
df_pol_corpus.rename(columns={0:'text'}, inplace=True)
#df_pol_corpus

In [234]:
df_pol_labels.rename(columns={0:'labels'}, inplace=True)
#df_pol_labels

In [235]:
df_pol_pred.rename(columns={0:'predictions'}, inplace=True)
#df_pol_pred

In [236]:
df_pol = pd.concat([df_pol_corpus, df_pol_labels, df_pol_pred], axis=1)

In [237]:
#df_pol

In [238]:
# filtering for subjective sentences
df_pol=df_pol.loc[df_pol['predictions'] == 0]

In [239]:
#df_pol

In [240]:
df_pol_list = df_pol.text.values.tolist()
df_pol_label = df_pol.labels.values.tolist()

In [241]:
# instantiating a new vectorizer and classifier
pol_vec = CountVectorizer()
pol_clf = MultinomialNB()

In [242]:
pol_vectors = pol_vec.fit_transform(df_pol_list)

In [248]:
# 10-fold cross-validation
scores = cross_validate(pol_clf, pol_vectors, df_pol_label, cv=StratifiedKFold(n_splits=10), scoring=['accuracy'])
average = sum(scores['test_accuracy'])/len(scores['test_accuracy'])
print(f"Baseline : {round(average,2)} ACC")

Baseline : 0.82 ACC


### Improving baseline adding double negation flipping

In [249]:
from nltk.sentiment.util import mark_negation
from sklearn.svm import SVC
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from typing import List, Dict


"""
def mark_negation(document, double_neg_flip=False, shallow=False):
    Append _NEG suffix to words that appear in the scope between a negation
    and a punctuation mark.

    :param document: a list of words/tokens, or a tuple (words, label).
    :param shallow: if True, the method will modify the original document in place.
    :param double_neg_flip: if True, double negation is considered affirmation
        (we activate/deactivate negation scope every time we find a negation).
    :return: if `shallow == True` the method will modify the original document
        and return it. If `shallow == False` the method will return a modified
        document, leaving the original unmodified.

    >>> sent = "I didn't like this movie . It was bad .".split()
    >>> mark_negation(sent)
    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
"""

def negative_marking(doc: List[List[str]]) -> str:
    '''
        Flatten a tokenized document and mark the tokens inside a negation scope.

        Params :
        -----------------
            doc : list[list[str]]
                document given as a list of sentences, each sentence being a
                list of tokens
        Returns :
        -----------------
            negated_doc : str
                the tokens produced by `mark_negation` (called with
                `double_neg_flip=True`), joined by single spaces
    '''

    # mark_negation works on a flat token list, so sentence boundaries are dropped here
    flat_doc = [w for sent in doc for w in sent]
    negated_doc = mark_negation(flat_doc, double_neg_flip=True)

    return " ".join(negated_doc)

print(negative_marking(neg[0]))

plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what ' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . which is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn ' t snag this one correctly . they seem to have taken this pretty neat concept , but executed it terribly . so what are the problems with the movie ? well , its main problem is that it ' s simply too jumbled . it starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience member , have no idea

In [252]:
neg_vectorizer = CountVectorizer()
neg_classifier = MultinomialNB()

In [254]:
# Same order as pol_corpus and pol_labels (negative reviews first, then positive):
# the subjectivity predictions are computed on pol_corpus, so building this list as
# pos + neg would pair every text with the label and prediction of a different review.
neg_pol_corpus = [negative_marking(d) for d in neg] + [negative_marking(d) for d in pos]

In [255]:
# building sparse matrix with count vectors
neg_vectors = neg_vectorizer.fit_transform(subj_corpus)

subj_labels = numpy.array([0] * len(subj_docs) + [1] * len(obj_docs))
pol_labels = numpy.array([0] * len(neg) + [1] * len(pos))

In [256]:
# clf trained on subjectivity dataset
neg_classifier.fit(neg_vectors, subj_labels)

MultinomialNB()

In [257]:
neg_pol_vectors = neg_vectorizer.transform(pol_corpus)

In [258]:
# predicting subjective and objective sentences in polarity dataset
neg_predictions = neg_classifier.predict(neg_pol_vectors)

In [259]:
import pandas as pd

df_pol_corpus = pd.DataFrame(neg_pol_corpus)
df_pol_labels = pd.DataFrame(pol_labels)
df_pol_pred = pd.DataFrame(neg_predictions)

In [260]:
df_pol_corpus.rename(columns={0:'text'}, inplace=True)

In [261]:
df_pol_labels.rename(columns={0:'labels'}, inplace=True)

In [262]:
df_pol_pred.rename(columns={0:'predictions'}, inplace=True)

In [263]:
df_pol = pd.concat([df_pol_corpus, df_pol_labels, df_pol_pred], axis=1)

In [264]:
df_pol=df_pol.loc[df_pol['predictions'] == 0]

In [265]:
df_pol_list = df_pol.text.values.tolist()
df_pol_label = df_pol.labels.values.tolist()

In [266]:
# instantiating a new vectorizer and classifier
neg_pol_vec = CountVectorizer()
neg_pol_clf = MultinomialNB()

In [267]:
neg_pol_vectors = neg_pol_vec.fit_transform(df_pol_list)

In [268]:
# 10-fold cross-validation
scores = cross_validate(neg_pol_clf, neg_pol_vectors, df_pol_label, cv=StratifiedKFold(n_splits=10), scoring=['accuracy'])
average = sum(scores['test_accuracy'])/len(scores['test_accuracy'])
print(f"Baseline : {round(average,2)} ACC")

Baseline : 0.82 ACC


# Baseline #2 :
## Exploring movie_reviews pack

[Source](https://medium.com/@joel_34096/sentiment-analysis-of-movie-reviews-in-nltk-python-4af4b76a6f3)


In [None]:
import nltk
from nltk.corpus import movie_reviews

In [None]:
# A list of all the words in 'movie_reviews'
words = movie_reviews.words()
words

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [None]:
# Prints total number of words in 'movie_reviews'
len(movie_reviews.words())

1583820

In [None]:
# Prints the polarity labels in the package
categories = movie_reviews.categories()
categories

['neg', 'pos']

In [None]:
# Displays frequency of words in ‘movie_reviews’
freq_dist_words = nltk.FreqDist(movie_reviews.words())

In [None]:
# Displays frequency of 15 most common words in ‘movie_reviews’
freq_dist_most_common = nltk.FreqDist(movie_reviews.words()).most_common(15)
freq_dist_most_common

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [None]:
# Prints all file ids
all_fileids = movie_reviews.fileids()
print(all_fileids[:50])

['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt', 'neg/cv010_29063.txt', 'neg/cv011_13044.txt', 'neg/cv012_29411.txt', 'neg/cv013_10494.txt', 'neg/cv014_15600.txt', 'neg/cv015_29356.txt', 'neg/cv016_4348.txt', 'neg/cv017_23487.txt', 'neg/cv018_21672.txt', 'neg/cv019_16117.txt', 'neg/cv020_9234.txt', 'neg/cv021_17313.txt', 'neg/cv022_14227.txt', 'neg/cv023_13847.txt', 'neg/cv024_7033.txt', 'neg/cv025_29825.txt', 'neg/cv026_29229.txt', 'neg/cv027_26270.txt', 'neg/cv028_26964.txt', 'neg/cv029_19943.txt', 'neg/cv030_22893.txt', 'neg/cv031_19540.txt', 'neg/cv032_23718.txt', 'neg/cv033_25680.txt', 'neg/cv034_29446.txt', 'neg/cv035_3343.txt', 'neg/cv036_18385.txt', 'neg/cv037_19798.txt', 'neg/cv038_9781.txt', 'neg/cv039_5963.txt', 'neg/cv040_8829.txt', 'neg/cv041_22364.txt', 'neg/cv042_11927.txt', 'neg/cv043_16808.t

In [None]:
# Prints file ids of positive reviews
pos_fileids = movie_reviews.fileids('pos')
print(pos_fileids[:10])

['pos/cv000_29590.txt', 'pos/cv001_18431.txt', 'pos/cv002_15918.txt', 'pos/cv003_11664.txt', 'pos/cv004_11636.txt', 'pos/cv005_29443.txt', 'pos/cv006_15448.txt', 'pos/cv007_4968.txt', 'pos/cv008_29435.txt', 'pos/cv009_29592.txt']


In [None]:
# Prints file ids of negative reviews.
neg_fileids = movie_reviews.fileids('neg')
print(neg_fileids[:10])

['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


In [None]:
# Prints all words in movie_review with file id ‘neg/cv001_19502.txt’
movie_reviews.words('neg/cv001_19502.txt')[:10]

['the',
 'happy',
 'bastard',
 "'",
 's',
 'quick',
 'movie',
 'review',
 'damn',
 'that']

[Here](https://www.nltk.org/_modules/nltk/classify/naivebayes.html) it is specified that `NaiveBayesClassifier.train` expects as input a list of tuples of the form `(featureset, label)`:

```python
 @classmethod
    def train(cls, labeled_featuresets, estimator=ELEProbDist):
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
```

where `featureset` is a dictionary mapping feature names (here, words) to feature values.

In [None]:
# it is needed to build such structure
neg_doc = [(movie_reviews.words(mr_fileid), 'neg') for mr_fileid in neg_fileids]
neg_doc[:10]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg'),
 (['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...], 'neg'),
 (['so', 'ask', 'yourself', 'what', '"', '8mm', '"', '(', ...], 'neg'),
 (['that', "'", 's', 'exactly', 'how', 'long', 'the', ...], 'neg'),
 (['call', 'it', 'a', 'road', 'trip', 'for', 'the', ...], 'neg'),
 (['plot', ':', 'a', 'young', 'french', 'boy', 'sees', ...], 'neg')]

In [None]:
pos_doc = [(movie_reviews.words(mr_fileid), 'pos') for mr_fileid in pos_fileids]
pos_doc[:10]

[(['films', 'adapted', 'from', 'comic', 'books', 'have', ...], 'pos'),
 (['every', 'now', 'and', 'then', 'a', 'movie', 'comes', ...], 'pos'),
 (['you', "'", 've', 'got', 'mail', 'works', 'alot', ...], 'pos'),
 (['"', 'jaws', '"', 'is', 'a', 'rare', 'film', 'that', ...], 'pos'),
 (['moviemaking', 'is', 'a', 'lot', 'like', 'being', ...], 'pos'),
 (['on', 'june', '30', ',', '1960', ',', 'a', 'self', ...], 'pos'),
 (['apparently', ',', 'director', 'tony', 'kaye', 'had', ...], 'pos'),
 (['one', 'of', 'my', 'colleagues', 'was', 'surprised', ...], 'pos'),
 (['after', 'bloody', 'clashes', 'and', 'independence', ...], 'pos'),
 (['the', 'american', 'action', 'film', 'has', 'been', ...], 'pos')]

In [None]:
doc = pos_doc + neg_doc
doc[:10]

[(['films', 'adapted', 'from', 'comic', 'books', 'have', ...], 'pos'),
 (['every', 'now', 'and', 'then', 'a', 'movie', 'comes', ...], 'pos'),
 (['you', "'", 've', 'got', 'mail', 'works', 'alot', ...], 'pos'),
 (['"', 'jaws', '"', 'is', 'a', 'rare', 'film', 'that', ...], 'pos'),
 (['moviemaking', 'is', 'a', 'lot', 'like', 'being', ...], 'pos'),
 (['on', 'june', '30', ',', '1960', ',', 'a', 'self', ...], 'pos'),
 (['apparently', ',', 'director', 'tony', 'kaye', 'had', ...], 'pos'),
 (['one', 'of', 'my', 'colleagues', 'was', 'surprised', ...], 'pos'),
 (['after', 'bloody', 'clashes', 'and', 'independence', ...], 'pos'),
 (['the', 'american', 'action', 'film', 'has', 'been', ...], 'pos')]

In [None]:
pos_iter = (idx for idx, item in enumerate(doc) if item[1]=='pos')
idx_pos = next(pos_iter)  # <== index of first positive tagged element
print(f"First sentence index with positive category : {idx_pos}")

First sentence index with positive category : 0


In [None]:
neg_iter = (idx for idx, item in enumerate(doc) if item[1]=='neg')
idx_neg = next(neg_iter)  # <== index of first negative tagged element
print(f"First sentence index with negative category : {idx_neg}")

First sentence index with negative category : 1000


To use NaiveBayesClassifier, as mentioned before, each featureset must be a dictionary of the form `feature_name : True/False`, indicating whether a certain feature is present or not in a certain sentence (as pointed out [here](https://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis)).

In [None]:
from typing import Dict, List

def arrangeInput(lw: List[str], all_words: List[str]) -> Dict:
  ''' Build a presence/absence feature dictionary for one sentence.

      Params :
      --------
        lw : list(str)
             words appearing in the sentence
        all_words : list(str)
              every word that may be used as a feature (duplicates allowed)

      Returns :
      --------
        dictionary {word : True/False} with one entry per distinct word of
        `all_words`; the value is True when the word also occurs in `lw`
  '''

  vocabulary = set(all_words)  # each distinct word is visited only once
  present = set(lw)            # set membership keeps the lookups below cheap
  return {word: word in present for word in vocabulary}

In [None]:
# sanity check of arrangeInput: at least one word must be flagged as present for the first document
first_doc_flags = arrangeInput(doc[0][0], words).values()
if any(flag == True for flag in first_doc_flags):
  print('Done')

Done


In [None]:
# to shorten output just consider the first sentence of the first doc
counter = 0
for (key, value) in arrangeInput(doc[0][0], words).items():
  print(key,value)
  counter +=1
  if counter == 10:
    break

0009f False
blabbed False
guarding False
emitted False
shih False
mano False
populous False
handphones False
interrelated False
pseudoerotic False


In [None]:
# to display whole dictionary for first sentence of first doc uncomment & run following line
# arrangeInput(doc[0][0],words)

`NB_doc` will be fed into a NaiveBayesClassifier. The data structure needed to work with NBC is the following [[found here](https://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis)]:
```python
  [({'this': True, 'love': True, 'deal': False, 'tired': False, 'feel': False, 'is': False, 'am': False, 'an': False, 'sandwich': True, 'ca': False, 'best': False, '!': False, 'what': False, '.': True, 'amazing': False, 'horrible': False, 'sworn': False, 'awesome': False, 'do': False, 'good': False, 'very': False, 'boss': False, 'beers': False, 'not': False, 'with': False, 'he': False, 'enemy': False, 'about': False, 'like': False, 'restaurant': False, 'these': False, 'of': False, 'work': False, "n't": False, 'i': False, 'stuff': False, 'place': False, 'my': False, 'view': False}, 'pos'), . . .]
```


In [None]:
# Turn every (list of words, category) tuple of doc into a (feature dictionary, category) tuple.
# To save computation only the first 800,000 corpus tokens are used as candidate features;
# their vocabulary is built once here instead of re-slicing the corpus for every document.
nb_vocabulary = set(words[:800000])
NB_doc = [(arrangeInput(sent, nb_vocabulary), cat) for (sent, cat) in doc]

`arranged_doc` is built to be fed to an SVC.

In [None]:
# here all the possible words are considered; the vocabulary is computed once,
# so the whole corpus is not scanned again for every document
full_vocabulary = set(words)
arranged_doc = [(arrangeInput(lw, full_vocabulary), cat) for (lw, cat) in doc]

In [None]:
# saving the list to avoid running previous cell (time demanding)
import json
from typing import List, Dict, Tuple

def save_arrangedDoc(arranged_doc : List[Tuple[Dict, str]], filename : str) -> None:
  ''' Serialize an arranged document to a JSON file.

    Params :
    --------
      arranged_doc : list(tuple(dict,str))
        document arranged as a list of ({word: value}, category) tuples.
      filename : str
        path of the file to write; it is opened in "w" mode.

    Returns :
    ---------
      None
  '''

  with open(filename, "w") as out_file:
    out_file.write(json.dumps(arranged_doc))

def load_arrangedDoc(arranged_doc : str):
  ''' Load a document previously written as JSON (e.g. by save_arrangedDoc).

    Params :
    --------
      arranged_doc : str
        path to the document to load

    Returns :
    ---------
      the deserialized content of the file. JSON has no tuple type, so
      (features, category) tuples that were saved come back as two-element lists.
  '''

  with open(arranged_doc, "r") as fp:
    # the parsed object used to be bound to a local and dropped, so callers received None
    return json.load(fp)
  

NaiveBayesClassifier

In [None]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection
from nltk.classify import NaiveBayesClassifier

# Splitting into training and testing sets.
# The split is seeded so that the reported accuracy is the same on every run
# (an unseeded split shuffles the documents differently at each execution).
train_set, test_set = model_selection.train_test_split(NB_doc, test_size = 0.2, random_state = 42)
clf = NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(clf, test_set)

print(f"NBC : {round(accuracy,2)} ACC")

NBC : 0.79 ACC


In [None]:
clf.show_most_informative_features()

Most Informative Features
               insulting = True              neg : pos    =     13.8 : 1.0
                  seagal = True              neg : pos    =     12.4 : 1.0
             outstanding = True              pos : neg    =     11.6 : 1.0
               stupidity = True              neg : pos    =     10.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                   kudos = True              pos : neg    =     10.3 : 1.0
               maintains = True              pos : neg    =     10.3 : 1.0
                  hudson = True              neg : pos    =      9.7 : 1.0
               laughably = True              neg : pos    =      9.7 : 1.0
              schumacher = True              neg : pos    =      9.7 : 1.0


# Baseline #3 

In [None]:
import nltk


nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import movie_reviews


mr = movie_reviews
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

In [None]:
from nltk.corpus import stopwords


print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
words = movie_reviews.words()
categoris = movie_reviews.categories()

In [None]:
neg_doc = [(movie_reviews.words(mr_fileid), 'neg') for mr_fileid in movie_reviews.fileids('neg')]
pos_doc = [(movie_reviews.words(mr_fileid), 'pos') for mr_fileid in movie_reviews.fileids('pos')]

In [None]:
doc = neg_doc + pos_doc

In [None]:
doc[:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [None]:
from textblob import TextBlob as tb
import string
from typing import List, Dict, Tuple


def prepare_data(cw : List[str]) -> List[str]:
  '''Clear text from stop words (very common words), punctuation and numbers,
    then correct possible misspelled words

    Params :
    --------
      cw : List[str]
        list of words in the sentence to be cleaned
    Returns :
    ---------
      list of cleaned words; tokens left empty by the cleaning (e.g. bare
      punctuation marks or numbers) are discarded
  '''

  stop_list = set(stopwords.words("english"))
  # one table removing punctuation and digits, built once instead of twice per word
  strip_table = str.maketrans('', '', string.punctuation + '1234567890')

  stripped = (word.translate(strip_table) for word in cw if word not in stop_list)
  # a token made only of punctuation/digits becomes '' after stripping: drop it,
  # otherwise empty strings end up in the cleaned sentence
  kept = [word for word in stripped if word]

  # spelling correction is applied word by word through TextBlob
  return [str(tb(word).correct()) for word in kept]


In [None]:
def clean_doc(doc : List[Tuple[List[str], str]]) -> List[Tuple[List[str],str]]:
  '''Run prepare_data on the word list of every (words, category) pair of doc,
    leaving the category untouched.'''
  cleaned = []
  for word_list, category in doc:
    cleaned.append((prepare_data(word_list), category))
  return cleaned

In [None]:
# saving the list to avoid running previous cell (time demanding)
import json
from typing import List, Dict, Tuple

def save_arrangedDoc(arranged_doc : List[Tuple[List[str], str]], filename : str) -> None:
  ''' Serialize a document to a JSON file.

    Params :
    --------
      arranged_doc : list(tuple(list(str),str))
        document arranged as a list of (list of words, category) tuples.
      filename : str
        path of the file to write; it is opened in "w" mode.

    Returns :
    ---------
      None
  '''

  with open(filename, "w") as out_file:
    out_file.write(json.dumps(arranged_doc))

def load_arrangedDoc(arranged_doc : str):
  ''' Load a document previously written as JSON (e.g. by save_arrangedDoc).

    Params :
    --------
      arranged_doc : str
        path to the document to load

    Returns :
    ---------
      the deserialized content of the file. JSON has no tuple type, so
      (words, category) tuples that were saved come back as two-element lists.
  '''

  with open(arranged_doc, "r") as fp:
    # the parsed object used to be bound to a local and dropped, so callers received None
    return json.load(fp)
  

In [None]:
cleaned_doc = clean_doc(doc[:5])

In [None]:
print(cleaned_doc)

In [None]:
# complete document
cleaned_doc_complete = clean_doc(doc)

In [None]:
save_arrangedDoc(cleaned_doc_complete, 'cleaned_doc')

In [None]:
nltk.download('wordnet')

In [None]:
nltk.download('omw-1.4')

In [None]:
def lemmatize_sent(lw : List[str], pos : str = 'n') -> List[str]:
  '''Reduce every word of a sentence to its WordNet lemma.

    Params :
    --------
      lw : List[str]
        list of words in the sentence
      pos : str
        part-of-speech tag handed to WordNetLemmatizer.lemmatize. The default
        'n' (noun) is the tag the lemmatizer uses when none is given, so
        existing calls behave as before. Pass 'v' to bring verbs to their
        base form (studying -> study), which the noun default does not do.
    Returns :
    ---------
      list of lemmatized words
  '''

  lemmatizer = nltk.stem.WordNetLemmatizer()
  return [lemmatizer.lemmatize(w, pos) for w in lw]

In [None]:
def lemmatize_doc(doc : List[Tuple[List[str],str]]) -> List[Tuple[List[str],str]]:
  '''Run lemmatize_sent on the word list of every (words, category) pair of doc,
    leaving the category untouched.'''
  lemmatized = []
  for word_list, category in doc:
    lemmatized.append((lemmatize_sent(word_list), category))
  return lemmatized

In [None]:
lemm_doc = lemmatize_doc(cleaned_doc)

In [None]:
print(lemm_doc[0])

In [None]:
# lemmatize complete doc
lemm_doc_complete = lemmatize_doc(cleaned_doc_complete)

In [None]:
save_arrangedDoc(lemm_doc_complete, 'lemmatized_doc')

## Not used

In [None]:
!pip install scattertext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scattertext
  Downloading scattertext-0.1.6-py3-none-any.whl (7.3 MB)
[K     |████████████████████████████████| 7.3 MB 3.7 MB/s 
Collecting gensim>=4.0.0
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 50.1 MB/s 
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Collecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9309 sha256=0bcefccd8a008cfc5eed2f2f623420a7ba2f2409abba3ce0e153eb68c3641207
  Stored in directory: /root/.cache/pip/wheels/cb/19/58/4e8fdd0009a7f89dbce3c18fff2e0d0fa201d5cdfd16f113b7
Successfully built flashtext
Installing collected packages: mock, gensi

In [None]:
import scattertext as st
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import produce_scattertext_explorer
import pandas as pd

# Draft visualisation cell (this section is titled "Not used"); the code is kept as written.
# The corpus builder is fed a pandas object, hence the conversion attempted below.
# NOTE(review): `pd.from_text` does not look like a pandas function, and lemm_doc is a list of
#   (words, category) tuples with no 'label' column - confirm how the frame should be built.
corpus = st.CorpusFromParsedDocuments(pd.from_text(lemm_doc), category_col='label').build()
# NOTE(review): category='Positive' is combined with category_name='Negative' and
#   not_category_name='Positive' - the display names look swapped, and the categories
#   used elsewhere in this notebook are 'pos' / 'neg'. Confirm the intended values.
html = st.produce_scattertext_explorer(corpus, category='Positive', category_name='Negative', not_category_name = 'Positive', minimum_term_frequency=1, width_in_pixel=1000, transform=st.Scalers.log_scale_standardize)
file_name = 'Sentimental Words Visualization.html'
# NOTE(review): `HTML` is the class imported from IPython.core.display, while the explorer
#   markup is stored in `html` above - this line probably meant html.encode('utf-8').
#   The file object returned by open() is also never closed explicitly.
open(file_name,'wb').write(HTML.encode('utf-8'))
# show the written file inline
IFrame(src=file_name, width=1000, height=700)

##Used

In [None]:
([[" ".join(sent)] for (sent,cat) in lemm_doc])[:2]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform([" ".join(sent) for (sent,cat) in lemm_doc])

In [None]:
def encode_labels(doc : List[Tuple[List,str]]) -> List[int]:
  '''Map the category of every (words, category) pair to an integer:
    1 for 'pos', 0 for any other category.'''
  encoded = []
  for _, category in doc:
    encoded.append(int(category == 'pos'))
  return encoded

print(encode_labels(doc))

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# The labels are encoded from lemm_doc, the same list the TF-IDF features were computed
# from, so X and y keep the same length and order even when lemm_doc is swapped for a
# larger document set. The split is seeded so the printed metrics are repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, encode_labels(lemm_doc), test_size = 0.2, random_state = 42)
clf = lgb.LGBMClassifier(max_depth=20, n_estimators=25, min_child_weight=0.0016, n_jobs=-1)

clf.fit(X_train, y_train)

pred = clf.predict(X_test)
print(f'Test data accuracy is : {accuracy_score(y_test, pred)}')
print(classification_report(y_test, pred))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


cm = confusion_matrix(y_test, pred)

#fig, ax = plt.subplots(figsize=(8,6))
#sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


df_cm = pd.DataFrame(cm, index = list(set(encode_labels(lemm_doc))), columns = list(set(encode_labels(lemm_doc))))
# plt.figure(figsize=(10,7))
sns.set(font_scale=1.4) # for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size

# MLT and BERT Embedding [source](https://arxiv.org/pdf/2201.05363.pdf)

In [None]:
from nltk.sentiment.util import mark_negation
from typing import List, Dict

def negative_marking(doc: List[List[str]]) -> str:
    '''
        Flatten a tokenized document and mark the tokens inside a negation scope.

        Params :
        -----------------
            doc : list[list[str]]
                document given as a list of sentences, each sentence being a
                list of tokens
        Returns :
        -----------------
            negated_doc : str
                the tokens produced by `mark_negation` (called with
                `double_neg_flip=True`), joined by single spaces
    '''

    # mark_negation works on a flat token list, so sentence boundaries are dropped here
    flat_doc = [w for sent in doc for w in sent]
    negated_doc = mark_negation(flat_doc, double_neg_flip=True)

    return " ".join(negated_doc)

In [None]:
from textblob.en import subjectivity
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity

nltk.download('punkt')
nltk.download('movie_reviews')
nltk.download('subjectivity')


mr = movie_reviews
sub = subjectivity
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')


subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')]
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')]

print(len(neg), len(pos))
print(len(subj_docs), len(obj_docs))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


1000 1000
5000 5000


In [None]:
new_corpus = [negative_marking(d) for d in pos] + [negative_marking(d) for d in neg]

In [None]:
import pandas as pd

data = pd.DataFrame(new_corpus, columns=['text'])

In [None]:
print(data)

                                                   text
0     films adapted from comic books have had plenty...
1     every now and then a movie comes along from a ...
2     you ' ve got mail works alot better than it de...
3     " jaws " is a rare film that grabs your attent...
4     moviemaking is a lot like being the general ma...
...                                                 ...
1995  if anything , " stigmata " should be taken as ...
1996  john boorman ' s " zardoz " is a goofy cinemat...
1997  the kids in the hall are an acquired taste . i...
1998  there was a time when john carpenter was a gre...
1999  two party guys bob their heads to haddaway ' s...

[2000 rows x 1 columns]


In [None]:
from nltk.corpus import stopwords
import re
import string
from textblob import TextBlob as tb



nltk.download('stopwords')

## https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
# since re are largely used for this type of applications, the regex module is used
# re.sub(pattern, repl, string, count=0, flags=0)

def pre_processing(text : str) -> str :
  '''Clean a raw text: lowercase it, drop stop words (very common words), links,
    hashtags, @mentions and punctuation, then correct possible misspelled words

    Params :
    --------
      text : str
        text to be cleaned
    Returns :
    ---------
      the cleaned text, with the remaining words separated by single spaces
  '''

  stop_list = set(stopwords.words("english"))
  text = list(text.lower().split())
  text = ' '.join([word for word in text if word not in stop_list])
  # remove http links
  text = re.sub(r'http\S+', '', text)
  # Remove hashtags
  text = re.sub(r'#\w*', '', text)
  # Collapse runs of whitespace (including new line characters) into one space.
  # Replacing them with '' glued together the words on either side of a removed link or hashtag.
  text = re.sub(r'\s\s+', ' ', text)
  # Remove single space remaining at the front of the tweet.
  text = text.lstrip(' ')
  # Remove @username (raw string, so that \s reaches the regex engine untouched)
  text = re.sub(r'@[^\s]+','', text)
  text = list(text.translate(str.maketrans('', '', string.punctuation)).split())
  # correction of possible miss-click
  text = ' '.join([str(tb(word).correct()) for word in text])

  return text



pre_processing('it is a beautiful lif, https://github @matthew')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'beautiful if'

In [None]:
data['text'] = data['text'].apply(pre_processing)
print(data)

In [None]:
data

In [None]:
# data holds the positive reviews first and the negative ones after them, in equal numbers,
# so the first half is labelled [1, 0] (pos) and the second half [0, 1] (neg).
# The halving must be applied to the length: `data['text'] // 2` floor-divides the
# text column itself instead of its size.
half = len(data['text']) // 2
labels = [[1,0]] * half + [[0,1]] * half
len(labels)

In [None]:
df = pd.DataFrame(labels, columns=['pos', 'neg'])

In [None]:
df

In [None]:
# put the cleaned texts side by side with their one-hot polarity columns ('pos', 'neg')
complete_data = pd.concat([data, df], axis=1)