<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/MyModel/pre-processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ask TensorFlow for the local devices it can see in this runtime.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

In [None]:
!cat /proc/cpuinfo

In [None]:
!cat /proc/meminfo

# MTL and BERT Embedding

In [1]:
# Mount Google Drive under /content/gdrive so the datasets built below can be saved there.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Folder on the mounted Drive that receives the CSV files written by this notebook.
rootdir = '/content/gdrive/MyDrive/Colab Notebooks/Polarity-Subjectivity-Detection/'

In [3]:
from nltk.sentiment.util import mark_negation
from typing import List, Dict, Tuple

def negative_marking(doc : List[List[str]]) -> str:
    '''Flatten a tokenized document and apply NLTK negation marking to its words.

        Params :
        -----------------
            doc : list[list[str]]
                document given as a list of sentences, each sentence being
                a list of word tokens
        Returns :
        -----------------
            negated_doc : str
                all the words of the document joined by single spaces, after
                `mark_negation` (with `double_neg_flip=True`) has been applied
    '''

    # mark_negation is fed a flat token list, so sentence boundaries are dropped here
    flat_doc = [w for sent in doc for w in sent]
    negated_doc = mark_negation(flat_doc, double_neg_flip=True)

    return " ".join(negated_doc)

In [14]:
from nltk.corpus import stopwords
import re
import string
from textblob import TextBlob as tb


## https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
# regular expressions are the usual tool for this kind of text clean-up, so the `re` module is used
# re.sub(pattern, repl, string, count=0, flags=0)

def pre_processing(text : str) -> str :
  '''Clean a text: lower-case it and strip English stop words, URLs, hashtags,
    @usernames and punctuation.

    Params :
    --------
      text : str
        raw text to be cleaned
    Returns :
    ---------
      str
        the remaining words joined by single spaces
  '''

  stop_list = set(stopwords.words("english"))
  words = text.lower().split()
  text = ' '.join(word for word in words if word not in stop_list)
  # remove http links
  text = re.sub(r'http\S+', '', text)
  # Remove hashtags
  text = re.sub(r'#\w*', '', text)
  # The removals above leave runs of whitespace behind. Collapse each run into
  # ONE space: replacing it with '' would glue the neighbouring words together
  # (e.g. 'check  now' -> 'checknow').
  text = re.sub(r'\s\s+', ' ', text)
  # Remove single space remaining at the front of the tweet.
  text = text.lstrip(' ')
  # Remove @username (raw string: '\s' is not a valid escape in a plain literal)
  text = re.sub(r'@[^\s]+', '', text)
  # Drop punctuation, then re-split so no empty or padded token survives
  words = text.translate(str.maketrans('', '', string.punctuation)).split()
  # Spelling correction through TextBlob (`tb(word).correct()`) is currently disabled.

  return ' '.join(words)

In [11]:
# `nltk` has to be imported here: when the notebook is run top to bottom on a
# fresh kernel, no earlier cell binds the name (`from nltk... import ...` does not).
import nltk

# Resources needed below: tokenizer model, stop-word list and the two corpora.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('movie_reviews')
nltk.download('subjectivity')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!


True

In [7]:
from textblob.en import subjectivity
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity


# Short aliases for the two NLTK corpus readers
mr, sub = movie_reviews, subjectivity

# Paragraph views of the negative and positive movie reviews
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

# Subjective / objective sentences, materialised as plain lists
subj_docs = list(subjectivity.sents(categories='subj'))
obj_docs = list(subjectivity.sents(categories='obj'))

# Sanity check on the size of each class
print(len(neg), len(pos))
print(len(subj_docs), len(obj_docs))

1000 1000
5000 5000


### Polarity Dataset Analysis

In [8]:
# Positive reviews first, then negative ones: the label cells below rely on this order.
new_corpus = [negative_marking(review) for group in (pos, neg) for review in group]

In [9]:
import pandas as pd

# One row per review, with the negation-marked text in the 'text' column.
data = pd.DataFrame(new_corpus, columns=['text'])

In [10]:
data

Unnamed: 0,text
0,films adapted from comic books have had plenty...
1,every now and then a movie comes along from a ...
2,you ' ve got mail works alot better than it de...
3,""" jaws "" is a rare film that grabs your attent..."
4,moviemaking is a lot like being the general ma...
...,...
1995,"if anything , "" stigmata "" should be taken as ..."
1996,"john boorman ' s "" zardoz "" is a goofy cinemat..."
1997,the kids in the hall are an acquired taste . i...
1998,there was a time when john carpenter was a gre...


In [15]:
pre_processing('it is a beautiful lif, https://github @matthew')

'beautiful lif'

In [16]:
# Overwrite the 'text' column with its cleaned version; from here on `data` holds pre-processed text.
data['text'] = data['text'].apply(pre_processing)

In [17]:
data

Unnamed: 0,text
0,films adapted comic books plenty success wheth...
1,every movie comes along suspect studio every i...
2,got mail works alot better deserves order make...
3,jaws rare film grabs attention shows single im...
4,moviemaking lot like general manager nfl team ...
...,...
1995,anything stigmata taken warning releasing simi...
1996,john boorman zardoz goofy cinematic debacle fu...
1997,kids hall acquired taste took least season wat...
1998,time john carpenter great horror director cour...


In [18]:
# One-hot labels: [1,0] for the first half of the rows, [0,1] for the second half.
# NOTE(review): `labels` is reassigned to scalar 1/0 values in the next code cell
# before anything reads it, so this one-hot version is not used by the cells below.
labels = [[1,0]] * (len(data['text'])//2) + [[0,1]] * (len(data['text'])//2)
len(labels)

2000

In [37]:
# Polarity labels aligned with `new_corpus` (positive reviews first, then the
# negative ones): 1 = positive, 0 = negative. They are built from the actual
# class sizes instead of halving the frame, so they stay correct even when the
# two classes do not contain the same number of reviews.
labels = [1] * len(pos) + [0] * len(neg)

In [38]:
len(labels[0:1000])

1000

In [40]:
len(labels[1000:])

1000

In [41]:
# Single-column frame holding the polarity labels.
df = pd.DataFrame(labels, columns=['labels'])

In [45]:
# Put the cleaned text and its label side by side (column-wise concatenation).
complete_data = pd.concat([data, df], axis=1)

In [46]:
complete_data

Unnamed: 0,text,labels
0,films adapted comic books plenty success wheth...,1
1,every movie comes along suspect studio every i...,1
2,got mail works alot better deserves order make...,1
3,jaws rare film grabs attention shows single im...,1
4,moviemaking lot like general manager nfl team ...,1
...,...,...
1995,anything stigmata taken warning releasing simi...,0
1996,john boorman zardoz goofy cinematic debacle fu...,0
1997,kids hall acquired taste took least season wat...,0
1998,time john carpenter great horror director cour...,0


In [47]:
# Save the polarity dataset on Drive. `to_csv` writes the index by default, so
# the file starts with an extra unnamed column holding the row numbers.
complete_data.to_csv(rootdir+'movie_rews.csv')

The percentiles below show that $99\%$ of the pre-processed reviews contain at most $947.15$ tokens.

In [48]:
import numpy as np
from nltk.tokenize import word_tokenize


# Number of tokens in every cleaned review, then the length found at a few percentiles.
seq_len = np.array([len(word_tokenize(sent)) for sent in complete_data['text']])
print([(p, np.percentile(seq_len, p)) for p in [65, 75, 80, 85, 90, 95, 99, 100]])

[(65, 418.0), (75, 463.25), (80, 494.20000000000005), (85, 535.0), (90, 608.1000000000001), (95, 720.05), (99, 947.1499999999999), (100, 1483.0)]


In [49]:
# Mean number of tokens per cleaned review.
avg_length = seq_len.mean()
avg_length

386.225

### Subjectivity Dataset Analysis

In [56]:
# Subjective sentences first (label 1), objective sentences after (label 0);
# each token list is turned back into a space-separated string.
subj_corpus = [" ".join(tokens) for group in (subj_docs, obj_docs) for tokens in group]
subj_labels = np.array([1] * len(subj_docs) + [0] * len(obj_docs))

In [57]:
# Wrap sentences and labels in single-column frames ...
subj_corpus_df = pd.DataFrame(subj_corpus, columns=['text'])
subj_labels_df = pd.DataFrame(subj_labels, columns=['labels'])

# ... and join them column-wise into the (text, labels) subjectivity dataset.
subj_obj_dataset = pd.concat([subj_corpus_df, subj_labels_df], axis=1)
subj_obj_dataset

Unnamed: 0,text,labels
0,"smart and alert , thirteen conversations about...",1
1,"color , musical bounce and warm seas lapping o...",1
2,it is not a mass-market entertainment but an u...,1
3,a light-hearted french film about the spiritua...,1
4,my wife is an actress has its moments in looki...,1
...,...,...
9995,"in the end , they discover that balance in lif...",0
9996,a counterfeit 1000 tomin bank note is passed i...,0
9997,enter the beautiful and mysterious secret agen...,0
9998,after listening to a missionary from china spe...,0


In [58]:
# Save the subjectivity dataset on Drive. `to_csv` writes the index by default,
# so the file starts with an extra unnamed column holding the row numbers.
subj_obj_dataset.to_csv(rootdir+'subj_obj_dataset.csv')

In [59]:
import numpy as np
from nltk.tokenize import word_tokenize


# Number of tokens in every sentence of the subjectivity dataset, then the
# length found at a few percentiles. NOTE: `seq_len` is reused here and no
# longer refers to the polarity dataset.
seq_len = np.array([len(word_tokenize(sent)) for sent in subj_obj_dataset['text']])
print([(p, np.percentile(seq_len, p)) for p in [65, 75, 80, 85, 90, 95, 99, 100]])

[(65, 27.0), (75, 30.0), (80, 32.0), (85, 35.0), (90, 38.0), (95, 43.0), (99, 56.0), (100, 122.0)]


In [60]:
# Mean number of tokens per sentence of the subjectivity dataset.
avg_length = seq_len.mean()
avg_length

24.6031

[Bert Embedding](https://www.youtube.com/watch?v=zJW57aCBCTk)

[Fine-Tuning Bert](https://www.youtube.com/watch?v=x66kkDnbzi4)

### Unused

In [None]:
def arrange_subjectivity(subjective_sents : List[Tuple[List[str], str]]) -> pd.DataFrame:
  '''Arrange subjectivity dataset into a table with format (index, text, label)
    Params:
    ------
      subjective_sents : list(tuple(list(str),str))
        subjectivity dataset, one (words, label) pair per sentence
    Returns:
    ------
      A dataframe organized as follows : (index, sentence text, label), with the
      sentence stored in column 'text' and its label in column 'tag'
  '''

  # One (space-joined sentence, label) record per input pair
  records = [(' '.join(words), label) for words, label in subjective_sents]

  return pd.DataFrame(records, columns=['text', 'tag'])

In [None]:
# `arrange_subjectivity` unpacks (words, label) pairs, whereas `subj_docs` and
# `obj_docs` are plain lists of tokenized sentences: attach the label explicitly.
docs = [(sent, 'subj') for sent in subj_docs] + [(sent, 'obj') for sent in obj_docs]
subj_obj_dataset = arrange_subjectivity(docs)
subj_obj_dataset

Unnamed: 0,text,tag
0,"smart and alert , thirteen conversations about...",subj
1,"color , musical bounce and warm seas lapping o...",subj
2,it is not a mass-market entertainment but an u...,subj
3,a light-hearted french film about the spiritua...,subj
4,my wife is an actress has its moments in looki...,subj
...,...,...
9995,"in the end , they discover that balance in lif...",obj
9996,a counterfeit 1000 tomin bank note is passed i...,obj
9997,enter the beautiful and mysterious secret agen...,obj
9998,after listening to a missionary from china spe...,obj
