<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/src/MyModel/pre-processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# List the compute devices that TensorFlow can see in this runtime.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 16111409337325406474
 xla_global_id: -1]

In [None]:
!cat /proc/cpuinfo

In [None]:
!cat /proc/meminfo

# MTL and BERT Embedding

In [1]:
# Mount Google Drive under /content/gdrive (Colab only); the datasets built
# in this notebook are written below that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Project folder on the mounted Drive. The trailing slash is required: later cells
# build file paths by plain string concatenation (rootdir + 'Datasets/...').
rootdir = '/content/gdrive/MyDrive/Colab Notebooks/Polarity-Subjectivity-Detection/'

In [3]:
from nltk.sentiment.util import mark_negation
from typing import List, Dict, Tuple

def negative_marking(doc : List[List[str]]) -> str:
    '''Flatten a tokenized document and run nltk's `mark_negation` on it.

        Params :
        -----------------
            doc : list[list[str]]
                document given as a list of sentences, each sentence being
                a list of word tokens
        Returns :
        -----------------
            negated_doc : str
                the tokens returned by `mark_negation` (called with
                `double_neg_flip=True`), joined by single spaces
    '''

    # mark_negation is fed one flat token sequence, so sentence boundaries are dropped first
    flat_doc = [w for sent in doc for w in sent]
    negated_doc = mark_negation(flat_doc, double_neg_flip=True)

    return " ".join(negated_doc)

In [4]:
from nltk.corpus import stopwords
import re
import string
from textblob import TextBlob as tb


## Reference for the URL removal below:
## https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
# Regular expressions are the usual tool for this kind of text clean-up, so the `re` module is used.
# Signature reminder: re.sub(pattern, repl, string, count=0, flags=0)

def pre_processing(text : str) -> str :
  '''Clean a raw text: lower-case it and strip English stop words, http links,
    hashtags, @mentions and punctuation.

    Digits are left untouched and the spelling-correction step is currently
    disabled (see the commented-out line at the end of the body).

    Params :
    --------
      text : str
        raw text to be cleaned
    Returns :
    ---------
      str
        the surviving tokens joined by single spaces
  '''

  stop_list = set(stopwords.words("english"))
  # stop words are matched on lower-cased, whitespace-separated tokens,
  # i.e. before any punctuation is stripped from them
  text = ' '.join([word for word in text.lower().split() if word not in stop_list])
  # remove http links
  text = re.sub(r'http\S+', '', text)
  # Remove hashtags
  text = re.sub(r'#\w*', '', text)
  # Collapse the whitespace runs left behind by the removals above into ONE space;
  # substituting '' here would glue the two neighbouring words together
  text = re.sub(r'\s\s+', ' ', text)
  # Remove single space remaining at the front of the tweet.
  text = text.lstrip(' ')
  # Remove @username (raw string, so that \s reaches the regex engine untouched)
  text = re.sub(r'@[^\s]+', '', text)
  # drop punctuation, then re-split so that leftover spaces do not survive
  tokens = text.translate(str.maketrans('', '', string.punctuation)).split()
  # correction of possible miss-click (disabled)
  #tokens = [str(tb(word).correct()) for word in tokens]

  return ' '.join(tokens)

In [5]:
import nltk

In [6]:
# Fetch the NLTK resources used in this notebook: the 'punkt' tokenizer models,
# the stop-word lists and the two corpora (movie_reviews, subjectivity).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('movie_reviews')
nltk.download('subjectivity')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package subjectivity to /root/nltk_data...
[nltk_data]   Unzipping corpora/subjectivity.zip.


True

In [7]:
from textblob.en import subjectivity
from nltk.corpus import movie_reviews
# NOTE(review): the import below rebinds `subjectivity`, so the textblob object
# imported two lines above is never used.
from nltk.corpus import subjectivity


# Short aliases for the two NLTK corpus readers.
mr, sub = movie_reviews, subjectivity

# Polarity corpus, split by category.
neg = mr.paras(categories='neg')
pos = mr.paras(categories='pos')

# Subjectivity corpus, materialised as plain lists of tokenized sentences.
subj_docs = list(subjectivity.sents(categories='subj'))
obj_docs = list(subjectivity.sents(categories='obj'))

# Size check: entries per class in each corpus.
print(len(neg), len(pos))
print(len(subj_docs), len(obj_docs))

1000 1000
5000 5000


### Polarity Dataset Analysis

In [8]:
def lol2str(doc):
    """Flatten a list of token lists and join every token into one space-separated string."""
    words = []
    for sentence in doc:
        words.extend(sentence)
    return " ".join(words)

In [9]:
# Negative reviews come first, then the positive ones; each review is flattened
# into a single space-separated string by lol2str.
# Disabled alternative that goes through negative_marking instead:
#   [negative_marking(d) for d in pos] + [negative_marking(d) for d in neg]
new_corpus = list(map(lol2str, neg)) + list(map(lol2str, pos))

In [10]:
import pandas as pd

# One row per review, with the text in the single 'text' column.
data = pd.DataFrame({'text': new_corpus})

In [11]:
data

Unnamed: 0,text
0,"plot : two teen couples go to a church party ,..."
1,the happy bastard ' s quick movie review damn ...
2,it is movies like these that make a jaded movi...
3,""" quest for camelot "" is warner bros . ' first..."
4,synopsis : a mentally unstable man undergoing ...
...,...
1995,wow ! what a movie . it ' s everything a movie...
1996,"richard gere can be a commanding actor , but h..."
1997,"glory -- starring matthew broderick , denzel w..."
1998,steven spielberg ' s second epic film on world...


In [12]:
pre_processing('it is a beautiful lif, https://github @matthew')

'beautiful lif'

In [13]:
# Clean every review in place: from here on data['text'] holds the pre-processed
# text (the untouched strings are still available in new_corpus).
data['text'] = data['text'].apply(pre_processing)

In [14]:
data

Unnamed: 0,text
0,plot two teen couples go church party drink dr...
1,happy bastard quick movie review damn y2k bug ...
2,movies like make jaded movie viewer thankful i...
3,quest camelot warner bros first feature length...
4,synopsis mentally unstable man undergoing psyc...
...,...
1995,wow movie everything movie funny dramatic inte...
1996,richard gere commanding actor always great fil...
1997,glory starring matthew broderick denzel washin...
1998,steven spielberg second epic film world war ii...


In [15]:
# One-hot variant of the labels: [1, 0] for the first half of the rows, [0, 1] for the second.
# NOTE: `labels` is reassigned to scalar labels in the following cell.
# Each row gets its own list; `[[1, 0]] * n` would repeat one shared list object n times.
half = len(data['text']) // 2
labels = [[1, 0] for _ in range(half)] + [[0, 1] for _ in range(half)]
len(labels)

2000

In [16]:
# Scalar labels: 1 for the first half of the rows, 0 for the remaining ones.
# NOTE(review): new_corpus is built as neg + pos, so 1 currently marks the NEGATIVE
# reviews and 0 the positive ones — confirm this is the intended encoding.
n_rows = len(data['text'])
labels = [int(idx < n_rows // 2) for idx in range(n_rows)]

In [17]:
len(labels[0:1000])

1000

In [18]:
len(labels[1000:])

1000

In [19]:
# Single-column frame holding the labels, in the same row order as `data`.
df = pd.DataFrame({'labels': labels})

In [20]:
# Put text and labels side by side (column-wise concat); both frames were built
# from plain lists, so they share the default 0..n-1 index and rows line up.
complete_data = pd.concat([data, df], axis=1)

In [21]:
complete_data

Unnamed: 0,text,labels
0,plot two teen couples go church party drink dr...,1
1,happy bastard quick movie review damn y2k bug ...,1
2,movies like make jaded movie viewer thankful i...,1
3,quest camelot warner bros first feature length...,1
4,synopsis mentally unstable man undergoing psyc...,1
...,...,...
1995,wow movie everything movie funny dramatic inte...,0
1996,richard gere commanding actor always great fil...,0
1997,glory starring matthew broderick denzel washin...,0
1998,steven spielberg second epic film world war ii...,0


In [22]:
# Persist three views of the polarity dataset on Drive:
#   movie_rews.csv       -> pre-processed text + labels
#   movie_rews_clean.csv -> pre-processed text only
#   movie_rews_raw.csv   -> original, unprocessed text only
new_corp_df = pd.DataFrame({'text': new_corpus})
exports = {
    'Datasets/movie_rews.csv': complete_data,
    'Datasets/movie_rews_clean.csv': data,
    'Datasets/movie_rews_raw.csv': new_corp_df,
}
for rel_path, frame in exports.items():
    frame.to_csv(rootdir + rel_path)

The next cell shows that $99\%$ of the pre-processed reviews are under $833.01$ tokens long.

In [23]:
import numpy as np
from nltk.tokenize import word_tokenize


# Token count of every pre-processed review, followed by a few upper
# percentiles of that length distribution.
token_counts = [len(word_tokenize(review)) for review in complete_data['text']]
seq_len = np.array(token_counts)
percentiles = [65, 75, 80, 85, 90, 95, 99, 100]
print([(p, np.percentile(seq_len, p)) for p in percentiles])

[(65, 384.0), (75, 429.0), (80, 457.0), (85, 490.0), (90, 539.1000000000001), (95, 644.05), (99, 833.01), (100, 1394.0)]


In [24]:
# Mean number of tokens per pre-processed review.
avg_length = seq_len.mean()
avg_length

354.423

### Subjectivity Dataset Analysis

In [25]:
# Subjective sentences first (label 1), objective ones after (label 0);
# each tokenized sentence is joined back into one string.
subj_corpus = [" ".join(sent) for docs in (subj_docs, obj_docs) for sent in docs]
subj_labels = np.repeat([1, 0], [len(subj_docs), len(obj_docs)])

In [26]:
# Text-only and label-only frames, then the two glued column-wise.
subj_corpus_df = pd.DataFrame({'text': subj_corpus})
subj_labels_df = pd.DataFrame({'labels': subj_labels})

frames = [subj_corpus_df, subj_labels_df]
subj_obj_dataset = pd.concat(frames, axis=1)
subj_obj_dataset

Unnamed: 0,text,labels
0,"smart and alert , thirteen conversations about...",1
1,"color , musical bounce and warm seas lapping o...",1
2,it is not a mass-market entertainment but an u...,1
3,a light-hearted french film about the spiritua...,1
4,my wife is an actress has its moments in looki...,1
...,...,...
9995,"in the end , they discover that balance in lif...",0
9996,a counterfeit 1000 tomin bank note is passed i...,0
9997,enter the beautiful and mysterious secret agen...,0
9998,after listening to a missionary from china spe...,0


In [27]:
subj_corpus_df

Unnamed: 0,text
0,"smart and alert , thirteen conversations about..."
1,"color , musical bounce and warm seas lapping o..."
2,it is not a mass-market entertainment but an u...
3,a light-hearted french film about the spiritua...
4,my wife is an actress has its moments in looki...
...,...
9995,"in the end , they discover that balance in lif..."
9996,a counterfeit 1000 tomin bank note is passed i...
9997,enter the beautiful and mysterious secret agen...
9998,after listening to a missionary from china spe...


In [28]:
# Persist the subjectivity dataset on Drive:
#   subj_obj_dataset.csv       -> text + labels
#   subj_obj_dataset_clean.csv -> text only
# NOTE(review): unlike movie_rews_clean.csv, the "_clean" file written here holds text
# that was never passed through pre_processing — confirm this is intended.
for rel_path, frame in (
    ('Datasets/subj_obj_dataset.csv', subj_obj_dataset),
    ('Datasets/subj_obj_dataset_clean.csv', subj_corpus_df),
):
    frame.to_csv(rootdir + rel_path)

In [29]:
import numpy as np
from nltk.tokenize import word_tokenize


# Token count of every sentence of the subjectivity corpus, followed by a few upper
# percentiles. Note that this rebinds `seq_len`, previously the polarity lengths.
token_counts = [len(word_tokenize(sentence)) for sentence in subj_corpus_df['text']]
seq_len = np.array(token_counts)
percentiles = [65, 75, 80, 85, 90, 95, 99, 100]
print([(p, np.percentile(seq_len, p)) for p in percentiles])

[(65, 27.0), (75, 30.0), (80, 32.0), (85, 35.0), (90, 38.0), (95, 43.0), (99, 56.0), (100, 122.0)]


In [30]:
# Mean number of tokens per sentence of the subjectivity corpus.
avg_length = seq_len.mean()
avg_length

24.6031

[Bert Embedding](https://www.youtube.com/watch?v=zJW57aCBCTk)

[Fine-Tuning Bert](https://www.youtube.com/watch?v=x66kkDnbzi4)

### Unused

In [None]:
def arrange_subjectivity(subjective_sents : List[Tuple[List[str], str]]) -> pd.DataFrame:
  '''Arrange subjectivity dataset into a table with format (index, text, label)
    Params:
    ------
      subjective_sents : list(tuple(list(str),str))
        subjectivity dataset, given as (words of the sentence, label) pairs
    Returns:
    ------
      A dataframe organized as follows : (index, sentence text, label); the
      sentence text is stored in column 'text' and the label in column 'tag'
  '''

  sents = []
  labels = []
  for words, label in subjective_sents:
    # rebuild the sentence from its tokens
    sents.append(' '.join(words))
    labels.append(label)

  # build the two columns in one go instead of concatenating two one-column frames
  return pd.DataFrame({'text': sents, 'tag': labels})

In [None]:
# arrange_subjectivity unpacks (words, label) pairs, so every tokenized sentence
# is paired with its category tag before the call.
docs = [(sent, 'subj') for sent in subj_docs] + [(sent, 'obj') for sent in obj_docs]
# NOTE: this rebinds subj_obj_dataset, replacing the (text, labels) frame built in the
# "Subjectivity Dataset Analysis" section with a (text, tag) frame.
subj_obj_dataset = arrange_subjectivity(docs)
subj_obj_dataset

Unnamed: 0,text,tag
0,"smart and alert , thirteen conversations about...",subj
1,"color , musical bounce and warm seas lapping o...",subj
2,it is not a mass-market entertainment but an u...,subj
3,a light-hearted french film about the spiritua...,subj
4,my wife is an actress has its moments in looki...,subj
...,...,...
9995,"in the end , they discover that balance in lif...",obj
9996,a counterfeit 1000 tomin bank note is passed i...,obj
9997,enter the beautiful and mysterious secret agen...,obj
9998,after listening to a missionary from china spe...,obj
