<a href="https://colab.research.google.com/github/MatteoGuglielmi-tech/Polarity-and-Subjectivity-Detection/blob/main/pre-processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MTL and BERT Embedding

In [None]:
from nltk.sentiment.util import mark_negation
from typing import List, Dict

def negative_marking(doc : List[List[str]]) -> str:
    '''
        Flatten a tokenised document and mark the words that fall inside a
        negation scope.

        Params :
        -----------------
            doc : list[list[str]]
                document given as a list of sentences, each sentence being
                a list of word tokens
        Returns :
        -----------------
            negated_doc : str
                the tokens returned by `mark_negation` (called with
                `double_neg_flip=True`), joined by single spaces
    '''

    # mark_negation works on one flat token sequence, so the sentence
    # boundaries are dropped first.
    flat_doc = [w for sent in doc for w in sent]
    negated_doc = mark_negation(flat_doc, double_neg_flip=True)

    return " ".join(negated_doc)

In [None]:
from textblob.en import subjectivity
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import subjectivity

# Fetch the tokenizer models and the two corpora used below.
for _resource in ('punkt', 'movie_reviews', 'subjectivity'):
    nltk.download(_resource)

# Short aliases for the two corpus readers.
mr, sub = movie_reviews, subjectivity

# Movie reviews read as paragraphs, one collection per polarity category.
pos = mr.paras(categories='pos')
neg = mr.paras(categories='neg')

# Pair every sentence of the subjectivity corpus with its category tag.
subj_docs = [(sentence, 'subj') for sentence in sub.sents(categories='subj')]
obj_docs = [(sentence, 'obj') for sentence in sub.sents(categories='obj')]

# Sanity check: size of each of the four collections.
print(len(neg), len(pos))
print(len(subj_docs), len(obj_docs))

In [None]:
# Negation-mark every review: all positive reviews first, then all negative ones.
new_corpus = [negative_marking(review) for reviews in (pos, neg) for review in reviews]

In [None]:
import pandas as pd

# Single-column frame holding one negation-marked review per row.
data = pd.DataFrame({'text': new_corpus})

In [None]:
# Inspect the frame before any text cleaning is applied.
print(data)

In [None]:
from nltk.corpus import stopwords
import re
import string
from textblob import TextBlob as tb



# Fetch the NLTK stop-word lists that pre_processing reads.
nltk.download('stopwords')

## https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python
# Regular expressions are the usual tool for this kind of clean-up, so the `re` module is used.
# re.sub(pattern, repl, string, count=0, flags=0)

def pre_processing(text : str) -> str :
  '''Lower-case a text, drop stop words, URLs, hashtags, @mentions and
    punctuation, then spell-correct the remaining words

    Params :
    --------
      text : str
        raw text to be cleaned
    Returns :
    ---------
      str
        the cleaned words joined by single spaces
  '''

  stop_list = set(stopwords.words("english"))
  # Stop words are matched on whitespace-delimited tokens before punctuation
  # is stripped, so a token such as "the," is not recognised as a stop word.
  text = ' '.join(word for word in text.lower().split() if word not in stop_list)
  # remove http links
  text = re.sub(r'http\S+', '', text)
  # Remove hashtags
  text = re.sub(r'#\w*', '', text)
  # Remove @username
  text = re.sub(r'@\S+', '', text)
  # Strip punctuation and tokenise again. split() discards the gaps left by
  # the removals above, so the words around a removed URL/hashtag/mention
  # are never glued together.
  # NOTE(review): string.punctuation contains '_', so a negation suffix such
  # as "_NEG" gets fused into its word here - confirm this is intended.
  words = text.translate(str.maketrans('', '', string.punctuation)).split()
  # correction of possible misspellings, one word at a time
  return ' '.join(str(tb(word).correct()) for word in words)



# Quick manual check on a sentence with stop words, a misspelling ("lif"),
# a URL and a @mention.
pre_processing('it is a beautiful lif, https://github @matthew')

In [None]:
# Clean every review in place, then print the frame to inspect the result.
data['text'] = data['text'].apply(pre_processing)
print(data)

In [None]:
# Bare expression: the notebook renders the cleaned frame as the cell output.
data

In [None]:
# One-hot polarity labels in [pos, neg] order. new_corpus was assembled as all
# positive reviews followed by all negative ones, so the labels are sized from
# those two collections instead of assuming the classes are equally large.
# Each row is its own list, so editing one label cannot affect the others.
labels = [[1, 0] for _ in range(len(pos))] + [[0, 1] for _ in range(len(neg))]
len(labels)

In [None]:
# Label frame with one indicator column per polarity class.
df = pd.DataFrame(labels, columns=['pos', 'neg'])

In [None]:
# Bare expression: the notebook renders the label frame as the cell output.
df

In [None]:
# Put the label columns next to the text column; rows are matched on the index.
complete_data = pd.concat([data, df], axis='columns')