In [1]:
import pandas as pd
import spacy
import re
import stanza
import deplacy

## Parte 1

In [2]:
# Relative path of the CSV file that the next cell reads with pandas.
file_route = "./Sample Texts - Hoja 1.csv"

In [3]:
# Read the CSV located at `file_route` into a DataFrame.
dataframe = pd.read_csv(file_route)
# Bare last expression so the notebook displays the column index.
dataframe.columns

Index(['Text', 'Type', 'Source'], dtype='object')

In [4]:
# Split the 'Text' column by the 'Type' label: rows labelled 'Tweet' go to
# tweets_corpus, every other row goes to review_corpus.
tweets_corpus = dataframe.loc[dataframe['Type'].eq('Tweet'), 'Text'].tolist()
review_corpus = dataframe.loc[dataframe['Type'].ne('Tweet'), 'Text'].tolist()

In [5]:
# Regular expressions used to strip noise from the raw texts.
# Non-ASCII characters are written as \uXXXX escapes (understood by `re`) so
# the patterns stay correct even if the notebook is saved or opened with the
# wrong encoding:
#   \u201c \u201d -> curly double quotes
#   \u00bf        -> inverted question mark
#   \u00a1        -> inverted exclamation mark
# The two apostrophe alternatives match a quote sitting at the edge of a word
# (e.g. "dogs' ") while leaving one inside a word untouched (e.g. "life's").
review_dict = {
    'notCharacter': r'[\u201c\u201d ,.\u00bf?\u00a1!#$%/():&;^-]|\b\'(?!\w)|(?<!\w)\'\b',
}

twitter_dict = {
  # @mentions
  'user': r'\B@\w+',
  # #hashtags
  'hashtag': r'\B#\w+',
  # http(s) and ftp URLs
  'links': r'\b(?:https?|ftp):\/\/\S+',
  # standalone 2-3 letter all-caps tokens and the HTML-escaped ampersand
  'especialWords': r'\b[A-Z]{2,3}\b|&amp;',
  # same as the review pattern, but the class holds a straight double quote
  # where the review class holds a space
  'notCharacter': r'[\u201c\u201d",.\u00bf?\u00a1!#$%/():&;^-]|\b\'(?!\w)|(?<!\w)\'\b',
}

## Preprocesamiento
- Eliminación de expresiones
- Tokenización
- Lematización

### Eliminación de expresiones

In [6]:
# Substitutions applied to every tweet, in this order: (twitter_dict key, replacement).
# Mentions, hashtags and links are deleted outright; the last two patterns
# are replaced by a single space.
tweet_steps = (
  ('user', ''),
  ('hashtag', ''),
  ('links', ''),
  ('especialWords', ' '),
  ('notCharacter', ' '),
)

clean_tweets = []
for raw_tweet in tweets_corpus:
  cleaned = raw_tweet
  for pattern_key, replacement in tweet_steps:
    cleaned = re.sub(twitter_dict[pattern_key], replacement, cleaned)
  clean_tweets.append(cleaned)

In [7]:
# Replace every match of the review 'notCharacter' pattern with a space.
clean_reviews = [
  re.sub(review_dict['notCharacter'], ' ', raw_review)
  for raw_review in review_corpus
]

## Tokenización y Lematización

In [8]:
# Stanza English pipeline; the following cells read `word.lemma` from its output.
# NOTE(review): the name `nlp` is reassigned to a spaCy model further down in
# this notebook, so the lemmatization cells must run before that cell.
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

2023-11-27 19:50:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   ‚Ä¶

2023-11-27 19:50:57 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2023-11-27 19:50:57 INFO: Using device: cpu
2023-11-27 19:50:57 INFO: Loading: tokenize
2023-11-27 19:50:57 INFO: Loading: pos
2023-11-27 19:50:57 INFO: Loading: lemma
2023-11-27 19:50:57 INFO: Done loading processors!


In [9]:
# Rebuild each cleaned tweet as its sequence of lemmas.
final_tweets = []
for cleaned_tweet in clean_tweets:
  parsed = nlp(cleaned_tweet)
  # Every lemma is followed by one space, so the result keeps a trailing space.
  lemma_chunks = [token.lemma + ' ' for sent in parsed.sentences for token in sent.words]
  final_tweets.append(''.join(lemma_chunks))

In [10]:
# Rebuild each cleaned review as its sequence of lemmas.
final_reviews = []
for cleaned_review in clean_reviews:
  parsed = nlp(cleaned_review)
  # Every lemma is followed by one space, so the result keeps a trailing space.
  final_reviews.append(
    ''.join(token.lemma + ' ' for sent in parsed.sentences for token in sent.words)
  )

# Parte 2

In [11]:
# Load the spaCy pipeline used by renderSyntacticTree below.
# NOTE(review): this rebinds `nlp`, which earlier held the Stanza pipeline;
# re-running the lemmatization cells after this one would call spaCy instead.
nlp = spacy.load("en_core_web_trf")

In [12]:
def renderSyntacticTree(sentence: str) -> None:
  """Parse `sentence` and render the result with deplacy.

  The text is processed with the module-level `nlp` object and the
  resulting document is handed to `deplacy.render`.

  Args:
    sentence: Text to parse.
  """
  deplacy.render(nlp(sentence))

In [13]:
# Example 1: lemmatized text of the tweet at index 0.
sentence1 = final_tweets[0]
print(sentence1)

as a woman you should not complain about clean up your house as a man you should always take the trash out 


In [14]:
# Render the parse of example 1.
renderSyntacticTree(sentence1)

as       ADP  ‚ïê‚ïê‚ïê‚ïó<‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó   prep
a        DET  <‚ïó ‚ïë           ‚ïë   det
woman    NOUN ‚ïê‚ïù<‚ïù           ‚ïë   pobj
you      PRON <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë   nsubj
should   AUX  <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë ‚ïë   aux
not      PART <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë ‚ïë ‚ïë   neg
complain VERB ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó‚ïê‚ïù‚ïê‚ïù‚ïê‚ïù‚ïê‚ïù<‚ïó ccomp
about    ADP  ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïù         ‚ïë prep
clean    VERB ‚ïê‚ïó‚ïê‚ïó<‚ïù           ‚ïë pcomp
up       ADP  <‚ïù ‚ïë             ‚ïë prt
your     PRON <‚ïó ‚ïë             ‚ïë poss
house    NOUN ‚ïê‚ïù<‚ïù             ‚ïë dobj
as       ADP  ‚ïê‚ïê‚ïê‚ïó<‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó   ‚ïë prep
a        DET  <‚ïó ‚ïë         ‚ïë   ‚ïë det
man      NOUN ‚ïê‚ïù<‚ïù         ‚ïë   ‚ïë pobj
you      PRON <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë   ‚ïë nsubj
should   AUX  <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë ‚ïë   ‚ïë aux
always   ADV  <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë ‚ïë ‚ïë   ‚ïë a

In [15]:
# Example 2: lemmatized text of the tweet at index 6.
sentence2 = final_tweets[6]
print(sentence2)

cause the reply to be disregard and the tap notification under the keyboard be openedüò°üò° üò° 


In [16]:
# Render the parse of example 2.
renderSyntacticTree(sentence2)

cause        SCONJ <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó         mark
the          DET   <‚ïó       ‚ïë         det
reply        NOUN  ‚ïê‚ïù<‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë         nsubj
to           PART  <‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë ‚ïë         aux
be           AUX   ‚ïê‚ïó‚ïê‚ïó‚ïê‚ïù‚ïê‚ïù‚ïê‚ïù‚ïê‚ïó‚ïê‚ïó‚ïê‚ïó‚ïê‚ïó ROOT
disregard    VERB  <‚ïù ‚ïë       ‚ïë ‚ïë ‚ïë ‚ïë attr
and          CCONJ <‚ïê‚ïê‚ïù       ‚ïë ‚ïë ‚ïë ‚ïë cc
the          DET   <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó   ‚ïë ‚ïë ‚ïë ‚ïë det
tap          NOUN  <‚ïó     ‚ïë   ‚ïë ‚ïë ‚ïë ‚ïë compound
notification NOUN  ‚ïê‚ïù‚ïê‚ïê‚ïê‚ïó‚ïê‚ïù<‚ïó ‚ïë ‚ïë ‚ïë ‚ïë nsubjpass
under        ADP   ‚ïê‚ïê‚ïê‚ïó<‚ïù   ‚ïë ‚ïë ‚ïë ‚ïë ‚ïë prep
the          DET   <‚ïó ‚ïë     ‚ïë ‚ïë ‚ïë ‚ïë ‚ïë det
keyboard     NOUN  ‚ïê‚ïù<‚ïù     ‚ïë ‚ïë ‚ïë ‚ïë ‚ïë pobj
be           AUX   <‚ïó       ‚ïë ‚ïë ‚ïë ‚ïë ‚ïë auxpass
opened       VERB  ‚ïê‚ïù‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù<‚ïù ‚ïë ‚ïë ‚ïë conj
üò°           PUNCT <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù ‚ïë ‚ïë punct
üò°    

In [17]:
# Example 3: lemmatized text of the tweet at index 12.
sentence3 = final_tweets[12]
print(sentence3)

yep I have try laptop too several time over the past week and again today I have try different browser too 


In [18]:
# Render the parse of example 3.
renderSyntacticTree(sentence3)

yep       INTJ  <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó   intj
I         PRON  <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë   nsubj
have      VERB  ‚ïê‚ïó‚ïê‚ïó‚ïê‚ïó‚ïê‚ïó‚ïê‚ïó‚ïê‚ïó‚ïê‚ïù‚ïê‚ïù‚ïê‚ïó ROOT
try       VERB  <‚ïù ‚ïë ‚ïë ‚ïë ‚ïë ‚ïë     ‚ïë dep
laptop    NOUN  <‚ïê‚ïê‚ïù ‚ïë ‚ïë ‚ïë ‚ïë     ‚ïë dobj
too       ADV   <‚ïê‚ïê‚ïê‚ïê‚ïù ‚ïë ‚ïë ‚ïë     ‚ïë advmod
several   ADJ   <‚ïó     ‚ïë ‚ïë ‚ïë     ‚ïë amod
time      NOUN  ‚ïê‚ïù<‚ïê‚ïê‚ïê‚ïê‚ïù ‚ïë ‚ïë     ‚ïë npadvmod
over      ADP   ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïê‚ïê‚ïù ‚ïë     ‚ïë prep
the       DET   <‚ïê‚ïê‚ïó ‚ïë     ‚ïë     ‚ïë det
past      ADJ   <‚ïó ‚ïë ‚ïë     ‚ïë     ‚ïë amod
week      NOUN  ‚ïê‚ïù‚ïê‚ïù<‚ïù     ‚ïë     ‚ïë pobj
and       CCONJ <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù     ‚ïë cc
again     ADV   <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó   ‚ïë advmod
today     NOUN  <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë   ‚ïë npadvmod
I         PRON  <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë ‚ïë   ‚ïë nsubj
have      VERB  ‚ï

In [19]:
# Example 4: lemmatized text of the review at index 0.
sentence4 = final_reviews[0]
print(sentence4)

great cd my lovely Pat have one of the great voice of her generation I have listen to this cd for year and I still love it when I be in a good mood it make I feel good a bad mood just evaporate like sugar in the rain this cd just ooze life vocal be jusat stuunning and lyric just kill one of life 's hidden gem this be a desert isle cd in my book why she never make it big be just beyond I everytime I play this no matter black white young old male female everybody say one thing " who be that sing " 


In [20]:
# Show the same review as it was before lemmatization, for comparison.
clean_reviews[0]

'Great CD  My lovely Pat has one of the GREAT voices of her generation  I have listened to this CD for YEARS and I still LOVE IT  When I\'m in a good mood it makes me feel better  A bad mood just evaporates like sugar in the rain  This CD just oozes LIFE  Vocals are jusat STUUNNING and lyrics just kill  One of life\'s hidden gems  This is a desert isle CD in my book  Why she never made it big is just beyond me  Everytime I play this  no matter black  white  young  old  male  female EVERYBODY says one thing "Who was that singing  "'

In [21]:
# Render the parse of example 4.
renderSyntacticTree(sentence4)

great      ADJ   <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó                                 amod
cd         NOUN  ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó‚ïê‚ïù<‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó       dep
my         PRON  <‚ïê‚ïê‚ïó ‚ïë           ‚ïë                           ‚ïë       poss
lovely     ADJ   <‚ïó ‚ïë ‚ïë           ‚ïë                           ‚ïë       amod
Pat        PROPN ‚ïê‚ïù‚ïê‚ïù<‚ïù           ‚ïë                           ‚ïë       npadvmod
have       AUX   ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïù                           ‚ïë       meta
one        NUM   ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïù                             ‚ïë       dobj
of         ADP   ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïù                               ‚ïë       prep
the        DET   <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë                                 ‚ïë       det
great      ADJ   <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚

In [22]:
# Example 5: cleaned text of the review at index 7.
# NOTE(review): unlike examples 1-4 this reads from clean_reviews (not
# lemmatized) rather than final_reviews -- confirm this is intentional.
sentence5 = clean_reviews[7]
print(sentence5)

Challenges in natural language processing frequently involve speech recognition  natural language understanding  and natural language generation \n


In [23]:
# Render the parse of example 5.
renderSyntacticTree(sentence5)

Challenges    NOUN  ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïê‚ïê‚ïê‚ïê‚ïó   nsubj
in            ADP   ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïù     ‚ïë   prep
natural       ADJ   <‚ïó   ‚ïë       ‚ïë   amod
language      NOUN  ‚ïê‚ïù<‚ïó ‚ïë       ‚ïë   compound
processing    NOUN  ‚ïê‚ïê‚ïê‚ïù<‚ïù       ‚ïë   pobj
frequently    ADV   <‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó ‚ïë   advmod
involve       VERB  ‚ïê‚ïê‚ïê‚ïó‚ïê‚ïê‚ïê‚ïó‚ïê‚ïó‚ïê‚ïù‚ïê‚ïù‚ïê‚ïó ROOT
speech        NOUN  <‚ïó ‚ïë   ‚ïë ‚ïë     ‚ïë compound
recognition   NOUN  ‚ïê‚ïù<‚ïù   ‚ïë ‚ïë     ‚ïë dobj
              SPACE ‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó<‚ïù ‚ïë     ‚ïë dep
natural       ADJ   <‚ïó   ‚ïë   ‚ïë     ‚ïë amod
language      NOUN  ‚ïê‚ïù<‚ïó ‚ïë   ‚ïë     ‚ïë compound
understanding NOUN  ‚ïê‚ïê‚ïê‚ïù<‚ïù   ‚ïë     ‚ïë dobj
              SPACE ‚ïê‚ïó‚ïê‚ïê‚ïê‚ïó<‚ïê‚ïê‚ïù     ‚ïë dep
and           CCONJ <‚ïù   ‚ïë         ‚ïë cc
natural       ADJ   <‚ïó   ‚ïë         ‚ïë amod
language      NOUN  ‚ïê‚ïù<‚ïó ‚ïë         ‚ïë compound
generation    NOUN  ‚ïê‚ïê‚ïê