In [1]:
import pandas as pd
import spacy
import re
import stanza
import deplacy

## Parte 1

In [2]:
file_route = './Sample Texts - Hoja 1.csv'

In [3]:
dataframe = pd.read_csv(file_route)
dataframe.columns

Index(['Text', 'Type', 'Source'], dtype='object')

In [4]:
tweets_corpus = dataframe['Text'][dataframe['Type'] == 'Tweet'].tolist()
review_corpus = dataframe['Text'][dataframe['Type'] != 'Tweet'].tolist()

In [5]:
review_dict = {
    'notCharacter': r'[“” ,.¿?¡!#$%/():&;^-]|\b\'(?!\w)|(?<!\w)\'\b',
}

twitter_dict = {
  'user': r'\B@\w+',
  'hashtag': r'\B#\w+',
  'links': r'\b(?:https?|ftp):\/\/\S+',
  'especialWords': r'\b[A-Z]{2,3}\b|&amp;',
  'notCharacter': r'[“”",.¿?¡!#$%/():&;^-]|\b\'(?!\w)|(?<!\w)\'\b',
}

## Preprocesamiento
- Eliminación expresiones
- Tokenización
- Lematización

### Eliminación expresiones

In [6]:
clean_tweets = []
for review in tweets_corpus:
  aux = re.sub(twitter_dict['user'] , '', review)
  aux = re.sub(twitter_dict['hashtag'] , '', aux)
  aux = re.sub(twitter_dict['links'], '', aux)
  aux = re.sub(twitter_dict['especialWords'], ' ', aux)
  aux = re.sub(twitter_dict['notCharacter'], ' ', aux)
  clean_tweets.append(aux)

In [7]:
clean_reviews = []
for review in review_corpus:
  aux = re.sub(review_dict['notCharacter'] , ' ', review)
  clean_reviews.append(aux)

## Tokenización y Lematización

In [8]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

2023-11-27 19:50:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-27 19:50:57 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2023-11-27 19:50:57 INFO: Using device: cpu
2023-11-27 19:50:57 INFO: Loading: tokenize
2023-11-27 19:50:57 INFO: Loading: pos
2023-11-27 19:50:57 INFO: Loading: lemma
2023-11-27 19:50:57 INFO: Done loading processors!


In [9]:
final_tweets = []
for tweet in clean_tweets:
  doc = nlp(tweet)
  aux = ''
  for sentence in doc.sentences:
    for word in sentence.words:
      aux += word.lemma + ' '
  final_tweets.append(aux)

In [10]:
final_reviews = []
for review in clean_reviews:
  doc = nlp(review)
  aux = ''
  for sentence in doc.sentences:
    for word in sentence.words:
      aux += word.lemma + ' '
  final_reviews.append(aux)

# Parte 2

In [11]:
nlp = spacy.load("en_core_web_trf")

In [12]:
def renderSyntacticTree(sentence: str)-> None:
  doc = nlp(sentence)
  deplacy.render(doc)

In [13]:
sentence1 = final_tweets[0]
print(sentence1)

as a woman you should not complain about clean up your house as a man you should always take the trash out 


In [14]:
renderSyntacticTree(sentence1)

as       ADP  ═══╗<══════════╗   prep
a        DET  <╗ ║           ║   det
woman    NOUN ═╝<╝           ║   pobj
you      PRON <════════════╗ ║   nsubj
should   AUX  <══════════╗ ║ ║   aux
not      PART <════════╗ ║ ║ ║   neg
complain VERB ═══════╗═╝═╝═╝═╝<╗ ccomp
about    ADP  ═════╗<╝         ║ prep
clean    VERB ═╗═╗<╝           ║ pcomp
up       ADP  <╝ ║             ║ prt
your     PRON <╗ ║             ║ poss
house    NOUN ═╝<╝             ║ dobj
as       ADP  ═══╗<════════╗   ║ prep
a        DET  <╗ ║         ║   ║ det
man      NOUN ═╝<╝         ║   ║ pobj
you      PRON <══════════╗ ║   ║ nsubj
should   AUX  <════════╗ ║ ║   ║ aux
always   ADV  <══════╗ ║ ║ ║   ║ advmod
take     VERB ═══╗═╗═╝═╝═╝═╝═══╝ ROOT
the      DET  <╗ ║ ║             det
trash    NOUN ═╝ ║<╝             dobj
out      ADP  <══╝               prt


In [15]:
sentence2 = final_tweets[6]
print(sentence2)

cause the reply to be disregard and the tap notification under the keyboard be opened😡😡 😡 


In [16]:
renderSyntacticTree(sentence2)

cause        SCONJ <════════╗         mark
the          DET   <╗       ║         det
reply        NOUN  ═╝<════╗ ║         nsubj
to           PART  <════╗ ║ ║         aux
be           AUX   ═╗═╗═╝═╝═╝═╗═╗═╗═╗ ROOT
disregard    VERB  <╝ ║       ║ ║ ║ ║ attr
and          CCONJ <══╝       ║ ║ ║ ║ cc
the          DET   <══════╗   ║ ║ ║ ║ det
tap          NOUN  <╗     ║   ║ ║ ║ ║ compound
notification NOUN  ═╝═══╗═╝<╗ ║ ║ ║ ║ nsubjpass
under        ADP   ═══╗<╝   ║ ║ ║ ║ ║ prep
the          DET   <╗ ║     ║ ║ ║ ║ ║ det
keyboard     NOUN  ═╝<╝     ║ ║ ║ ║ ║ pobj
be           AUX   <╗       ║ ║ ║ ║ ║ auxpass
opened       VERB  ═╝═══════╝<╝ ║ ║ ║ conj
😡           PUNCT <════════════╝ ║ ║ punct
😡           PUNCT <══════════════╝ ║ punct
😡           PUNCT <════════════════╝ punct


In [17]:
sentence3 = final_tweets[12]
print(sentence3)

yep I have try laptop too several time over the past week and again today I have try different browser too 


In [18]:
renderSyntacticTree(sentence3)

yep       INTJ  <══════════════╗   intj
I         PRON  <════════════╗ ║   nsubj
have      VERB  ═╗═╗═╗═╗═╗═╗═╝═╝═╗ ROOT
try       VERB  <╝ ║ ║ ║ ║ ║     ║ dep
laptop    NOUN  <══╝ ║ ║ ║ ║     ║ dobj
too       ADV   <════╝ ║ ║ ║     ║ advmod
several   ADJ   <╗     ║ ║ ║     ║ amod
time      NOUN  ═╝<════╝ ║ ║     ║ npadvmod
over      ADP   ═════╗<══╝ ║     ║ prep
the       DET   <══╗ ║     ║     ║ det
past      ADJ   <╗ ║ ║     ║     ║ amod
week      NOUN  ═╝═╝<╝     ║     ║ pobj
and       CCONJ <══════════╝     ║ cc
again     ADV   <════════════╗   ║ advmod
today     NOUN  <══════════╗ ║   ║ npadvmod
I         PRON  <════════╗ ║ ║   ║ nsubj
have      VERB  ═════╗═╗═╝═╝═╝<══╝ conj
try       VERB  <══╗ ║ ║           det
different ADJ   <╗ ║ ║ ║           amod
browser   NOUN  ═╝═╝<╝ ║           dobj
too       ADV   <══════╝           advmod


In [19]:
sentence4 = final_reviews[0]
print(sentence4)

great cd my lovely Pat have one of the great voice of her generation I have listen to this cd for year and I still love it when I be in a good mood it make I feel good a bad mood just evaporate like sugar in the rain this cd just ooze life vocal be jusat stuunning and lyric just kill one of life 's hidden gem this be a desert isle cd in my book why she never make it big be just beyond I everytime I play this no matter black white young old male female everybody say one thing " who be that sing " 


In [20]:
clean_reviews[0]

'Great CD  My lovely Pat has one of the GREAT voices of her generation  I have listened to this CD for YEARS and I still LOVE IT  When I\'m in a good mood it makes me feel better  A bad mood just evaporates like sugar in the rain  This CD just oozes LIFE  Vocals are jusat STUUNNING and lyrics just kill  One of life\'s hidden gems  This is a desert isle CD in my book  Why she never made it big is just beyond me  Everytime I play this  no matter black  white  young  old  male  female EVERYBODY says one thing "Who was that singing  "'

In [21]:
renderSyntacticTree(sentence4)

great      ADJ   <══════════════════╗                                 amod
cd         NOUN  ═════╗═══════════╗═╝<════════════════════════╗       dep
my         PRON  <══╗ ║           ║                           ║       poss
lovely     ADJ   <╗ ║ ║           ║                           ║       amod
Pat        PROPN ═╝═╝<╝           ║                           ║       npadvmod
have       AUX   ═══════════════╗<╝                           ║       meta
one        NUM   ═════════════╗<╝                             ║       dobj
of         ADP   ═══════════╗<╝                               ║       prep
the        DET   <════════╗ ║                                 ║       det
great      ADJ   <══════╗ ║ ║                                 ║       amod
voice      NOUN  ═════╗═╝═╝<╝                                 ║       pobj
of         ADP   ═══╗<╝                                       ║       prep
her        PRON  <╗ ║                                         ║       poss
generation NOUN  ═╝<╝  

In [22]:
sentence5 = clean_reviews[7]
print(sentence5)

Challenges in natural language processing frequently involve speech recognition  natural language understanding  and natural language generation \n


In [23]:
renderSyntacticTree(sentence5)

Challenges    NOUN  ═══════╗<════╗   nsubj
in            ADP   ═════╗<╝     ║   prep
natural       ADJ   <╗   ║       ║   amod
language      NOUN  ═╝<╗ ║       ║   compound
processing    NOUN  ═══╝<╝       ║   pobj
frequently    ADV   <══════════╗ ║   advmod
involve       VERB  ═══╗═══╗═╗═╝═╝═╗ ROOT
speech        NOUN  <╗ ║   ║ ║     ║ compound
recognition   NOUN  ═╝<╝   ║ ║     ║ dobj
              SPACE ═════╗<╝ ║     ║ dep
natural       ADJ   <╗   ║   ║     ║ amod
language      NOUN  ═╝<╗ ║   ║     ║ compound
understanding NOUN  ═══╝<╝   ║     ║ dobj
              SPACE ═╗═══╗<══╝     ║ dep
and           CCONJ <╝   ║         ║ cc
natural       ADJ   <╗   ║         ║ amod
language      NOUN  ═╝<╗ ║         ║ compound
generation    NOUN  ═══╝<╝         ║ conj
\n            PUNCT <══════════════╝ punct
