In [19]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from gensim.models.phrases import Phrases, Phraser
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('reuters')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import reuters

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\joela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\joela\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\joela\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [20]:
documents = [reuters.raw(fileid).lower() for fileid in reuters.fileids()]

# Ver el primer documento
print(documents[0])

asian exporters fear damage from u.s.-japan rift
  mounting trade friction between the
  u.s. and japan has raised fears among many of asia's exporting
  nations that the row could inflict far-reaching economic
  damage, businessmen and officials said.
      they told reuter correspondents in asian capitals a u.s.
  move against japan might boost protectionist sentiment in the
  u.s. and lead to curbs on american imports of their products.
      but some exporters said that while the conflict would hurt
  them in the long-run, in the short-term tokyo's loss might be
  their gain.
      the u.s. has said it will impose 300 mln dlrs of tariffs on
  imports of japanese electronics goods on april 17, in
  retaliation for japan's alleged failure to stick to a pact not
  to sell semiconductors on world markets at below cost.
      unofficial japanese estimates put the impact of the tariffs
  at 10 billion dlrs and spokesmen for major electronics firms
  said they would virtually halt exports

In [21]:
tokens = [word for doc in documents for word in word_tokenize(doc)]
print(tokens[:15])

['asian', 'exporters', 'fear', 'damage', 'from', 'u.s.-japan', 'rift', 'mounting', 'trade', 'friction', 'between', 'the', 'u.s.', 'and', 'japan']


In [22]:
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(10)
bigramas = finder.nbest(bigram_measures.pmi, n=50)

print(bigramas)

[('het', 'comite'), ('lago', 'agrio'), ('dar', 'es'), ('es', 'salaam'), ('hoare', 'govett'), ('corpus', 'christi'), ('paz', 'estenssoro'), ('corazon', 'aquino'), ('ay', 'expd-e'), ('lear', 'siegler'), ('l.f.', 'rothschild'), ('ranks', 'hovis'), ('abu', 'dhabi'), ('poison', 'pill'), ('hajime', 'tamura'), ('kleinwort', 'benson'), ('ind', 'ttl-f'), ('rjr', 'nabisco'), ('gates', 'learjet'), ('pro', 'forma'), ('margaret', 'thatcher'), ('carter', 'hawley'), ('canary', 'islands'), ('bra', 'kanon'), ('lord', 'abbett'), ('mcdonnell', 'douglas'), ('puerto', 'rico'), ('phelps', 'dodge'), ("'n", 'pak'), ('sao', 'paulo'), ('brace', 'jovanovich'), ('karl', 'otto'), ('marlin', 'fitzwater'), ('pizza', 'inn'), ('dean', 'witter'), ('buenos', 'aires'), ('costa', 'rica'), ('del', 'este'), ('king', 'fahd'), ('arturo', 'hernandez'), ('hernandez', 'grisanti'), ('pl', '480'), ('punta', 'del'), ('el', 'nino'), ('optional', 'origin'), ('du', 'pont'), ('drexel', 'burnham'), ('denis', 'bra'), ('hisham', 'nazer'),

In [23]:
sentences = [word_tokenize(sent) for sent in sent_tokenize("\n".join(documents).lower())]
sentences = [sent for sent in sentences if len(sent) > 1]

collocations = Phrases(sentences=sentences, min_count=10, threshold=0.5, scoring='npmi')
to_collocations = Phraser(collocations)

sent = 'new york is in united states of america. south africa and south america are in different continents'
print(to_collocations[word_tokenize(sent)])

['new_york', 'is', 'in', 'united_states', 'of', 'america', '.', 'south_africa', 'and', 'south', 'america', 'are', 'in', 'different', 'continents']


In [24]:
# Crear el objeto BigramCollocationFinder
collocations = BigramCollocationFinder.from_words(tokens)

# Usar BigramAssocMeasures para obtener las puntuaciones de los bigramas
scored = collocations.score_ngrams(BigramAssocMeasures().pmi)

# Crear un DataFrame con los bigramas y sus puntuaciones
df_collocations = pd.DataFrame(scored, columns=["bigram", "score"])

# Eliminar duplicados y ordenar por puntuación
df_collocations = df_collocations.drop_duplicates().sort_values(by="score", ascending=False)

# Imprimir los primeros 50 bigramas
print(df_collocations.head(50))

                                 bigram      score
0                   (+bahia, superior+)  20.560248
558                    (justed, adoped)  20.560248
548              (invoice, documenting)  20.560248
549                         (irv, goss)  20.560248
550                    (ismail, salleh)  20.560248
551                   (jaap, klootwijk)  20.560248
552                (jaimie, villalobos)  20.560248
553                     (jamal, hassan)  20.560248
554                    (jeoffrey, budd)  20.560248
555                      (jesse, helms)  20.560248
556                      (joerg, kastl)  20.560248
557                 (julien, geertsema)  20.560248
559                   (kamal, kharrazi)  20.560248
546            (inlcude, 7,507,000-dlr)  20.560248
560                  (karim, al-azzawi)  20.560248
561                 (karsten, mahlmann)  20.560248
562                (katsuhiko, okiyama)  20.560248
563                (katsuyuki, okayasu)  20.560248
564                       (kaya

In [None]:
to_collocations.save('bigram_model')

['C:\\Users\\joela/nltk_data', 'c:\\Users\\joela\\Documentos\\WorkSpace\\Universidad\\Semestre 5\\14567-Web Avanzada\\taller Minar textos\\MineriaDeTextos\\env\\nltk_data', 'c:\\Users\\joela\\Documentos\\WorkSpace\\Universidad\\Semestre 5\\14567-Web Avanzada\\taller Minar textos\\MineriaDeTextos\\env\\share\\nltk_data', 'c:\\Users\\joela\\Documentos\\WorkSpace\\Universidad\\Semestre 5\\14567-Web Avanzada\\taller Minar textos\\MineriaDeTextos\\env\\lib\\nltk_data', 'C:\\Users\\joela\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data']
