In [1]:
from time import time

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation, MiniBatchNMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20
batch_size = 128
init = "nndsvda"

In [2]:
def plot_top_words(model, feature_names, n_top_words, title):
    fig, axes = plt.subplots(2, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx +1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)
        fig.suptitle(title, fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [3]:
bunch = fetch_20newsgroups()
len(bunch.target_names)

20

In [4]:
bunch.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
data, y = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)

In [6]:
len(data)

11314

In [7]:
data[:2]

["Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n",
 "\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap

In [8]:
y[:2]

array([17,  0])

In [74]:
import re

original_string = "ñáéqa2e/? eqwefdgsgERERdf i'm \n\n\nnot"

def normalize_string(st):
    val = re.sub(r"n't", "n ", st)
    val = re.sub(r"'m", " ", val)
    val = re.sub(r"'t", " ", val)
    val = re.sub(r"_", " ", val)
    # SUBSTITUTE NON WORD BY WHITESPACE
    val = re.sub(r"\W", " ", val)
    # SUBSTITUE ONE OR MORE DIGITS BY WHITESPACE
    val = re.sub(r"\d+", " ", val)
    # SUBSTITUTE ONE OR MORE WHITESPACES BY ONE WHITESPACE
    val = re.sub(r"\s+", " ", val)
    # SUBSTITUTE ACCENTS
    val = re.sub(r"[áâàä]", "a", val)
    val = re.sub(r"[éêèë]", "e", val)
    val = re.sub(r"[íîìï]", "i", val)
    val = re.sub(r"[óôòö]", "o", val)
    val = re.sub(r"[úûùü]", "u", val)    
    
    return val

normalize_string(original_string)

'ñaeqa e eqwefdgsgERERdf i not'

In [75]:
import pandas as pd

df = pd.DataFrame({"text": data, "cat": y})
df['normalized_text'] = (df.text.str.lower()                         
                         .map(lambda text: normalize_string(text))
                         .str.strip())

In [76]:
df.head(2)

Unnamed: 0,text,cat,normalized_text
0,Well i'm not sure about the story nad it did s...,17,well i not sure about the story nad it did see...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",0,yeah do you expect people to read the faq etc ...


In [77]:
df.normalized_text.sample(3).to_list()

['hey i wasn the one dancing and singing on jan now was i i was roundly ridiculed for my predictions sure they were easy tell that to the other of the people just wait until the see what clinton has planned for their pension funds this one doesn take much thinking either uncle sam needs money bad and pension funds got it well they used to have it turns out the states have been plundering state employee funds for the past years',
 'commodore epson homewriter pin printer d s disk drive joysticks mouse lotsa software both games and apps rapid fire joystick adapter about a year old obo',
 'i looking for a kawasaki zx engine just the engine no intake exhaust ignition etc preferably in the central texas area but we haven had much luck around here so we ll take whatever we can get please reply via mail or call if you have one or more really need a spare thanx']

# Vocabulario

In [126]:
vocabulary = {}

for text in df.normalized_text:
    token_list = text.split()
    for token in token_list:
        if token in vocabulary:
            vocabulary[token] += 1
        else:
            vocabulary[token] = 1    

In [127]:
vocab_series = pd.Series(vocabulary).sort_values(ascending=False)

In [128]:
vocab_series.shape

(72026,)

In [129]:
vocab_series.head(30).index.to_list()

['the',
 'ax',
 'to',
 'a',
 'of',
 'and',
 'i',
 'in',
 'is',
 'that',
 'it',
 'for',
 'you',
 's',
 'on',
 'this',
 'be',
 'are',
 'have',
 'not',
 'with',
 'as',
 'm',
 'or',
 'x',
 'but',
 'was',
 'if',
 'they',
 'from']

In [130]:
corpus_stopwords = [
    'ax', 'x', 'q', 'w', 'f', 'g', 'p', 'r', 'b', 'u', 'v', 'would', 'c', 'e',
    'n', 'l', 'k', 'z', 'get', 'also', 'h', 'j'
]

In [131]:
import nltk

generic_stopwords = nltk.corpus.stopwords.words('english')
generic_stopwords = set([normalize_string(word) for word in generic_stopwords])
generic_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'aren ',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 'couldn ',
 'd',
 'did',
 'didn',
 'didn ',
 'do',
 'does',
 'doesn',
 'doesn ',
 'doing',
 'don',
 'don ',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 'hadn ',
 'has',
 'hasn',
 'hasn ',
 'have',
 'haven',
 'haven ',
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 'isn ',
 'it',
 'it s',
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 'mightn ',
 'more',
 'most',
 'mustn',
 'mustn ',
 'my',
 'myself',
 'needn',
 'needn ',
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 '

In [132]:
stop_words = generic_stopwords.union(corpus_stopwords)
len(stop_words)

201

In [133]:
def remove_stopwords_from_text(text, stopwords_set):
    val = ' '.join([word for word in text.split() 
                    if word not in stopwords_set])
    return val

In [134]:
df['clean_normalized_text'] = \
df.normalized_text.map(lambda text: 
                       remove_stopwords_from_text(text=text, 
                                                  stopwords_set=stop_words))
df = df.loc[df.clean_normalized_text.str.len().gt(140)]

In [135]:
df.shape

(8427, 4)

In [136]:
df.clean_normalized_text.sample(3).to_list()

['want create single line text widget entering small amount text want fixed width horizontal scrollbar scrolls automatically user types order keep insertion point visible trying two problems addition horizontal scrollbar make text widget taller instead seems cover part text scrollbar scroll automatically user types text order keep insertion point visible help appreciated mike',
 'sorry waste bandwidth anyone know software mail order company called software unlimited ordered software charged credit card never send package call many times nobody answer phone check computer shoppers found advertise anymore know still business know contact please tell thank much',
 'heavy traffic slow bit mostly buffer zone front balance minimal buffer behind often find jerk behind notice traffic moving faster lanes switch one pass fine keep better eye jerk behind looking ahead rather front splitting attention ahead mirrors pretty damned complicated make back motion hand arm second third time even braindea

## Lematización

- esperaba -> esperar
- horas -> hora
- estuviéseis -> estar
- ...


"buenos dias que tal esperaba que pudiera atenderme"

LEMATIZAMOS:

"buen dia que tal esperar que poder atender"


`pip install spacy`

después descargamos el modelo en inglés más pequeño

`python -m spacy download en_core_web_sm`

In [138]:
import spacy

nlp = spacy.load('en_core_web_sm')