# Lab 3 - Text Processing
Natural Language Processing - Universidad Tecnológica Nacional

### Pre-processing with scikit-learn

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [52]:
#Creating a word vectorizer
vectorizer = CountVectorizer(min_df=1)

In [53]:
#Transforming a list of strings to a bag of words and vectorizing them
content = ["How to format my hard disk", "Hard disk format problems"]
X = vectorizer.fit_transform(content)

In [54]:
bow = vectorizer.get_feature_names_out()
ocurrences_vectors = X.toarray()

print(f'Bag of words:\n{bow}')
print(f"\nOcurrences' vectors of each word:\n{ocurrences_vectors}")

Bag of words:
['disk' 'format' 'hard' 'how' 'my' 'problems' 'to']

Ocurrences' vectors of each word:
[[1 1 1 1 1 0 1]
 [1 1 1 0 0 1 0]]


In [55]:
print(f"Ocurrences of the word 'hard in the second document: {ocurrences_vectors[1,2]}")

Ocurrences of the word 'hard in the second document: 1


#### Let's try with a real documents collection





In [56]:
from sklearn.datasets import fetch_20newsgroups

In [57]:
#Selecting the categories
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 
'sci.med']

In [58]:
#Querying the trainig dataset by categories
twenty_train = fetch_20newsgroups(subset='train', categories=categories, 
shuffle=True, random_state=42) 

In [59]:
#Vectorizing the training data
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(twenty_train.data)

In [60]:
freq_algorithm = vectorizer.vocabulary_.get('algorithm')
print(f"Frequency of the word 'algorithm': {freq_algorithm}")

Frequency of the word 'algorithm': 4690


In [61]:
#Amount of tokens extracted
num_tokens = len(vectorizer.get_feature_names_out())
print(f"Amount of tokens extracted: {num_tokens}")

Amount of tokens extracted: 35788


In [62]:
#Creating a word vectorizer without stop words
vectorizer = CountVectorizer(stop_words = 'english')

### Pre-processing with NLTK

In [63]:
import nltk

#### Stemming

In [64]:
#Creating a stemmer
stemmer = nltk.stem.SnowballStemmer('english')

In [65]:
print(f"Stem of 'cats': {stemmer.stem('cats')}")
print(f"Stem of 'loving': {stemmer.stem('loving')}")

Stem of 'cats': cat
Stem of 'loving': love


### Using scikit-learn CountVectorizer with NLTK stemmer

In [66]:
vectorizer = CountVectorizer(stop_words = 'english')

In [67]:
text_to_analyze = "John bought carrots and potatoes"

#### Defining a build analyzer without a stemmer

In [68]:
#Definig a build analyzer
analyze = vectorizer.build_analyzer()

In [69]:
print(f"'{text_to_analyze}' analyzed without stemming:\n {analyze(text_to_analyze)}")

'John bought carrots and potatoes' analyzed without stemming:
 ['john', 'bought', 'carrots', 'potatoes']


#### Defining a build analyzer with a stemmer

In [70]:
#Creating an english stemmer
stemmer = nltk.stem.SnowballStemmer('english')

In [71]:
#Defining an stemmer build analyzer class
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [72]:
#Creating an instance of the class before defined
stem_vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

In [73]:
#Defining a build analyzer with stemming
stem_analyze = stem_vectorizer.build_analyzer()

In [74]:
print(f"'{text_to_analyze}' analyzed with stemming:\n {stem_analyze(text_to_analyze)}")

'John bought carrots and potatoes' analyzed with stemming:
 ['john', 'bought', 'carrot', 'potato']


#### Comparing them

Comparing the results on 'John bought carrots and potatoes'

In [75]:
print(f"'{text_to_analyze}' analyzed without stemming:\n {analyze(text_to_analyze)}\n")
print(f"'{text_to_analyze}' analyzed with stemming:\n {stem_analyze(text_to_analyze)}")

'John bought carrots and potatoes' analyzed without stemming:
 ['john', 'bought', 'carrots', 'potatoes']

'John bought carrots and potatoes' analyzed with stemming:
 ['john', 'bought', 'carrot', 'potato']


Looking results using **20_Newsgroups**' dataset

In [76]:
#Amount of tokens extracted using stemming
train_counts = stem_vectorizer.fit_transform(twenty_train.data) 
print(f"Number of words extracted with stemming: {len(stem_vectorizer.get_feature_names_out())}")

Number of words extracted with stemming: 26888


In [77]:
#Amount of tokens extracted not using stemming
train_counts = vectorizer.fit_transform(twenty_train.data) 
print(f"Number of words extracted with stemming: {len(vectorizer.get_feature_names_out())}")

Number of words extracted with stemming: 35482


### Spanish implementation

In [81]:
#Importing a spanish corpus
from nltk.corpus import cess_esp
print(cess_esp.sents())

[['El', 'grupo', 'estatal', 'Electricité_de_France', '-Fpa-', 'EDF', '-Fpt-', 'anunció', 'hoy', ',', 'jueves', ',', 'la', 'compra', 'del', '51_por_ciento', 'de', 'la', 'empresa', 'mexicana', 'Electricidad_Águila_de_Altamira', '-Fpa-', 'EAA', '-Fpt-', ',', 'creada', 'por', 'el', 'japonés', 'Mitsubishi_Corporation', 'para', 'poner_en_marcha', 'una', 'central', 'de', 'gas', 'de', '495', 'megavatios', '.'], ['Una', 'portavoz', 'de', 'EDF', 'explicó', 'a', 'EFE', 'que', 'el', 'proyecto', 'para', 'la', 'construcción', 'de', 'Altamira_2', ',', 'al', 'norte', 'de', 'Tampico', ',', 'prevé', 'la', 'utilización', 'de', 'gas', 'natural', 'como', 'combustible', 'principal', 'en', 'una', 'central', 'de', 'ciclo', 'combinado', 'que', 'debe', 'empezar', 'a', 'funcionar', 'en', 'mayo_del_2002', '.'], ...]


In [93]:
#Importing spanish stopwords
from nltk.corpus import stopwords

stopwords_esp = stopwords.words('spanish')
print(stopwords_esp)

['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por', 'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más', 'pero', 'sus', 'le', 'ya', 'o', 'este', 'sí', 'porque', 'esta', 'entre', 'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay', 'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas', 'estoy', 'estás', 'está', 'estamos', 'estáis', 'están', 'e

In [82]:
#Initializing a spanish stemmer
spanish_stemmer = nltk.stem.SnowballStemmer('spanish')

In [None]:
spanish_CountVectorizer = CountVectorizer(min_df=1, stop_words=stopwords_esp)

In [83]:
#Defining a spanish vectorizer with stemmer class
class SpanishStemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(SpanishStemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([spanish_stemmer.stem(w) for w in analyzer(doc)])

In [94]:
#Creating an instance of the class before defined
spanish_stem_vectorizer = SpanishStemmedCountVectorizer(min_df=1, stop_words=stopwords_esp)

#Instantiating the spanish build analyzer
spanish_stem_analyze = spanish_stem_vectorizer.build_analyzer()

In [114]:
#Vectorizing the first and second sentence of the corpus
first_sentence = ' '.join(cess_esp.sents()[0])
second_sentence = ' '.join(cess_esp.sents()[1])

print(f"'{first_sentence}' analyzed with a stemmer:\n{spanish_stem_analyze(first_sentence)}\n")
print(f"'{second_sentence}' analyzed without a stemmer:\n{spanish_stem_analyze(second_sentence)}\n")

'El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del 51_por_ciento de la empresa mexicana Electricidad_Águila_de_Altamira -Fpa- EAA -Fpt- , creada por el japonés Mitsubishi_Corporation para poner_en_marcha una central de gas de 495 megavatios .' analyzed with a stemmer:
['grup', 'estatal', 'electricite_de_franc', 'fpa', 'edf', 'fpt', 'anunc', 'hoy', 'juev', 'compr', '51_por_cient', 'empres', 'mexican', 'electricidad_aguila_de_altamir', 'fpa', 'eaa', 'fpt', 'cre', 'japones', 'mitsubishi_corporation', 'poner_en_march', 'central', 'gas', '495', 'megavati']

'Una portavoz de EDF explicó a EFE que el proyecto para la construcción de Altamira_2 , al norte de Tampico , prevé la utilización de gas natural como combustible principal en una central de ciclo combinado que debe empezar a funcionar en mayo_del_2002 .' analyzed without a stemmer:
['portavoz', 'edf', 'explic', 'efe', 'proyect', 'construccion', 'altamira_2', 'nort', 'tampic', 'prev', 'utiliz', '