<a href="https://colab.research.google.com/github/LuFernandez/neural-networks/blob/master/naive_bayesV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naive Bayes
## Redes Neuronales

© 2020 Lucero G. Fernandez



In [0]:
%load_ext autoreload
%autoreload 2
from IPython.display import clear_output
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Carga de datos


In [0]:
! wget "https://raw.githubusercontent.com/rn-2019-itba/Clase-2--Hiperparametros-y-Tecnicas-de-Validacion/master/Opcional/data/emails.csv"
clear_output()

In [0]:
dataset = pd.read_csv('emails.csv')

Se tienen 5728 emails de dos tipos: text y spam. En text están las palabras y en spam si es o no spam (1==spam)

In [0]:
#for i in range(len(dataset['spam'])):
#  if dataset.spam[i]==1:
 #   print(dataset.spam[i])

 #la cantidad de emails que son spam
 spam_count=np.sum(dataset.spam)

Probabilidad a priori de que sea spam:

In [5]:
print(spam_count/len(dataset)*100)

23.88268156424581


#Preprocesamiento de datos


In [0]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [0]:
nltk.download('punkt')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
stemmer = PorterStemmer()
clear_output()

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB


##\#1 tokenization, lemmatization, stop-words, stemming

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  stop = [x for x in lem if x not in stopwords.words('english')] #stop words
  stem = [stemmer.stem(x) for x in stop] #stemming
  alpha = [x for x in stem if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))




Guardo en disco

In [0]:
import pickle

with open('em_filt.pck', 'wb') as fp:
    pickle.dump(emails_filtered, fp)

In [0]:
with open ('em_filt.pck', 'rb') as fp:
    itemlist = pickle.load(fp)

###TfidVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100, 250]
alpha=[.001, .01, .1, .5, 1.0, 2.0]


chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #TfidVectorizer
    tfidf_vect = TfidfVectorizer(max_df=maximo,min_df=minimo)
    raw_data = tfidf_vect.fit_transform(itemlist)

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      #print(porc)
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [13]:
print("Con tokenization, lemmatization, stop-words y stemming")
print("con TfidVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, lemmatization, stop-words y stemming
con TfidVectorizer
alpha= 0.01
max_df= 0.7
min_df= 10

Rendimiento= 98.44840961986036


###CountVectorizer


In [0]:
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #Count Vectorizer

    count_vect = CountVectorizer(max_df=maximo, min_df=minimo)
    raw_data=count_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [15]:
print("Con tokenization, lemmatization, stop-words y stemming")
print("con CountVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, lemmatization, stop-words y stemming
con CountVectorizer
alpha= 0.1
max_df= 0.75
min_df= 10

Rendimiento= 98.56477889837083



##\#2 tokenization, stop-words, stemming

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  #lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  stop = [x for x in tok if x not in stopwords.words('english')] #stop words
  stem = [stemmer.stem(x) for x in stop] #stemming
  alpha = [x for x in stem if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))




###TfidVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100]
alpha=[.001, .01, .1, .5, 1, 2]


chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #TfidVectorizer
    tfidf_vect = TfidfVectorizer(max_df=maximo,min_df=minimo)
    raw_data = tfidf_vect.fit_transform(emails_filtered)

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      #print(porc)
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [20]:
print("Con tokenization, stop-words y stemming")
print("con TfidVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, stop-words y stemming
con TfidVectorizer
alpha= 0.1
max_df= 0.75
min_df= 30

Rendimiento= 98.09930178432894


###CountVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100]
alpha=[.001, .01, .1, .5, 1.0, 2.0]
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #Count Vectorizer

    count_vect = CountVectorizer(max_df=maximo, min_df=minimo)
    raw_data=count_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [22]:
print("Con tokenization, stop-words y stemming")
print("con CountVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, stop-words y stemming
con CountVectorizer
alpha= 0.1
max_df= 0.5
min_df= 10

Rendimiento= 98.48719937936384



##\#3 tokenization, lemmatization, stemming

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  #stop = [x for x in lem if x not in stopwords.words('english')] #stop words
  stem = [stemmer.stem(x) for x in lem] #stemming
  alpha = [x for x in stem if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))


###TfidVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100]
alpha=[.001, .01, .1, .5, 1.0, 2.0]
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #TfidVectorizer
    tfidf_vect = TfidfVectorizer(max_df=maximo, min_df=minimo)
    raw_data = tfidf_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [12]:
print("Con tokenization, stop-words y stemming")
print("con TfidVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, lemmatization y stemming
con TfidVectorizer
alpha= 0.1
max_df= 0.85
min_df= 10

Rendimiento= 98.21567106283942


###CountVectorizer


In [0]:
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #Count Vectorizer

    count_vect = CountVectorizer(max_df=maximo, min_df=minimo)
    raw_data=count_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [15]:
print("Con tokenization, stop-words y stemming")
print("con CountVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, lemmatization y stemming
con CountVectorizer
alpha= 0.1
max_df= 0.8
min_df= 30

Rendimiento= 98.7199379363848



##\#4 tokenization, lemmatization, y stop-words

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  stop = [x for x in lem if x not in stopwords.words('english')] #stop words
  #stem = [stemmer.stem(x) for x in stop] #stemming
  alpha = [x for x in stop if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))


###TfidVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100]
alpha=[.001, .01, .1, .5, 1.0, 2.0]
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #TfidVectorizer
    tfidf_vect = TfidfVectorizer(max_df=maximo, min_df=minimo)
    raw_data = tfidf_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [18]:
print("Con tokenization,lemmatization, y stop-words")
print("con TfidVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, stop-words y stemming
con TfidVectorizer
alpha= 0.1
max_df= 0.5
min_df= 10

Rendimiento= 98.33204034134988


###CountVectorizer


In [0]:
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #Count Vectorizer

    count_vect = CountVectorizer(max_df=maximo, min_df=minimo)
    raw_data=count_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [20]:
print("Con tokenization,lemmatization, y stop-words")
print("con CountVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, stop-words y stemming
con CountVectorizer
alpha= 0.1
max_df= 0.85
min_df= 20

Rendimiento= 98.52598913886735



##\#5 tokenization y lemmatization

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  #stop = [x for x in lem if x not in stopwords.words('english')] #stop words
  #stem = [stemmer.stem(x) for x in stop] #stemming
  alpha = [x for x in lem if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))


###TfidVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100]
alpha=[.001, .01, .1, .5, 1.0, 2.0]
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #TfidVectorizer
    tfidf_vect = TfidfVectorizer(max_df=maximo, min_df=minimo)
    raw_data = tfidf_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [23]:
print("Con tokenization, y lemmatization")
print("con TfidVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, y lemmatization
con TfidVectorizer
alpha= 0.1
max_df= 0.8
min_df= 20

Rendimiento= 98.37083010085338


###CountVectorizer


In [0]:
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #Count Vectorizer

    count_vect = CountVectorizer(max_df=maximo, min_df=minimo)
    raw_data=count_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [25]:
print("Con tokenization, y lemmatization")
print("con Count Vectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, y lemmatization
con TfidVectorizer
alpha= 0.01
max_df= 0.5
min_df= 30

Rendimiento= 98.79751745539178



##\#6 tokenization y stop-words

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  #lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  stop = [x for x in tok if x not in stopwords.words('english')] #stop words
  #stem = [stemmer.stem(x) for x in stop] #stemming
  alpha = [x for x in stop if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))


###TfidVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100]
alpha=[.001, .01, .1, .5, 1.0, 2.0]
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #TfidVectorizer
    tfidf_vect = TfidfVectorizer(max_df=maximo, min_df=minimo)
    raw_data = tfidf_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [28]:
print("Con tokenization, y stop-words")
print("con TfidVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, y stop-words
con TfidVectorizer
alpha= 0.1
max_df= 0.9
min_df= 10

Rendimiento= 98.13809154383243


###CountVectorizer


In [0]:
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #Count Vectorizer

    count_vect = CountVectorizer(max_df=maximo, min_df=minimo)
    raw_data=count_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [30]:
print("Con tokenization, y stop-words")
print("con CountVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, y stop-words
con TfidVectorizer
alpha= 0.1
max_df= 0.5
min_df= 10

Rendimiento= 98.40961986035687



##\#7 tokenization y stemming

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  #lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  #stop = [x for x in lem if x not in stopwords.words('english')] #stop words
  stem = [stemmer.stem(x) for x in tok] #stemming
  alpha = [x for x in stem if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))


###TfidVectorizer


In [0]:
max_df=[.5, .6, .7, .75, .8, .85, .9]
min_df=[10, 20, 30, 40, 50, 75, 100]
alpha=[.001, .01, .1, .5, 1.0, 2.0]
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #TfidVectorizer
    tfidf_vect = TfidfVectorizer(max_df=maximo, min_df=minimo)
    raw_data = tfidf_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [33]:
print("Con tokenization, y stemming")
print("con TfidVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, y stemming
con TfidVectorizer
alpha= 0.1
max_df= 0.6
min_df= 20

Rendimiento= 98.2544608223429


###CountVectorizer


In [0]:
chosen_alpha=0
chosen_maxdf=0
chosen_mindf=0
maximum_porc=0
for minimo in min_df:
  for maximo in max_df:

    #Count Vectorizer

    count_vect = CountVectorizer(max_df=maximo, min_df=minimo)
    raw_data=count_vect.fit_transform(emails_filtered)
    raw_data.toarray()

    #Separo en train, validation, y test
    X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
    X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

    #entreno modelo
    for alfa in alpha:
      clf = MultinomialNB(alpha=alfa)
      clf.fit(X_train, y_train)
      #Analizo la precisión del modelo, con X_val e y_val
      porc=sum(np.array(clf.predict(X_val.toarray()))==np.array(y_val))/len(y_val)*100
      if porc>maximum_porc:
        maximum_porc=porc  #guardo valores
        chosen_alpha=alfa
        chosen_maxdf=maximo
        chosen_mindf=minimo

In [35]:
print("Con tokenization, y stemming")
print("con CountVectorizer")
print("alpha=", chosen_alpha)
print("max_df=", chosen_maxdf)
print("min_df=", chosen_mindf)
print("")
print("Rendimiento=", maximum_porc)

Con tokenization, y stemming
con TfidVectorizer
alpha= 0.1
max_df= 0.9
min_df= 10

Rendimiento= 98.7199379363848


#Resultados y modelo elegido


Con todo lo analizado anteriormente, se procede a copiar los resultados obtenidos y elegir el modelo para utilizar sobre X_test e y_test

*   Con tokenization, lemmatization, stop-words y stemming
  *   con TfidVectorizer: alpha= 0.01, max_df= 0.7, min_df= 10
      *   Rendimiento= 98.44840961986036
  *   con CountVectorizer: alpha= 0.1, max_df= 0.75, min_df= 10
      *   Rendimiento= 98.56477889837083

*   Con tokenization, stop-words y stemming
  *   con TfidVectorizer: alpha= 0.1, max_df= 0.75, min_df= 30
      *   Rendimiento= 98.09930178432894
  *   con CountVectorizer: alpha= 0.1, max_df= 0.5, min_df= 10
      *   Rendimiento= 98.48719937936384

*   Con tokenization, lemmatization y stemming
  *   con TfidVectorizer: alpha= 0.1, max_df= 0.85, min_df= 10
      *   Rendimiento= 98.21567106283942
  *   con CountVectorizer: alpha= 0.1, max_df= 0.8, min_df= 30
      *   Rendimiento= 98.7199379363848

*   Con tokenization, stop-words y stemming
  *   con TfidVectorizer: alpha= 0.1, max_df= 0.5, min_df= 10
      *   Rendimiento= 98.33204034134988
  *   con CountVectorizer: alpha= 0.1, max_df= 0.85, min_df= 20
      *   Rendimiento= 98.52598913886735

*   Con tokenization, y lemmatization
  *   con TfidVectorizer: alpha= 0.1, max_df= 0.8, min_df= 20
      *   Rendimiento= 98.37083010085338
  *   con CountVectorizer: alpha= 0.01, max_df= 0.5, min_df= 30
      *   Rendimiento= 98.79751745539178

*   Con tokenization, y stop-words
  *   con TfidVectorizer: alpha= 0.1, max_df= 0.9, min_df= 10
      *   Rendimiento= 98.13809154383243
  *   con CountVectorizer: alpha= 0.1, max_df= 0.9, min_df= 10
      *   Rendimiento= 98.40961986035687

*   Con tokenization, y stemming
  *   con TfidVectorizer: alpha= 0.1, max_df= 0.6, min_df= 20
      *   Rendimiento= 98.2544608223429
  *   con CountVectorizer: alpha= 0.1, max_df= 0.9, min_df= 10
      *   Rendimiento= 98.7199379363848

##Test sobre el modelo elegido



Se decidió utilizar el modelo con el que se obtuvo el mayor porcentaje, este fue: 

Con tokenization, y lemmatization, countVectorizer y los siguientes parámetros:

alpha= 0.01, max_df= 0.5, min_df= 30.

Cabe aclarar que se esperaba que los resultados fueran mejores al usar TfidVectorizer que CountVectorizer, pero los resultados obtenidos fueron opuestos, es decir, se obtuvieron mejores con CountVectorizer.

In [0]:
emails_raw=dataset.text
emails_filtered=list()

for n in range(len(dataset)):
  tok = word_tokenize(emails_raw[n])  #tokenization
  lem = [lemmatizer.lemmatize(x,pos = 'v') for x in tok] #lemmatization
  alpha = [x for x in lem if x.isalpha()]  #filter non words

  emails_filtered.append(" ".join(alpha))


In [42]:
#Count Vectorizer

count_vect = CountVectorizer(max_df=.5, min_df=30)
raw_data=count_vect.fit_transform(emails_filtered)
raw_data.toarray()

#Separo en train, validation, y test
X_train, X_testprima, y_train, y_testprima = train_test_split(raw_data, dataset['spam'], test_size=0.90)
X_val, X_test, y_val, y_test = train_test_split(X_testprima, y_testprima, test_size=0.5)

#entreno modelo
clf = MultinomialNB(alpha=0.01)
clf.fit(X_train, y_train)
#Analizo la precisión del modelo, con X_test e y_test
porc=sum(np.array(clf.predict(X_test.toarray()))==np.array(y_test))/len(y_test)*100
f'El porcentaje de emails clasificados correctamente es de {porc}%'

'El porcentaje de emails clasificados correctamente es de 97.59503491078355%'

Como era esperado, al usar el modelo en los datos reservados para test, el porcentaje de emails clasificados es menor a cuando se usa sobre los datos de validación (98.7% vs 97.6%).