# Naive Bayes

In [58]:
import os
import random
import shutil
import io
import numpy
import pandas as pd
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def copy(name):
    originalfolder = "data/emails/emails/"+name
    copy_ham = "data/emails/train/"+name
    copy_test = "data/emails/test/"+name  # Carpeta para el 20% restante

    for carpeta in [copy_ham, copy_test]:
        if not os.path.exists(carpeta):
            os.makedirs(carpeta)
    if not os.path.exists(originalfolder):
        raise FileNotFoundError(f"La carpeta de origen no existe: {originalfolder}")

    archivos = [f for f in os.listdir(originalfolder) if os.path.isfile(os.path.join(originalfolder, f))]

    if len(archivos) == 0:
        raise ValueError("No hay archivos en la carpeta de origen.")

    random.shuffle(archivos)
    cantidad_train = int(len(archivos) * 0.8)

    archivos_train = archivos[:cantidad_train]
    archivos_test = archivos[cantidad_train:]

    for archivo in archivos_train:
        shutil.copy2(os.path.join(originalfolder, archivo), os.path.join(copy_ham, archivo))

    for archivo in archivos_test:
        shutil.copy2(os.path.join(originalfolder, archivo), os.path.join(copy_test, archivo))
copy("ham")
copy("spam")

In [59]:

#Retorna la ruta completa del archivo y su contenido (sin la cabecera)
def readFiles(path):
    #os.walk itera sobre todos los archivos de una carpeta
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                #se salta todo el encabezado y empieza a leer en el 
                #primera linea vacía
                elif line == '\n':
                    inBody = True
            f.close()
            #Join all items in a list into a string, using a \n as separator
            message = '\n'.join(lines)
            yield path, message

def dataFrameFromDirectory_test(path):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message})
        index.append(filename)

    return DataFrame(rows, index=index)

def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

#Un DataFrame con dos columnas, uno contiene el contenido del correo y el otro el tipo (spam o ham) 
data = DataFrame({'message': [], 'class': []})

#data = data.append(dataFrameFromDirectory('./datos/emails/spam', 'spam'))
#data = data.append(dataFrameFromDirectory('./datos/emails/ham', 'ham'))

# Un DataFrame con dos columnas, uno contiene el contenido del correo y el otro el tipo (spam o ham)
data_train_spam = dataFrameFromDirectory('data/emails/train/spam', 'spam')
data_train_ham = dataFrameFromDirectory('data/emails/train/ham', 'ham')
data_train = pd.concat([data_train_spam, data_train_ham], ignore_index=True)


In [60]:
data_train.head()

Unnamed: 0,message,class
0,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",spam
1,ATTENTION: This is a MUST for ALL Computer Use...,spam
2,This is a multi-part message in MIME format.\n...,spam
3,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,spam
4,This is the bottom line. If you can GIVE AWAY...,spam


In [61]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data_train['message'].values)

print('\ncounts')
print(type(counts))
counts


counts
<class 'scipy.sparse._csr.csr_matrix'>


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 431104 stored elements and shape (3000, 63860)>

In [62]:
targets = data_train['class'].values

print('\ntargets')
print(type(targets))
print("Shape:", targets.shape)  
print("Dimensions:", targets.ndim)  


targets
<class 'numpy.ndarray'>
Shape: (3000,)
Dimensions: 1


In [63]:
classifier = MultinomialNB()
classifier.fit(counts, targets)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


Probamos con algunos ejemplos:

In [64]:
data_test_spam = dataFrameFromDirectory_test('data/emails/test/spam')
data_test_ham = dataFrameFromDirectory_test('data/emails/test/ham')
data = pd.concat([data_test_spam, data_test_ham], ignore_index=True)

prediction = []  # asegúrate de que esté inicializada
for idx, row in data.iterrows():
    message = row['message']
    example_counts = vectorizer.transform([message])
    pred = classifier.predict(example_counts)[0]
    prediction.append((pred, message))  # guarda ambos valores

print("\nPredicciones realizadas:")
for pred, msg in prediction:
    print(f"{pred} -- msg: \"{msg[:50]}\"")



Predicciones realizadas:
spam -- msg: "Dear Homeowner,

 

Interest Rates are at their lo"
spam -- msg: "; Thu, 19 Sep 2002 11:30:56 +0100

    (may be for"
spam -- msg: "=20



=20



=20



=20



=20



Call today for "
ham -- msg: "This is the bottom line.  If you can GIVE AWAY CD'"
spam -- msg: "------=_NextPart_000_00B8_51E06B6A.C8586B31

Conte"
spam -- msg: "<HR>

<html>

<head>

  <title>Secured Investement"
spam -- msg: "<table width="600" border="20" align="center" bord"
spam -- msg: "Unlist Information



		This message is brought to"
spam -- msg: "Dear Consumers, Increase your Business Sales! 



"
ham -- msg: "; Tue, 24 Sep 2002 15:15:57 +0100

    webnote.net"
ham -- msg: "UNLIMITED WEB CON=

FERENCING



Subscribe to the "
spam -- msg: "------=_NextPart_000_00A3_65E24E1C.A3468E63



Q09"
spam -- msg: "=20



=20



=20



=20



=20



=20



        "
ham -- msg: "; Thu, 22 Aug 2002 16:34:25 +0100

    (8.9.3/8.9."
spam -- msg: "ABIDJAN, IVORY COAST  

WEST-AFRICA.  



Actividades:
- Probar correos spam nuevos y ver que resultado arroja
- Hacer train/test para probar el modelo