# Task 2

In [12]:
import pandas as pd
import re
from collections import Counter

# Cargar archivo
df = pd.read_csv("entrenamiento.txt", sep="\t", header=None, names=["label", "text"])

df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


1. Pre-Procesamiento

In [13]:
#Limpieza datos
def clean_text(text):
    # pasar a minúsculas
    text = text.lower()
    
    # eliminar puntuación y caracteres especiales
    text = re.sub(r'[^a-z\s]', '', text)
    
    # eliminar espacios extra
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Aplicar limpieza
df['clean_text'] = df['text'].apply(clean_text)

df.head()

Unnamed: 0,label,text,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [14]:
#Generación vocabulario
# Unir todo el texto limpio
all_words = " ".join(df['clean_text'])

# Separar en palabras
words = all_words.split()

# Crear vocabulario (palabras únicas)
vocabulario = sorted(set(words))

# Tamaño del vocabulario
print("Tamaño del vocabulario:", len(vocabulario))

# Ver algunas palabras
vocabulario[:20]


Tamaño del vocabulario: 8569


['a',
 'aa',
 'aah',
 'aaooooright',
 'aathilove',
 'aathiwhere',
 'ab',
 'abbey',
 'abdomen',
 'abeg',
 'abelu',
 'aberdeen',
 'abi',
 'ability',
 'abiola',
 'abj',
 'able',
 'abnormally',
 'about',
 'aboutas']

2. Entrenamiento

In [15]:
#Calcule las probabilidades a priori (Priors): P(Spam) y P(Ham)
# Total de mensajes
N = len(df)

# Conteo por clase
n_spam = len(df[df['label'] == 'spam'])
n_ham = len(df[df['label'] == 'ham'])

# Priors
P_spam = n_spam / N
P_ham = n_ham / N

print("P(Spam):", P_spam)
print("P(Ham):", P_ham)


P(Spam): 0.1342800647132842
P(Ham): 0.8657199352867158


In [16]:
#Calcule los Likelihoods con Laplace Smoothing (k=1) para cada palabra en el vocabulario
spam_texts = df[df['label'] == 'spam']['clean_text']
ham_texts = df[df['label'] == 'ham']['clean_text']

# Unir textos
spam_words = " ".join(spam_texts).split()
ham_words = " ".join(ham_texts).split()

# Contadores
spam_word_counts = Counter(spam_words)
ham_word_counts = Counter(ham_words)

# Total de palabras por clase
total_spam_words = len(spam_words)
total_ham_words = len(ham_words)

V = len(vocabulario)

#calcular Likelihoods
likelihoods = pd.DataFrame(index=vocabulario)
likelihoods['P(w|Spam)'] = 0.0
likelihoods['P(w|Ham)'] = 0.0

for word in vocabulario:
    likelihoods.loc[word, 'P(w|Spam)'] = (
        spam_word_counts.get(word, 0) + 1
    ) / (total_spam_words + V)

    likelihoods.loc[word, 'P(w|Ham)'] = (
        ham_word_counts.get(word, 0) + 1
    ) / (total_ham_words + V)

likelihoods.head()

Unnamed: 0,P(w|Spam),P(w|Ham)
a,0.015606,0.014068
aa,4.1e-05,2.7e-05
aah,4.1e-05,5.4e-05
aaooooright,4.1e-05,2.7e-05
aathilove,4.1e-05,5.4e-05
