In [10]:
import re

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources used by this notebook: the stop-word lists read
# just below, plus the punkt / punkt_tab tokenizer data (presumably required
# by the nltk.word_tokenize call made during text cleaning).
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/chris/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/chris/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [26]:
import pandas as pd

# Tab-separated SMS dataset; the two column names are supplied explicitly.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/sms.tsv"

# Numeric encoding of the class column: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

df = (
    pd.read_csv(url, sep='\t', names=["label", "text"])
    .assign(label=lambda frame: frame['label'].map(label_codes))
)
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Quick structural overview of the loaded frame: column names, missing values
# per column, and the class balance of the label column.
print("Columnas:", df.columns)
# sep="" stops print() from inserting its default space right after the
# heading's trailing "\n"; that space used to indent the first row of the
# Series output and misalign it with the rows below.
print("\nValores nulos por columna:\n", df.isna().sum(), sep="")
print("\nClases:\n", df['label'].value_counts(), sep="")


Columnas: Index(['label', 'text'], dtype='object')

Valores nulos por columna:
 label    0
text     0
dtype: int64

Clases:
 label
0    4825
1     747
Name: count, dtype: int64


In [28]:
# Character count of each raw message, stored alongside the text.
df = df.assign(length=df['text'].str.len())
df.loc[:, ['text', 'length']].head()


Unnamed: 0,text,length
0,"Go until jurong point, crazy.. Available only ...",111
1,Ok lar... Joking wif u oni...,29
2,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,U dun say so early hor... U c already then say...,49
4,"Nah I don't think he goes to usf, he lives aro...",61


In [29]:
def clean_text(text):
    """Normalize one message into a space-separated string of content words.

    The text is lower-cased, every character that is not a-z or whitespace
    is replaced by a space, the result is tokenized with NLTK, and tokens
    that are stop words or only one character long are dropped.

    Args:
        text: The raw message as a string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    # Lower-case first so the a-z character class below covers every letter kept.
    letters_only = re.sub(r"[^a-z\s]", " ", text.lower())

    kept = []
    for token in nltk.word_tokenize(letters_only):
        # Skip single-character leftovers and stop words.
        if len(token) > 1 and token not in stop_words:
            kept.append(token)

    return " ".join(kept)

# Run the cleaner over every message (cast to str first) and keep the result
# in a new column next to the original text.
raw_messages = df['text'].astype(str)
df['clean_text'] = raw_messages.map(clean_text)

# Original vs. cleaned text, side by side.
df.loc[:, ['text', 'clean_text']].head(10)


Unnamed: 0,text,clean_text
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis great wo...
1,Ok lar... Joking wif u oni...,ok lar joking wif oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,U dun say so early hor... U c already then say...,dun say early hor already say
4,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
5,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darling week word back like fun st...
6,Even my brother is not like to speak with me. ...,even brother like speak treat like aids patent
7,As per your request 'Melle Melle (Oru Minnamin...,per request melle melle oru minnaminunginte nu...
8,WINNER!! As a valued network customer you have...,winner valued network customer selected receiv...
9,Had your mobile 11 months or more? U R entitle...,mobile months entitled update latest colour mo...


In [30]:
# Save the cleaned dataset to disk (row index omitted) and confirm.
df.to_csv(path_or_buf="sms_clean.csv", index=False)
print("Dataset limpio guardado como sms_clean.csv")

Dataset limpio guardado como sms_clean.csv


In [31]:
# Sanity check before vectorizing: count missing values in the two columns
# used downstream, then show the label column itself.
for column in ('label', 'clean_text'):
    print(df[column].isna().sum())
print(df['label'])

0
0
0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64


Selección de características con TF-IDF + modelo superficial

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# 1. TF-IDF vectorization of the cleaned messages
vectorizer = TfidfVectorizer(max_features=5000)  # cap the vocabulary at 5000 terms
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

print("Matriz TF-IDF:", X.shape)

# 2. Feature selection (chi-squared)
# NOTE(review): the vectorizer and the selector are both fitted on the whole
# dataset here; if a train/test split is made later, fit them on the training
# part only so that no information from the test rows leaks into the features.
k = 1000  # number of features to keep
selector = SelectKBest(chi2, k=k)
X_new = selector.fit_transform(X, y)

print("Matriz después de selección:", X_new.shape)

# 3. Names of the retained features
# Fetch the vocabulary array once and index it with the selector's boolean
# support mask, instead of rebuilding the full array for every selected index.
# The names come back in vocabulary order (alphabetical in the saved output),
# not ranked by chi2 score, so the first 20 are a sample, not the "top 20".
feature_names = vectorizer.get_feature_names_out()
selected_features = feature_names[selector.get_support()].tolist()
print("Ejemplo de features seleccionadas:", selected_features[:20])


Matriz TF-IDF: (5572, 5000)
Matriz después de selección: (5572, 1000)
Ejemplo de features seleccionadas: ['ab', 'abta', 'ac', 'access', 'accident', 'account', 'acl', 'aco', 'action', 'activate', 'active', 'adam', 'addamsfa', 'admirer', 'ads', 'adult', 'advise', 'ae', 'ag', 'age']
