In [8]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import tqdm
from tqdm import tqdm  # Para usar tqdm directamente en loops

# Herramientas de NLP de NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Herramientas de Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Descarga de recursos NLTK si no est√°n instalados
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Krishna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# Ajusta la ruta del archivo si est√° en otra carpeta
df = pd.read_csv("training.1600000.processed.noemoticon.csv", 
                 encoding='latin1', 
                 header=None)

# Observa que este dataset normalmente NO tiene cabeceras;
# las columnas suelen ser: [0: Sentimiento, 1: ID, 2: Fecha, 3: Query, 4: Usuario, 5: Tweet]
# Asegur√©monos de quedarnos solo con Sentimiento (col 0) y Texto (col 5)

df = df[[0,5]]  # Tomamos √∫nicamente la 1ra y la √∫ltima
df.columns = ['sentiment', 'tweet']  # Renombramos
df.head()


Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [12]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 4 else 0)
df.head()


Unnamed: 0,sentiment,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [14]:
def limpiar_texto(text):
    # Manejar valores faltantes
    if pd.isna(text):
        return ""

    # 1. Convertir a min√∫sculas
    text = text.lower()

    # 2. Eliminar URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # 3. Eliminar menciones (@usuario)
    text = re.sub(r"@\w+", '', text)

    # 4. Eliminar d√≠gitos y caracteres de puntuaci√≥n
    text = re.sub(r"\d+", '', text)  # quita n√∫meros
    text = text.translate(str.maketrans('', '', string.punctuation))

    # 5. Tokenizar
    tokens = word_tokenize(text)

    # 6. Eliminar palabras vac√≠as (stopwords)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 7. Lematizar
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Unir tokens limpios en un solo string
    return " ".join(tokens)


In [16]:
# Activar barra de progreso de tqdm para m√©todos de pandas
tqdm.pandas()

df['tweet'] = df['tweet'].progress_apply(limpiar_texto)
df.head()


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1600000/1600000 [07:21<00:00, 3625.09it/s]


Unnamed: 0,sentiment,tweet
0,0,thats bummer shoulda got david carr third day
1,0,upset cant update facebook texting might cry r...
2,0,dived many time ball managed save rest go bound
3,0,whole body feel itchy like fire
4,0,behaving im mad cant see


In [17]:
# Limitamos el n√∫mero de features para que no estalle la RAM
vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(df['tweet'])  # X ser√° una sparse matrix
y = df['sentiment']  # Etiquetas

# Opcional: convertir en DataFrame si lo deseas, aunque lo usual es mantenerlo como sparse
# tfidf_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())
# tfidf_df.to_csv("tweets_tfidf.csv", index=False)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    random_state=42
)

print(f"Datos de entrenamiento: {X_train.shape}")
print(f"Datos de prueba: {X_test.shape}")


Datos de entrenamiento: (1280000, 5000)
Datos de prueba: (320000, 5000)


In [19]:
modelo = LogisticRegression(max_iter=1000)
modelo.fit(X_train, y_train)

print("‚úÖ Modelo de Regresi√≥n Log√≠stica entrenado correctamente.")


‚úÖ Modelo de Regresi√≥n Log√≠stica entrenado correctamente.


In [20]:
# 7.1. Predicciones
y_pred = modelo.predict(X_test)

# 7.2. Precisi√≥n
accuracy = accuracy_score(y_test, y_pred)
print(f"üéØ Precisi√≥n del modelo: {accuracy:.4f}")

# 7.3. Reporte de Clasificaci√≥n
print("\nüìä Reporte de Clasificaci√≥n:\n", classification_report(y_test, y_pred))


üéØ Precisi√≥n del modelo: 0.7736

üìä Reporte de Clasificaci√≥n:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77    159494
           1       0.76      0.80      0.78    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

