# Plantilla de desarrollo para primer examen parcial

**Pautas:**
- La presente plantilla es un EJEMPLO de cómo ordenar el código de tu examen
- Tienes la libertad DE AGREGAR todos los métodos y secciones en el examen que consideres necesarias
- Realizar el desarrollo por medio de métodos, por ejemplo, ReadInfo(), TrainModel(), etc 
- Los métodos deberán de estar lo mas claro y modularizados que sea posible
- Realizar la documentación de cada método por medio de comentarios y DocStrings 
- Deberás de utilizar un modelo de ML o algún ensamble de os mismos (SVC, DT, NB, KNN, etc)
- Recuerda que puedes usar un split de los datos para entrenamiento y validación
- Puedes revisar la documentación de Sklearn, o la librría que decidas utilizar para entender los parámetros de entrenamiento de los modelos
- NO está permitido el uso de modelos de Deep Learning (DNN, CNN, LSTM, etc.) NI el uso de embeddings

## Librerías a utilizar

In [18]:
# Librerías profe
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer

# Librerías actuales
import pandas as pd
import random
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score

nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /home/eubgo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eubgo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/eubgo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Lectura de Dataset

In [19]:
# Aquí cargamos la información del DataSet de entrenamiento
def read_corpus(path):
    """Este método lee los de datos del corpus y los pasa a un dataFrame

    Args:
        path (string): Ubicación del archivo de entrada (Corpus)
    """
    df = pd.read_csv(path)
    df.index = np.arange(1, len(df) + 1)
    print("Elementos en el DataSet:", len(df))
    return df

## Feature Engineering

In [20]:
from textblob import TextBlob

def feature_engineering(df):
    # Longitud de la noticia
    df['length'] = df['title'].apply(len)

    # Cantidad de palabras únicas
    df['unique_words'] = df['title'].apply(lambda x: len(set(x.split())))

    # Presencia de números
    df['numbers_count'] = df['title'].apply(lambda x: sum(c.isdigit() for c in x))

    # Presencia de signos de exclamación
    df['exclamation_count'] = df['title'].apply(lambda x: x.count('!'))

    # Sentimiento del texto
    df['sentiment'] = df['title'].apply(lambda x: TextBlob(x).sentiment.polarity)

    # Frecuencia de palabras clave (por ejemplo: "top", "best", "first", "most", etc.)
    keywords = ["top", "best", "first", "most", "amazing", "incredible"]
    for keyword in keywords:
        df[f'keyword_{keyword}'] = df['title'].apply(lambda x: x.split().count(keyword))

    return df

## Data Augmentation

In [21]:

def get_synonyms(word):
    """Obtiene sinónimos de una palabra usando WordNet.

    Args:
        word (str): Palabra para la cual obtener sinónimos.

    Returns:
        list: Lista de sinónimos.
    """
    from nltk.corpus import wordnet
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

def replace_with_synonym(sentence):
    """Reemplaza palabras en una oración con un sinónimo aleatorio.

    Args:
        sentence (str): Oración a modificar.

    Returns:
        str: Oración modificada.
    """
    words = sentence.split()
    for i, word in enumerate(words):
        synonyms = get_synonyms(word)
        if synonyms:
            words[i] = random.choice(synonyms).replace("_", " ")
    return ' '.join(words)

def augment(df):
    df_augmented = df.copy()
    clickbait_rows = df_augmented[df_augmented["label"] == "clickbait"]
    clickbait_rows["title"] = clickbait_rows["title"].apply(replace_with_synonym)
    df_combined = pd.concat([df, clickbait_rows], ignore_index=True)
    return df_combined


## Data reduction

In [22]:
def dataReduction(df): # Descartado por accuracy tan bajo
    """Método para reducir el número de datos en el dataFrame e igualar news y clickbait

    Args:
        df (dataframe): Dataframe a reducir

    Returns:
        df: Dataframe reducido
    """
    # Recalcular las variables news y clickbait después de leer el nuevo DataFrame
    news = df[df["label"] == "news"]
    clickbait = df[df["label"] == "clickbait"]

    # Reducir news para igualar el número de news y clickbait
    news = news.sample(len(clickbait))
    df = pd.concat([news, clickbait], ignore_index=True)

    return df

## Preprocesamiento

In [23]:
def preprocess(df):
    """Método para preprocesar el texto y agregar características adicionales

    Args:
        df (dataframe): Dataframe a aplicar transformaciones

    Returns:
        dataframe: Dataframe transformado con características adicionales
    """
    
    for column in df.columns:
        df[column] = df[column].str.lower()

    stop_words = stopwords.words("english")
    df["title"] = df["title"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))

    df["title"] = df["title"].apply(nltk.word_tokenize)

    lemmatizer = WordNetLemmatizer()

    df["title"] = df["title"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x if not word.isdigit()])
    
    df = feature_engineering(df)

    return df


## Entrenamiento del modelo

In [24]:
# def train_model(x, y):
#     """Este método realiza el entrenamiento del modelo (Ejemplo)

#     Args:
#         x (list): Lista con los textos a transformar
#         y (list): Lista con los valores de y (Salida)

#     Returns:
#         model: Modelo entrenado
#     """
#     # Parámetros para GridSearchCV
#     param_grid = {
#         'n_estimators': [100, 200, 300],
#         'max_depth': [None, 10, 20, 30],
#         'min_samples_split': [2, 5, 10]
#     }

#     grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
#     grid_search.fit(x, y)

#     # Entrenando el modelo con los mejores parámetros
#     best_model = grid_search.best_estimator_
#     best_model.fit(x, y)
#     return best_model

def train_model(x, y):
    """Este método realiza el entrenamiento del modelo

    Args:
        x (array): Matriz de características (TF-IDF)
        y (list): Lista con los valores de y (Salida)

    Returns:
        model: Modelo entrenado de Regresión Logística
    """
    model = LogisticRegression()
    model.fit(x, y)
    return model


## Validación del modelo

In [25]:
def validate_model(y_test, predicciones):
    # Impresión de matriz de confusión
    # print("Matriz de confusión:")
    # print(confusion_matrix(Y_test, Predicciones))

    # Impresión de procentaje de Accuracy del modelo
    print("\nAccuracy del modelo: ")
    print(metrics.accuracy_score(y_test, predicciones))

    # Impresión de las métricas para el modelo
    print("\nMétricas de evaluación:")
    print(classification_report(y_test, predicciones))

## Pipeline de todo el proceso

In [26]:
# path = 'DataSet para entrenamiento del modelo.csv'

# # Carga la información y crea un DataFrame
# df = read_corpus(path)

# # Data augmentation (si lo deseas)
# for i in range(2):
#     df = augment(df)

# # Preprocesamiento
# df_pre = preprocess(df)

# # División de los datos
# X_text = df_pre['title'].values.tolist()
# X_text = [' '.join(doc) for doc in X_text]
# y = df_pre['label'].values.tolist()

# # Crea el vectorizador TF-IDF
# tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))
# X_tfidf = tfidf_vect.fit_transform(X_text)

# # Añade las características adicionales (longitud del texto y diversidad léxica)
# df_pre['text_length'] = df_pre['title'].apply(len)
# df_pre['lexical_diversity'] = df_pre['title'].apply(lexical_diversity)

# # Divide los datos en conjuntos de entrenamiento y prueba
# X_train_tfidf, X_test_tfidf, X_train_feats, X_test_feats, y_train, y_test = train_test_split(
#     X_tfidf, df_pre[['text_length', 'lexical_diversity']].values, y,  train_size=0.2, test_size=0.1, random_state=42)

# # Combina las características TF-IDF y las características adicionales
# X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_feats))
# X_test_combined = np.hstack((X_test_tfidf.toarray(), X_test_feats))

# # Entrena el modelo (en este caso, Regresión Logística)
# model = train_model(X_train_combined, y_train)

# # Realiza predicciones
# y_pred = model.predict(X_test_combined)

# # Calculate and display accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.2f}')

# # Evalúa el modelo
# print(classification_report(y_test, y_pred))

# # Ideas pensadas
# # Logistic + DA + FE2 = 0.80
# # Logistic + DA = 8'
# # Random Forest solito
# # Random Forest con GridSearch
# # Random Forest con  RandomizedSearch

# path = 'DataSet para entrenamiento del modelo.csv'

# # Carga la información y crea un DataFrame
# df = read_corpus(path)

# # Data augmentation (si lo deseas)
# for i in range(2):
#     df = augment(df)

# # Preprocesamiento
# df_pre = preprocess(df)

# # División de los datos
# X_text = df_pre['title'].values.tolist()
# X_text = [' '.join(doc) for doc in X_text]
# y = df_pre['label'].values.tolist()

# # Crea el vectorizador TF-IDF
# tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))
# X_tfidf = tfidf_vect.fit_transform(X_text)

# # Divide los datos en conjuntos de entrenamiento y prueba
# X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
#     X_tfidf, y,  train_size=0.2, test_size=0.1, random_state=42)

# # Entrena el modelo (en este caso, Regresión Logística)
# model = train_model(X_train_tfidf, y_train)

# # Realiza predicciones
# y_pred = model.predict(X_test_tfidf)

# # Calculate and display accuracy
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.2f}')

# # Evalúa el modelo
# print(classification_report(y_test, y_pred))






## Guardado de modelo

In [27]:
# # Pickle para guardar modelos
# import pickle

# filename = "model_KikeMau.pickle"

# # Guardar el modelo
# pickle.dump(model, open(filename, "wb"))


## Prueba del modelo (Parte mas importante)

In [30]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess(df):
    """Método para preprocesar el texto

    Args:
        df (dataframe): Dataframe a aplicar transformaciones

    Returns:
        dataframe: Dataframe transformado
    """
    
    for column in df.columns:
        df[column] = df[column].str.lower()

    stop_words = stopwords.words("english")
    df["title"] = df["title"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))
    df["title"] = df["title"].apply(nltk.word_tokenize)

    lemmatizer = WordNetLemmatizer()
    df["title"] = df["title"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x if not word.isdigit()])
    
    return df

# Carga de datos y preprocesamiento
path = 'DataSet para entrenamiento del modelo.csv'
df = pd.read_csv(path)

# Preprocesamiento
df_pre = preprocess(df)

# Vectorización del texto
tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))
X = tfidf_vect.fit_transform(df_pre['title'].apply(' '.join))
y = df_pre['label'].values.tolist()

# División de los datos
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.05, test_size=0.01, random_state=42)

# Definir el modelo Random Forest
rf_model = RandomForestClassifier(random_state=42)

# Actualizar los hiperparámetros para Grid Search y Randomized Search
rf_param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}

# Definir el modelo Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)

# Actualizar los hiperparámetros para Grid Search y Randomized Search
gb_param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Grid Search para Random Forest
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=10, scoring='accuracy', verbose=2, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

best_rf_model = rf_grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)

rf_grid_accuracy = best_rf_model.score(X_test, y_test)

# Grid Search para Gradient Boosting
gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, cv=10, scoring='accuracy', verbose=2, n_jobs=-1)
gb_grid_search.fit(X_train, y_train)

best_gb_model = gb_grid_search.best_estimator_
best_gb_model.fit(X_train, y_train)

gb_grid_accuracy = best_gb_model.score(X_test, y_test)

print(f'Random Forest Grid Search Accuracy: {rf_grid_accuracy:.2f}')
print(f'Gradient Boosting Grid Search Accuracy: {gb_grid_accuracy:.2f}')

# Validación cruzada para Random Forest
rf_cross_val_scores = cross_val_score(best_rf_model, X, y, cv=10, scoring='accuracy')
print(f"Random Forest Accuracy con validación cruzada: {rf_cross_val_scores.mean():.2f} +/- {rf_cross_val_scores.std():.2f}")

# Validación cruzada para Gradient Boosting
gb_cross_val_scores = cross_val_score(best_gb_model, X, y, cv=10, scoring='accuracy')
print(f"Gradient Boosting Accuracy con validación cruzada: {gb_cross_val_scores.mean():.2f} +/- {gb_cross_val_scores.std():.2f}")


Fitting 10 folds for each of 960 candidates, totalling 9600 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   2.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   3.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.9s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time

KeyboardInterrupt: 

In [31]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess(df):
    """Método para preprocesar el texto y agregar características adicionales

    Args:
        df (dataframe): Dataframe a aplicar transformaciones

    Returns:
        dataframe: Dataframe transformado con características adicionales
    """
    
    for column in df.columns:
        df[column] = df[column].str.lower()

    stop_words = stopwords.words("english")
    df["title"] = df["title"].apply(lambda x: " ".join([word for word in x.split() if word not in stop_words]))
    df["title"] = df["title"].apply(nltk.word_tokenize)

    lemmatizer = WordNetLemmatizer()
    df["title"] = df["title"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x if not word.isdigit()])
    
    df = feature_engineering(df)

    return df

def feature_engineering(df):
    # Longitud de la noticia
    df['news_length'] = df['title'].apply(len)

    # Cantidad de palabras únicas
    df['unique_words'] = df['title'].apply(lambda x: len(set(x)))

    # Presencia de números
    df['has_numbers'] = df['title'].apply(lambda x: int(any(char.isdigit() for char in x)))

    # Presencia de signos de exclamación
    df['has_exclamation'] = df['title'].apply(lambda x: int('!' in x))

    # Aquí puedes agregar más características como el sentimiento del texto y la frecuencia de palabras clave

    return df

# Carga de datos y preprocesamiento
path = 'DataSet para entrenamiento del modelo.csv'
df = pd.read_csv(path)

# Data augmentation (si lo deseas)
for i in range(2):
    df = augment(df)

# Preprocesamiento
df_pre = preprocess(df)

# Características numéricas
features_to_scale = ['news_length', 'unique_words', 'has_numbers', 'has_exclamation']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_pre[features_to_scale])

# Vectorización del texto
tfidf_vect = TfidfVectorizer(ngram_range=(1, 2))
X_text = tfidf_vect.fit_transform(df_pre['title'].apply(' '.join))

# Combinar características vectorizadas y numéricas
X_combined = hstack([X_text, scaled_features])

# División de los datos
y = df_pre['label'].values.tolist()
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, train_size=0.05, test_size=0.01, random_state=42)

# Definir el modelo Random Forest
rf_model = RandomForestClassifier(random_state=42)

# Actualizar los hiperparámetros para Grid Search y Randomized Search
rf_param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}

# Definir el modelo Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=42)

# Actualizar los hiperparámetros para Grid Search y Randomized Search
gb_param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Grid Search para Random Forest
rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=10, scoring='accuracy', verbose=2, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

best_rf_model = rf_grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)

rf_grid_accuracy = best_rf_model.score(X_test, y_test)

# Grid Search para Gradient Boosting
gb_grid_search = GridSearchCV(estimator=gb_model, param_grid=gb_param_grid, cv=10, scoring='accuracy', verbose=2, n_jobs=-1)
gb_grid_search.fit(X_train, y_train)

best_gb_model = gb_grid_search.best_estimator_
best_gb_model.fit(X_train, y_train)

gb_grid_accuracy = best_gb_model.score(X_test, y_test)

print(f'Random Forest Grid Search Accuracy: {rf_grid_accuracy:.2f}')
print(f'Gradient Boosting Grid Search Accuracy: {gb_grid_accuracy:.2f}')

# Validación cruzada para Random Forest
rf_cross_val_scores = cross_val_score(best_rf_model, X_combined, y, cv=10, scoring='accuracy')
print(f"Random Forest Accuracy con validación cruzada: {rf_cross_val_scores.mean():.2f} +/- {rf_cross_val_scores.std():.2f}")

# Validación cruzada para Gradient Boosting
gb_cross_val_scores = cross_val_score(best_gb_model, X_combined, y, cv=10, scoring='accuracy')
print(f"Gradient Boosting Accuracy con validación cruzada: {gb_cross_val_scores.mean():.2f} +/- {gb_cross_val_scores.std():.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clickbait_rows["title"] = clickbait_rows["title"].apply(replace_with_synonym)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clickbait_rows["title"] = clickbait_rows["title"].apply(replace_with_synonym)


Fitting 10 folds for each of 960 candidates, totalling 9600 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  15.5s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  17.7s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  18.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  19.3s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  18.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  19.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  19.6s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time

KeyboardInterrupt: 