# XGBoost

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import json

# --- Configuration: random seed and input locations ---
RANDOM_SEED = 27
DATA_DIR = Path('../data/nlp-getting-started')
TRAIN_PATH, TEST_PATH = (DATA_DIR / file_name for file_name in ('train.csv', 'test.csv'))
LOCATION_TO_COUNTRY_PATH = Path('../data/location_to_country.json')

# --- Load the raw train/test frames ---
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Lookup used below to map the raw `location` column to a country
# (presumably a flat JSON object of location -> country; verify against the file).
location_to_country = json.loads(LOCATION_TO_COUNTRY_PATH.read_text(encoding='utf-8'))

# Categorical features: 'country' and 'keyword'. They get mean (target) encoded later on.
# Numeric features built here: 'has_url', 'text_length', 'num_hashtags', 'num_mentions'
# (the remaining numeric features are derived further down).
def _add_basic_features(frame):
    """Add the categorical and simple text-derived columns to ``frame`` in place.

    Columns written: 'country', 'keyword', 'has_url', 'text_length',
    'num_hashtags', 'num_mentions'. The same function is applied to the train
    and test frames so both always receive identical treatment.

    Args:
        frame: DataFrame with 'location', 'keyword' and 'text' columns.
    """
    # Locations that are missing or absent from the lookup share one 'unknown' bucket.
    frame['country'] = frame['location'].map(location_to_country).fillna('unknown')
    frame['keyword'] = frame['keyword'].fillna('missing')

    # Replace missing tweets with '' once, so every text-derived feature below
    # handles NaN the same way (a count of 0 instead of a NaN count).
    tweet_text = frame['text'].fillna('')

    # Hand-made 0/1 flag: does the tweet contain an http:// or https:// link?
    frame['has_url'] = tweet_text.str.contains(r'http[s]?://', regex=True).astype(int)
    frame['text_length'] = tweet_text.str.len()
    frame['num_hashtags'] = tweet_text.str.count('#')
    frame['num_mentions'] = tweet_text.str.count('@')


_add_basic_features(train_df)
_add_basic_features(test_df)

categorical_features = ['country', 'keyword']

def uppercase_per_word(text):
    """Return the number of uppercase letters divided by the number of words.

    ``text`` is coerced with ``str()``. A "word" is a whitespace-separated
    token containing at least one alphabetic character; uppercase letters are
    counted over the whole text. Returns 0.0 when there is no such word.
    """
    raw = str(text)

    # Count tokens that carry at least one alphabetic character.
    word_count = 0
    for token in raw.split():
        for ch in token:
            if ch.isalpha():
                word_count += 1
                break

    if word_count == 0:
        return 0.0

    # Restrict to alphabetic characters so odd symbols are never counted.
    uppercase_total = len([ch for ch in raw if ch.isalpha() and ch.isupper()])
    return uppercase_total / word_count


# Shared VADER analyzer instance; `get_sentiment` reads it as a module-level name.
analyzer = SentimentIntensityAnalyzer()

# Uppercase-letters-per-word ratio, computed the same way for both frames.
train_df['num_uppercase_per_word'], test_df['num_uppercase_per_word'] = (
    split['text'].apply(uppercase_per_word) for split in (train_df, test_df)
)

def get_sentiment(text):
    """Return a sentiment score for ``text`` as ``(compound + 1) / 2``.

    Missing or whitespace-only text yields 0.5. Otherwise the module-level
    ``analyzer`` supplies the 'compound' polarity score, which is shifted and
    halved (VADER documents compound as lying in [-1, 1], so the result is
    expected to fall in [0, 1]).
    """
    is_blank = pd.isna(text) or not text.strip()
    if is_blank:
        return 0.5

    scores = analyzer.polarity_scores(text)
    return (scores['compound'] + 1) / 2

# Sentiment score per tweet, computed the same way for both frames.
train_df['sentiment_score'], test_df['sentiment_score'] = (
    split['text'].apply(get_sentiment) for split in (train_df, test_df)
)

numeric_features = [
    'text_length',
    'num_hashtags',
    'num_mentions',
    'num_uppercase_per_word',
    'sentiment_score',
    'has_url',
]

# Raw text column, vectorised later on.
embedding_feature = 'text'

# 1. Separate features and target
model_columns = [*numeric_features, *categorical_features, embedding_feature]
X = train_df.loc[:, model_columns].copy()
y = train_df.loc[:, 'target'].copy()

# 2. Stratified train/validation split (80/20)
split_options = dict(test_size=0.2, random_state=RANDOM_SEED, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

X_train.head()

Unnamed: 0,text_length,num_hashtags,num_mentions,num_uppercase_per_word,sentiment_score,has_url,country,keyword,text
2721,87,1,0,1.0,0.18755,1,unknown,devastated,Obama declares disaster for typhoon-devastated...
2259,132,0,0,0.083333,0.55135,0,unknown,deluged,Businesses are deluged with invzices. Make you...
1815,136,0,0,1.333333,0.27205,1,United Kingdom,crashed,Neil_Eastwood77: I AM A KNOBHEAD!! Bin Laden f...
682,139,0,0,1.666667,0.5,1,unknown,blazing,Morgan Silver Dollar 1880 S Gem BU DMPL Cameo ...
7216,121,0,2,0.210526,0.78595,0,United States of America,weapons,@danagould @WaynesterAtl I agree with backgrou...


In [2]:
from sklearn.model_selection import KFold

def kfold_target_encoding(train_series, target_series, n_splits=5, random_state=RANDOM_SEED):
    """Out-of-fold mean (target) encoding of a categorical series.

    Every row is encoded with the target mean of its category computed on the
    other folds only, so a row's own label never feeds its own encoding.
    Rows whose category does not appear in the fitting folds get the global
    target mean.

    Args:
        train_series: categorical values to encode.
        target_series: target values for the same rows as ``train_series``.
        n_splits: number of shuffled K-fold splits.
        random_state: seed for the fold shuffling.

    Returns:
        Tuple ``(encoded, mapping, global_mean)``:
        ``encoded`` is the out-of-fold encoding indexed like ``train_series``;
        ``mapping`` is the per-category target mean over ALL rows, meant for
        encoding data outside the training split; ``global_mean`` is the
        fallback for categories missing from ``mapping``.
    """
    def _category_means(features, targets):
        # Mean target per distinct feature value.
        pairs = pd.DataFrame({'feature': features, 'target': targets})
        return pairs.groupby('feature')['target'].mean()

    global_mean = target_series.mean()
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    encoded = pd.Series(np.nan, index=train_series.index, dtype=float)

    for fit_pos, holdout_pos in folds.split(train_series):
        fold_means = _category_means(
            train_series.iloc[fit_pos], target_series.iloc[fit_pos]
        )
        encoded.iloc[holdout_pos] = train_series.iloc[holdout_pos].map(fold_means)

    # Categories unseen in the fitting folds are still NaN at this point.
    encoded = encoded.fillna(global_mean)

    mapping = _category_means(train_series, target_series)
    return encoded, mapping, global_mean

# Target-encode every categorical column.
# The training split gets out-of-fold encodings; validation and test rows are
# encoded with the per-category means fitted on the whole training split, and
# fall back to the global mean for categories that never appeared there.
mean_encoded_features = []

for column in categorical_features:
    train_encoded, mapping, global_mean = kfold_target_encoding(
        X_train[column], y_train
    )
    encoded_col = f'{column}_target_mean'
    # Add the new column to every split.
    X_train[encoded_col] = train_encoded
    X_val[encoded_col]   = X_val[column].map(mapping).fillna(global_mean)
    test_df[encoded_col] = test_df[column].map(mapping).fillna(global_mean)

    mean_encoded_features.append(encoded_col)

# Extend the numeric feature list without duplicating entries, so re-running
# this cell does not add the encoded columns a second time (which would
# silently widen every feature matrix built from `numeric_features`).
numeric_features = numeric_features + [
    col for col in mean_encoded_features if col not in numeric_features
]

X_train_numeric = X_train[numeric_features].to_numpy()
X_val_numeric   = X_val[numeric_features].to_numpy()
X_test_numeric  = test_df[numeric_features].to_numpy()

In [3]:
from sklearn.decomposition import TruncatedSVD
N_COMPONENTS = 30  # SVD output dimensionality; a hyperparameter still to be tuned

# Word-level TF-IDF over unigrams and bigrams.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# Missing tweets become empty documents in every split.
train_docs, val_docs, test_docs = (
    frame[embedding_feature].fillna('') for frame in (X_train, X_val, test_df)
)

# The vocabulary is fitted on the training split only and reused for the rest.
X_train_text = tfidf_vectorizer.fit_transform(train_docs)
X_val_text = tfidf_vectorizer.transform(val_docs)
X_test_text = tfidf_vectorizer.transform(test_docs)

vocabulary_size = len(tfidf_vectorizer.vocabulary_)
first_feature_names = tfidf_vectorizer.get_feature_names_out()[:10].tolist()
print("\nTF-IDF vectorizer:")
print(f"  Vocabulary size: {vocabulary_size}")
print(f"  Feature names (first 10): {first_feature_names}")

# Reduce the sparse TF-IDF matrix to N_COMPONENTS dense columns.
# NOTE: the `_15` suffix in the names below does not track N_COMPONENTS;
# the arrays have N_COMPONENTS columns.
svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=RANDOM_SEED)

X_train_text_15 = svd.fit_transform(X_train_text)
X_val_text_15   = svd.transform(X_val_text)
X_test_text_15  = svd.transform(X_test_text)

# Final design matrices: numeric columns first, then the SVD text components.
X_train_combined = np.concatenate([X_train_numeric, X_train_text_15], axis=1)
X_val_combined   = np.concatenate([X_val_numeric,   X_val_text_15],   axis=1)
X_test_combined  = np.concatenate([X_test_numeric,  X_test_text_15],  axis=1)

print(f"  Test shape: {X_test_combined.shape}")

pd.DataFrame(X_train_combined).head()


TF-IDF vectorizer:
  Vocabulary size: 5000
  Feature names (first 10): ['0', '00', '00 http', '00 pm', '000', '01', '01 04', '02', '03', '04']
  Test shape: (3263, 38)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,87.0,1.0,0.0,1.0,0.18755,1.0,0.415596,0.5,0.130693,-0.023265,...,-0.090866,-0.0453,0.027946,0.037595,-0.019196,-0.085282,0.077331,-0.027872,-0.043564,0.069854
1,132.0,0.0,0.0,0.083333,0.55135,0.0,0.415596,0.357143,0.006938,0.004409,...,0.009903,-0.005538,0.015668,-0.004812,-0.018981,-0.005875,-0.022296,0.027726,-0.019433,0.005867
2,136.0,0.0,0.0,1.333333,0.27205,1.0,0.367647,0.666667,0.145825,-0.009432,...,-0.039281,-0.028005,-0.071769,0.029838,-0.019939,0.017203,0.001472,-0.034268,-0.009915,-0.017328
3,139.0,0.0,0.0,1.666667,0.5,1.0,0.420105,0.0,0.226158,-0.021638,...,-0.059294,-0.021895,-0.059715,0.032109,-0.033443,-0.027209,0.006836,-0.032193,-0.021932,-0.039429
4,121.0,0.0,2.0,0.210526,0.78595,0.0,0.428266,0.44,0.017488,0.026948,...,-0.011794,0.015972,0.030408,0.008292,-0.035395,0.049585,0.036662,0.024313,-0.022074,-0.029951


In [5]:
# ! pip install xgboost

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Base model.
# n_jobs=1 on the estimator: the grid search below already runs candidate fits
# in parallel (n_jobs=-1). Letting every fit also grab all cores creates thread
# contention, which the XGBoost docs warn slows both levels down.
xgb_base = XGBClassifier(
    objective='binary:logistic',      # binary classification
    eval_metric='logloss',
    random_state=RANDOM_SEED,
    n_jobs=1,
    tree_method='hist',
)

# Hyper-parameter grid.
# NOTE: 8 parameters x 3 values each = 3**8 = 6561 candidates; with cv=3 that is
# 19683 fits, so this cell is expensive to re-run.
param_grid_xgb = {
    'n_estimators':    [100, 300, 500],
    'max_depth':       [3, 6, 10],
    'learning_rate':   [0.01, 0.05, 0.1],
    'subsample':       [0.7, 0.9, 1],
    'colsample_bytree':[0.5, 0.7, 1],
    'gamma':           [0, 1, 5],         # minimum loss reduction required to split
    'reg_alpha':       [0, 0.1, 1],       # L1 (alpha) -> sparsity
    'reg_lambda':      [1, 5, 10]         # L2 (lambda) -> guards against overfitting
}

grid_search_xgb = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid_xgb,
    scoring='f1',
    cv=3,
    n_jobs=-1,       # parallelism lives here, across candidates and folds
    verbose=2
)

# Fit the search on the training split.
grid_search_xgb.fit(X_train_combined, y_train)

print(" Mejores hiper-parámetros encontrados para XGBoost:")
print(grid_search_xgb.best_params_)

# Best model, refitted on the whole training split by GridSearchCV.
best_xgb = grid_search_xgb.best_estimator_

# Predict and score on the held-out validation split.
y_val_pred = best_xgb.predict(X_val_combined)
f1_val = f1_score(y_val, y_val_pred)

print(f"F1 en validación con XGBoost: {f1_val:.4f}")


Fitting 3 folds for each of 6561 candidates, totalling 19683 fits
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   1.2s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_alpha=0, reg_lambda=5, subsample=1; total time=   0.1s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.9; total time=   1.3s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=0.7; total time=   1.3s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_alpha=0, reg_lambda=5, subsample=0.9; total time=   1.3s
[CV] END colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=100, reg_alpha=0, reg_lambda=1, subsample=1; total time=   1.2s
[CV] END colsample_bytree=

Mmmm, sigue siendo bastante malo. No puede ser que mi regresión lineal sea mejor. Voy a probar armar un Pipeline con el TF-IDF para poder ajustar también esos parámetros.

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

text_feature = 'text'
# Copy instead of aliasing, so the transformer keeps its own column list.
feature_columns = list(numeric_features)

# TF-IDF on the raw text column; numeric columns are forwarded unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("text", TfidfVectorizer(
            strip_accents="unicode",
            lowercase=True,
            analyzer="word",
            token_pattern=r"\w{1,}",
            stop_words="english"
        ), text_feature),

        ("num", "passthrough", feature_columns)   # numeric columns pass through as-is
    ],
    remainder="drop"  # make sure no other column reaches the model
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("clf", XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",   # faster histogram-based tree construction
        # Single-threaded fits: this pipeline is tuned with a search that already
        # runs with n_jobs=-1, and nesting both levels at -1 causes thread
        # contention (see the XGBoost docs on n_jobs).
        n_jobs=1,
        eval_metric="logloss",
        random_state=RANDOM_SEED
    ))
])


In [17]:
from scipy.stats import randint, uniform

# Search space for the TF-IDF step inside the ColumnTransformer.
# scipy's randint(low, high) excludes `high`.
_tfidf_space = {
    "preprocessor__text__max_features": randint(3000, 9000),
    "preprocessor__text__ngram_range": [(1, 1), (1, 2)],
    "preprocessor__text__min_df": randint(1, 4),
    "preprocessor__text__sublinear_tf": [True, False],
    "preprocessor__text__smooth_idf": [True, False],
}

# Search space for the XGBoost step.
# scipy's uniform(loc, scale) samples from [loc, loc + scale],
# e.g. uniform(0.03, 0.12) covers learning rates in [0.03, 0.15].
_xgb_space = {
    "clf__n_estimators": randint(300, 900),
    "clf__learning_rate": uniform(0.03, 0.12),
    "clf__max_depth": randint(3, 9),
    "clf__subsample": uniform(0.7, 0.3),
    "clf__colsample_bytree": uniform(0.6, 0.4),
    "clf__min_child_weight": randint(1, 6),
}

param_distributions = {**_tfidf_space, **_xgb_space}



In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised search over the whole pipeline (TF-IDF settings + XGBoost).
search_settings = dict(
    n_iter=100,
    cv=3,
    scoring="f1",
    verbose=2,
    n_jobs=-1,
    random_state=RANDOM_SEED,
)
rand_search_xgb = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    **search_settings,
)

# The pipeline takes the DataFrame directly: it selects the text column and the
# numeric columns by name.
rand_search_xgb.fit(X_train, y_train)

print("Mejores hiperparámetros encontrados:")
print(rand_search_xgb.best_params_)

best_pipeline_xgb = rand_search_xgb.best_estimator_

# Quick check on the held-out validation split.
y_val_pred = best_pipeline_xgb.predict(X_val)
validation_f1 = f1_score(y_val, y_val_pred)
print("F1 en validation:", validation_f1)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END clf__colsample_bytree=0.7702885642075583, clf__learning_rate=0.12775004885934632, clf__max_depth=3, clf__min_child_weight=1, clf__n_estimators=593, clf__subsample=0.8153764018044936, preprocessor__text__max_features=7237, preprocessor__text__min_df=2, preprocessor__text__ngram_range=(1, 1), preprocessor__text__smooth_idf=False, preprocessor__text__sublinear_tf=False; total time=   3.0s
[CV] END clf__colsample_bytree=0.7702885642075583, clf__learning_rate=0.12775004885934632, clf__max_depth=3, clf__min_child_weight=1, clf__n_estimators=593, clf__subsample=0.8153764018044936, preprocessor__text__max_features=7237, preprocessor__text__min_df=2, preprocessor__text__ngram_range=(1, 1), preprocessor__text__smooth_idf=False, preprocessor__text__sublinear_tf=False; total time=   3.2s
[CV] END clf__colsample_bytree=0.7702885642075583, clf__learning_rate=0.12775004885934632, clf__max_depth=3, clf__min_child_weight=1, clf__n_

Bueno, ahora alcanzamos los valores de la regresión lineal ajustando el TF-IDF. Me hace preguntarme si entonces esos parámetros del TF-IDF son lo que limita al modelo. Me estuve fijando mucho en la cantidad de features que genera el TF-IDF; acá encontró su mejor resultado con 'preprocessor__text__max_features': 6606, que es un número intermedio. Voy a probar un grid search de XGBoost con esa cantidad de features fija.


In [19]:
from pathlib import Path

# Output location for the submission file; the folder is created if needed.
submissions_dir = Path('../resultados')
baseline_submit_path = submissions_dir / 'xgboost.csv'
submissions_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): predictions come from `best_xgb` (the grid-search model on the
# TF-IDF + SVD matrix), not from `best_pipeline_xgb` -- confirm this is intended.
y_test_pred = best_xgb.predict(X_test_combined)

# Two-column frame: tweet id and predicted target.
submission = test_df[['id']].assign(target=y_test_pred)

# Save the CSV without the index column.
submission.to_csv(baseline_submit_path, index=False)