# In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

import json

# Locations of the competition CSVs and of the location -> country lookup.
DATA_DIR = Path('../data/nlp-getting-started')
TRAIN_PATH = DATA_DIR.joinpath('train.csv')
TEST_PATH = DATA_DIR.joinpath('test.csv')
LOCATION_TO_COUNTRY_PATH = Path('../data/location_to_country.json')
RANDOM_SEED = 27  # seed passed to the randomized steps (split, K-fold) below

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Lookup table used to map raw `location` values to a country
# (presumably a JSON object keyed by location string -- confirm against the file).
location_to_country = json.loads(
    LOCATION_TO_COUNTRY_PATH.read_text(encoding='utf-8')
)

# Locations that have no entry in the lookup end up as NaN in `country`.
for _df in (train_df, test_df):
    _df['country'] = _df['location'].map(location_to_country)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nCountries in train: {train_df['country'].notna().sum()}/{len(train_df)}")
print(f"Countries in test: {test_df['country'].notna().sum()}/{len(test_df)}")
# Notebook-style preview; the value is discarded when run as a plain script.
train_df.head()

# Flag tweets whose text contains an http(s):// link. The boolean is stored as
# the strings 'True'/'False'; the column is later treated as categorical.
for _df in (train_df, test_df):
    _has_link = _df['text'].fillna('').str.contains(r'http[s]?://', regex=True)
    _df['has_url'] = _has_link.astype(str)

# Report how common links are in each split.
print("has_url distribution in train:")
print(train_df['has_url'].value_counts())
print(f"\nPercentage with URL: {train_df['has_url'].eq('True').mean()*100:.1f}%")

print("\nhas_url distribution in test:")
print(test_df['has_url'].value_counts())
print(f"\nPercentage with URL: {test_df['has_url'].eq('True').mean()*100:.1f}%")

# Character count of each tweet; a missing text counts as length 0.
train_df['text_length'] = train_df['text'].fillna('').str.len()
test_df['text_length'] = test_df['text'].fillna('').str.len()

print("Text length stats:")
print(f"Train - mean: {train_df['text_length'].mean():.1f}, std: {train_df['text_length'].std():.1f}")
print(f"Test - mean: {test_df['text_length'].mean():.1f}, std: {test_df['text_length'].std():.1f}")

# Feature groups, each defined exactly once (the list of categorical columns
# used to be assigned twice with the same value).
numeric_features = ['text_length']
categorical_features = ['country', 'has_url']
embedding_feature = 'text'

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single shared analyzer instance; get_sentiment() below reads it as a global.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return a rescaled sentiment score for ``text``.

    Missing (NaN/None) or whitespace-only text yields the midpoint 0.5.
    Otherwise the module-level ``analyzer`` is queried and its ``'compound'``
    score is mapped through ``(compound + 1) / 2``.
    NOTE(review): presumably ``compound`` lies in [-1, 1], which would put the
    result in [0, 1] -- confirm against the analyzer's documentation.
    """
    # Guard clauses: nothing to analyze.
    if pd.isna(text):
        return 0.5
    if text.strip() == '':
        return 0.5

    polarity = analyzer.polarity_scores(text)
    return (polarity['compound'] + 1) / 2

# Score every tweet in both splits with get_sentiment().
for _df in (train_df, test_df):
    _df['sentiment_score'] = _df['text'].apply(get_sentiment)

print("Sentiment score stats:")
print(f"Train - mean: {train_df['sentiment_score'].mean():.3f}, std: {train_df['sentiment_score'].std():.3f}")
print(f"Test - mean: {test_df['sentiment_score'].mean():.3f}, std: {test_df['sentiment_score'].std():.3f}")

print("\nSentiment score distribution in train:")
print(train_df['sentiment_score'].describe())

# The sentiment score joins text length as a numeric model input.
numeric_features = ['text_length', 'sentiment_score']


# 1. Separate the feature columns from the target
X = train_df[[*numeric_features, *categorical_features, embedding_feature]].copy()
y = train_df['target'].copy()

# 2. Stratified train/validation split (80/20)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Target distribution in train: {y_train.value_counts().to_dict()}")
print(f"Target distribution in val: {y_val.value_counts().to_dict()}")

# Numeric transformer: the scaling statistics are learned on the training
# split only and then reused for validation and test.
scaler = StandardScaler().fit(X_train[numeric_features])
X_train_numeric = scaler.transform(X_train[numeric_features])
X_val_numeric = scaler.transform(X_val[numeric_features])
X_test_numeric = scaler.transform(test_df[numeric_features])

from sklearn.model_selection import KFold
from scipy.sparse import csr_matrix

# has_url -> one-hot encoding; categories are learned on the training split,
# and values unseen at fit time are ignored at transform time.
ohe = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
ohe.fit(X_train[['has_url']].fillna('missing'))

X_train_hasurl = ohe.transform(X_train[['has_url']].fillna('missing'))
X_val_hasurl = ohe.transform(X_val[['has_url']].fillna('missing'))
X_test_hasurl = ohe.transform(test_df[['has_url']].fillna('missing'))

print("\nOneHotEncoder (has_url):")
print(f"  categories: {ohe.categories_[0].tolist()}")


# country -> K-fold mean target encoding
def _smoothed_target_stats(categories, targets, prior, smoothing):
    """Return per-category target statistics including a smoothed mean.

    Args:
        categories: Series of category labels.
        targets: Series of target values sharing the index of ``categories``.
        prior: Value every category mean is shrunk towards.
        smoothing: Weight of ``prior``, expressed as a pseudo-count.

    Returns:
        DataFrame indexed by category with columns ``mean``, ``count`` and
        ``smoothed``, where
        ``smoothed = (mean * count + smoothing * prior) / (count + smoothing)``.
    """
    stats = (
        pd.DataFrame({'country': categories, 'target': targets})
        .groupby('country')['target']
        .agg(['mean', 'count'])
    )
    stats['smoothed'] = (
        stats['mean'] * stats['count'] + smoothing * prior
    ) / (stats['count'] + smoothing)
    return stats


country_col = 'country'
alpha = 10  # smoothing strength (pseudo-count given to the prior)
global_mean = y_train.mean()

# Force string dtype and give missing countries their own "missing" category
country_train = X_train[country_col].fillna('missing').astype(str)
country_val = X_val[country_col].fillna('missing').astype(str)
country_test = test_df[country_col].fillna('missing').astype(str)

kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

country_train_enc = pd.Series(index=country_train.index, dtype=float)

for train_idx, holdout_idx in kfold.split(country_train):
    # Out-of-fold encoding: both the category statistics and the prior come
    # only from the other folds, so no holdout label influences the value
    # assigned to a holdout row.
    ct_train = country_train.iloc[train_idx]
    y_fold = y_train.iloc[train_idx]
    fold_prior = y_fold.mean()

    stats = _smoothed_target_stats(ct_train, y_fold, fold_prior, alpha)

    # Categories absent from the fitting folds fall back to the fold prior.
    enc_values = (
        country_train.iloc[holdout_idx].map(stats['smoothed']).fillna(fold_prior)
    )
    # Pass a plain array so the write is positional, with no index alignment.
    country_train_enc.iloc[holdout_idx] = enc_values.to_numpy()

# Safeguard only: every training row belongs to exactly one holdout fold and
# has already been filled above.
country_train_enc = country_train_enc.fillna(global_mean)

# Validation/test encoding uses statistics from the whole training split;
# the targets of those rows never enter the statistics, so nothing leaks.
full_stats = _smoothed_target_stats(country_train, y_train, global_mean, alpha)

country_val_enc = country_val.map(full_stats['smoothed']).fillna(global_mean)
country_test_enc = country_test.map(full_stats['smoothed']).fillna(global_mean)

print("\nMean encoding (country):")
print(f"  global_mean: {global_mean:.4f}")
print(f"  ejemplo valores train: {country_train_enc.head().to_dict()}")

# Turn each encoded vector into a single-column sparse matrix so that it can
# be combined with the other blocks through hstack
X_train_country_enc = csr_matrix(country_train_enc.values[:, None])
X_val_country_enc = csr_matrix(country_val_enc.values[:, None])
X_test_country_enc = csr_matrix(country_test_enc.values[:, None])

# Final categorical matrix = [has_url one-hot | country mean encoding]
X_train_cat = hstack([X_train_hasurl, X_train_country_enc])
X_val_cat = hstack([X_val_hasurl, X_val_country_enc])
X_test_cat = hstack([X_test_hasurl, X_test_country_enc])


# Text transformer: TF-IDF over word unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# The vectorizer is fitted on the training texts only; validation and test
# texts are transformed with that fitted vocabulary.
X_train_text = tfidf_vectorizer.fit_transform(X_train[embedding_feature].fillna(''))
X_val_text = tfidf_vectorizer.transform(X_val[embedding_feature].fillna(''))
X_test_text = tfidf_vectorizer.transform(test_df[embedding_feature].fillna(''))

print(f"\nTF-IDF vectorizer:")
print(f"  Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print(f"  Feature names (first 10): {tfidf_vectorizer.get_feature_names_out()[:10].tolist()}")


# 4. Combine all feature groups: [numeric | categorical | TF-IDF text]
# With the default format, hstack falls back to a COO result when one of the
# blocks is a dense array (as the scaled numeric block presumably is), and COO
# matrices cannot be indexed or sliced by row. Requesting CSR keeps the
# matrices usable for row selection (e.g. manual cross-validation).
X_train_combined = hstack([X_train_numeric, X_train_cat, X_train_text], format='csr')
X_val_combined = hstack([X_val_numeric, X_val_cat, X_val_text], format='csr')
X_test_combined = hstack([X_test_numeric, X_test_cat, X_test_text], format='csr')

print(f"\nCombined feature matrix:")
print(f"  Train shape: {X_train_combined.shape}")
print(f"  Validation shape: {X_val_combined.shape}")
print(f"  Test shape: {X_test_combined.shape}")

