# Random Forest

### Preparamos las features

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import json

# Input locations: the train/test CSVs and the location -> country lookup JSON.
DATA_DIR = Path('../data/nlp-getting-started')
TRAIN_PATH, TEST_PATH = DATA_DIR / 'train.csv', DATA_DIR / 'test.csv'
LOCATION_TO_COUNTRY_PATH = Path('../data/location_to_country.json')
RANDOM_SEED = 27

train_df, test_df = pd.read_csv(TRAIN_PATH), pd.read_csv(TEST_PATH)

# The JSON file maps raw 'location' strings to a country value.
location_to_country = json.loads(
    LOCATION_TO_COUNTRY_PATH.read_text(encoding='utf-8')
)

# Locations missing from the lookup become NaN in the new 'country' column.
for frame in (train_df, test_df):
    frame['country'] = frame['location'].map(location_to_country)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nCountries in train: {train_df['country'].notna().sum()}/{len(train_df)}")
print(f"Countries in test: {test_df['country'].notna().sum()}/{len(test_df)}")
train_df.head()

# Derive the simple text features on both frames in one pass:
#   has_url     - 'True'/'False' string flag for an http:// or https:// link
#   text_length - number of characters in the text (missing text counts as 0)
for frame in (train_df, test_df):
    filled_text = frame['text'].fillna('')
    frame['has_url'] = filled_text.str.contains(r'http[s]?://', regex=True).astype(str)
    frame['text_length'] = filled_text.str.len()

# Column groups consumed by the later preparation steps.
numeric_features = ['text_length']
categorical_features = ['country', 'has_url']
embedding_feature = 'text'

# Single module-level VADER analyzer, reused by every get_sentiment() call.
analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    """Return a sentiment score for ``text`` derived from VADER's compound score.

    Missing (NaN/None) or whitespace-only text yields the midpoint 0.5.
    Otherwise the analyzer's ``compound`` value is mapped through
    ``(compound + 1) / 2`` (which lands in [0, 1] assuming compound lies in
    [-1, 1], as VADER documents).
    """
    # Guard clause: nothing to analyse, so report the neutral midpoint.
    if pd.isna(text):
        return 0.5
    if text.strip() == '':
        return 0.5

    scores = analyzer.polarity_scores(text)
    shifted = scores['compound'] + 1
    return shifted / 2

# Score the text of every row, in both frames, with get_sentiment.
for frame in (train_df, test_df):
    frame['sentiment_score'] = frame['text'].apply(get_sentiment)

# Redefine the numeric feature list now that the sentiment column exists.
numeric_features = ['text_length', 'sentiment_score']

# 1. Separate the feature columns from the target.
feature_columns = numeric_features + categorical_features + [embedding_feature]
X = train_df[feature_columns].copy()
y = train_df['target'].copy()

# 2. Stratified train/validation split (80/20), reproducible via RANDOM_SEED.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_SEED,
)

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Target distribution in train: {y_train.value_counts().to_dict()}")
print(f"Target distribution in val: {y_val.value_counts().to_dict()}")

# Transformer for the numeric features: statistics are learned on the training
# split only and then applied unchanged to the validation and test data.
scaler = StandardScaler().fit(X_train[numeric_features])
X_train_numeric, X_val_numeric, X_test_numeric = (
    scaler.transform(frame[numeric_features])
    for frame in (X_train, X_val, test_df)
)

from sklearn.model_selection import KFold
from scipy.sparse import csr_matrix

# Transformer for the text column (TF-IDF). The vocabulary is fitted on the
# training split only; validation and test text are projected onto it.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    lowercase=True,
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# Missing text is treated as an empty document.
train_corpus = X_train[embedding_feature].fillna('')
val_corpus = X_val[embedding_feature].fillna('')
test_corpus = test_df[embedding_feature].fillna('')

X_train_text = tfidf_vectorizer.fit_transform(train_corpus)
X_val_text = tfidf_vectorizer.transform(val_corpus)
X_test_text = tfidf_vectorizer.transform(test_corpus)

print(f"\nTF-IDF vectorizer:")
print(f"  Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print(f"  Feature names (first 10): {tfidf_vectorizer.get_feature_names_out()[:10].tolist()}")


# 4. Combine the scaled numeric columns with the TF-IDF matrix, column-wise.
# scipy's hstack returns a COO matrix by default, which does not support row
# indexing; request CSR so the result can be sliced by row (for example with
# the index arrays produced by KFold).
# NOTE(review): categorical_features ('country', 'has_url') are selected into X
# but are never encoded or stacked here - confirm whether that is intentional.
X_train_combined = hstack([X_train_numeric, X_train_text], format='csr')
X_val_combined = hstack([X_val_numeric, X_val_text], format='csr')
X_test_combined = hstack([X_test_numeric, X_test_text], format='csr')

print(f"\nCombined feature matrix:")
print(f"  Train shape: {X_train_combined.shape}")
print(f"  Validation shape: {X_val_combined.shape}")
print(f"  Test shape: {X_test_combined.shape}")


Train shape: (7613, 6)
Test shape: (3263, 5)

Countries in train: 4060/7613
Countries in test: 882/3263
has_url distribution in train:
has_url
True     3971
False    3642
Name: count, dtype: int64

Percentage with URL: 52.2%

has_url distribution in test:
has_url
True     1731
False    1532
Name: count, dtype: int64

Percentage with URL: 53.0%
Text length stats:
Train - mean: 101.0, std: 33.8
Test - mean: 102.1, std: 34.0
Sentiment score stats:
Train - mean: 0.428, std: 0.232
Test - mean: 0.426, std: 0.229

Sentiment score distribution in train:
count    7613.000000
mean        0.427633
std         0.232416
min         0.005850
25%         0.228850
50%         0.500000
75%         0.538600
max         0.986500
Name: sentiment_score, dtype: float64
Train set: 6090 samples
Validation set: 1523 samples
Target distribution in train: {0: 3473, 1: 2617}
Target distribution in val: {0: 869, 1: 654}

TF-IDF vectorizer:
  Vocabulary size: 5000
  Feature names (first 10): ['0', '00', '00 http', 

AttributeError: 'coo_matrix' object has no attribute 'head'