In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np

# Load the data
train_data = pd.read_csv('./input/train.csv')

# Fill missing values
train_data['keyword'].fillna('', inplace=True)
train_data['location'].fillna('', inplace=True)

# Define features and target
X = train_data[['text', 'keyword', 'location']]
y = train_data['target']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the preprocessor
categorical_features = ['keyword', 'location']
text_features = 'text'

# OneHotEncoder setup
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# TF-IDF setup
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# ColumnTransformer to apply OneHotEncoder to categorical features and TF-IDF to text
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features),
        ('text', tfidf_vectorizer, text_features)
    ],
    remainder='drop'
)

# Define the model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, n_jobs=-1))
])

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

# Explanation:
# 1. We use OneHotEncoder without the 'sparse' argument, as it is deprecated in newer versions of scikit-learn.
# 2. We use TfidfVectorizer to transform the text data into numerical features.
# 3. We use a Logistic Regression model with n_jobs=-1 to utilize all available CPU cores for training.
# 4. We evaluate the model using the F1 score, which is the challenge's evaluation metric.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['keyword'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['location'].fillna('', inplace=True)


F1 Score: 0.7429943955164131


In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
import numpy as np
import transformers

# Custom transformer for extracting features from text using BERT
class BERTTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='bert-base-uncased', max_length=128):
        self.tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
        self.model = transformers.BertModel.from_pretrained(model_name)
        self.max_length = max_length

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        inputs = self.tokenizer(X.tolist(), return_tensors='pt', padding=True, truncation=True, max_length=self.max_length)
        outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :].detach().numpy()

# Function to extract keyword and location correlation features
def extract_correlation_features(X):
    keyword_corr = X['keyword'].apply(lambda x: len(x))  # Example: length of keyword
    location_corr = X['location'].apply(lambda x: len(x))  # Example: length of location
    return np.vstack((keyword_corr, location_corr)).T

# Define the feature extraction pipeline
feature_extraction_pipeline = FeatureUnion(
    transformer_list=[
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
        ('bert', BERTTransformer()),
        ('correlation', FunctionTransformer(extract_correlation_features, validate=False))
    ]
)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance

# Combine feature extraction and dimensionality reduction
full_pipeline = Pipeline(steps=[
    ('features', feature_extraction_pipeline),
    ('scaler', StandardScaler()),
    ('pca', pca)
])

# Transform the training data
X_train_transformed = full_pipeline.fit_transform(X_train['text'])

# Explanation:
# 1. We use TF-IDF and BERT for feature extraction from text, capturing both frequency and semantic meaning.
# 2. We create a custom transformer for BERT to extract contextual features from text.
# 3. We analyze 'keyword' and 'location' for correlation with the target by creating simple features based on their length.
# 4. We apply PCA to reduce the dimensionality of the feature space, retaining 95% of the variance.
# 5. We normalize the features using StandardScaler to ensure consistent input scales for the model.

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'Pipeline' is not defined

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Define models to evaluate
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, n_jobs=-1, class_weight='balanced'),
    'Naive Bayes': MultinomialNB(),
    'SVM': SVC(kernel='linear', class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_jobs=-1, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Evaluate models using cross-validation
for model_name, model in models.items():
    scores = cross_val_score(model, X_train_transformed, y_train, cv=5, scoring='f1')
    print(f"{model_name} F1 Score: {scores.mean():.4f} ± {scores.std():.4f}")

# Explanation:
# 1. We use Logistic Regression, Naive Bayes, and SVM as traditional machine learning models for text classification.
# 2. We include Random Forest and Gradient Boosting as ensemble methods to establish baseline performance.
# 3. We use class_weight='balanced' to handle class imbalance in the target variable.
# 4. We evaluate models using cross-validation with F1 score as the metric, which is suitable for imbalanced datasets.

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertModel

# Define a simple BERT-based classifier
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name='bert-base-uncased', num_classes=2):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        return linear_output

# Prepare data for PyTorch
X_train, X_val, y_train, y_val = train_test_split(X_train['text'], y_train, test_size=0.2, stratify=y_train)

# Tokenize and encode sequences in the training set
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=128, return_tensors='pt')

# Create TensorDatasets
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train.values))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(y_val.values))

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize model, optimizer, and loss function
model = BERTClassifier()
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop with early stopping
num_epochs = 5
best_val_f1 = 0
patience = 2
patience_counter = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Training loss: {total_loss/len(train_loader):.4f}")

    # Validation
    model.eval()
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    val_f1 = f1_score(val_labels, val_preds, average='weighted')
    print(f"Epoch {epoch+1}, Validation F1 Score: {val_f1:.4f}")

    # Early stopping
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Load the best model
model.load_state_dict(torch.load('best_model.pt'))

# Explanation:
# 1. We use a BERT-based classifier for text classification, leveraging pre-trained language models.
# 2. We tokenize and encode the text data using BERT tokenizer, preparing it for input to the model.
# 3. We implement a training loop with early stopping based on validation F1 score to prevent overfitting.
# 4. We use Adam optimizer with a learning rate of 2e-5, suitable for fine-tuning BERT models.
# 5. We log training loss and validation F1 score for each epoch to monitor model performance.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Function to evaluate model performance
def evaluate_model_performance(model, X_val, y_val):
    # Get predictions and prediction probabilities
    y_pred = model.predict(X_val)
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_val)[:, 1]
    else:
        y_proba = model.decision_function(X_val)

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)

    # Print evaluation metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")

    # Confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    print("Confusion Matrix:")
    print(cm)

# Example usage with a trained model and validation data
# evaluate_model_performance(trained_model, X_val_transformed, y_val)

# Explanation:
# 1. We define a function `evaluate_model_performance` to calculate and print various evaluation metrics.
# 2. We use scikit-learn's metrics functions to compute accuracy, precision, recall, F1-score, and AUC.
# 3. We handle both probability outputs and decision scores for AUC calculation, depending on the model.
# 4. We print the confusion matrix to help identify common misclassification errors.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, f1_score
import numpy as np

# Define a scorer for F1 score
f1_scorer = make_scorer(f1_score, average='weighted')

# Define hyperparameter grids for different models
param_grid_lr = {
    'clf__C': np.logspace(-4, 4, 20),
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear']
}

param_grid_svc = {
    'clf__C': np.logspace(-4, 4, 20),
    'clf__kernel': ['linear', 'rbf']
}

param_grid_rf = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10]
}

param_grid_nb = {
    'clf__alpha': np.logspace(-4, 1, 20)
}

# Define pipelines for each model
pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000))
])

pipeline_svc = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', SVC(class_weight='balanced'))
])

pipeline_rf = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', RandomForestClassifier(class_weight='balanced', n_jobs=-1))
])

pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', MultinomialNB())
])

# Perform Randomized Search CV for each model
random_search_lr = RandomizedSearchCV(pipeline_lr, param_grid_lr, n_iter=50, scoring=f1_scorer, cv=5, n_jobs=-1, verbose=1)
random_search_svc = RandomizedSearchCV(pipeline_svc, param_grid_svc, n_iter=50, scoring=f1_scorer, cv=5, n_jobs=-1, verbose=1)
random_search_rf = RandomizedSearchCV(pipeline_rf, param_grid_rf, n_iter=50, scoring=f1_scorer, cv=5, n_jobs=-1, verbose=1)
random_search_nb = RandomizedSearchCV(pipeline_nb, param_grid_nb, n_iter=50, scoring=f1_scorer, cv=5, n_jobs=-1, verbose=1)

# Fit models
random_search_lr.fit(X_train['text'], y_train)
random_search_svc.fit(X_train['text'], y_train)
random_search_rf.fit(X_train['text'], y_train)
random_search_nb.fit(X_train['text'], y_train)

# Print best parameters and scores
print("Logistic Regression best params:", random_search_lr.best_params_)
print("Logistic Regression best F1 score:", random_search_lr.best_score_)

print("SVC best params:", random_search_svc.best_params_)
print("SVC best F1 score:", random_search_svc.best_score_)

print("Random Forest best params:", random_search_rf.best_params_)
print("Random Forest best F1 score:", random_search_rf.best_score_)

print("Naive Bayes best params:", random_search_nb.best_params_)
print("Naive Bayes best F1 score:", random_search_nb.best_score_)

# Explanation:
# 1. We use RandomizedSearchCV to perform hyperparameter tuning for different models.
# 2. We define parameter grids for Logistic Regression, SVC, Random Forest, and Naive Bayes.
# 3. We use a pipeline to integrate TF-IDF vectorization and model training.
# 4. We use F1 score as the evaluation metric, suitable for imbalanced datasets.
# 5. We print the best parameters and corresponding F1 scores for each model.

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
import numpy as np

# Load test data
# Assuming test data is loaded in a DataFrame called `test_data`

# Handle missing values
imputer = SimpleImputer(strategy='constant', fill_value='missing')
test_data['keyword'] = imputer.fit_transform(test_data[['keyword']])
test_data['location'] = imputer.fit_transform(test_data[['location']])

# Encode categorical data
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoded_keywords = encoder.fit_transform(test_data[['keyword']])
encoded_locations = encoder.fit_transform(test_data[['location']])

# Feature extraction from text
vectorizer = TfidfVectorizer(max_features=5000)
X_test_text = vectorizer.fit_transform(test_data['text'])

# Combine all features
X_test_combined = np.hstack((X_test_text.toarray(), encoded_keywords, encoded_locations))

# Load the best model
# Assuming the best model is saved as `best_model.pkl`
import joblib
best_model = joblib.load('best_model.pkl')

# Predict on test data
predictions = best_model.predict(X_test_combined)

# Post-process predictions if necessary
# For example, adjust threshold if needed
# predictions = (best_model.predict_proba(X_test_combined)[:, 1] > 0.5).astype(int)

# Prepare submission
submission = pd.DataFrame({'id': test_data['id'], 'target': predictions})
submission.to_csv('./output/submission.csv', index=False)

# Explanation:
# 1. We handle missing values in 'keyword' and 'location' using SimpleImputer.
# 2. We encode 'keyword' and 'location' using OneHotEncoder.
# 3. We extract features from 'text' using TF-IDF.
# 4. We load the best model and predict on the test data.
# 5. We prepare the submission file according to the competition's requirements.