In [100]:
import spacy
import pandas as pd
import numpy as np
from spacy.util import minibatch
from spacy.training.example import Example
import random
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
import optuna
from optuna.samplers import TPESampler
from joblib import dump

In [63]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [64]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [65]:
train.describe()

Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


In [66]:
train["keyword"].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
sinking                  41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [67]:
train["location"].value_counts()

location
USA                    104
New York                71
United States           50
London                  45
Canada                  29
                      ... 
MontrÌ©al, QuÌ©bec       1
Montreal                 1
ÌÏT: 6.4682,3.18287      1
Live4Heed??              1
Lincoln                  1
Name: count, Length: 3341, dtype: int64

In [68]:
nlp = spacy.blank("en")

In [69]:
text_cat = nlp.add_pipe("textcat")

In [70]:
text_cat.add_label("not_disaster")
text_cat.add_label("disaster")

1

In [71]:
train_texts = train["text"].values
train_labels = [{"cats": {"not_disaster": label == 0, "disaster": label == 1}} for label in train["target"]]

In [72]:
train_data = list(zip(train_texts, train_labels))

In [73]:
len(train_data)

7613

In [74]:
0.2 * len(train_data)

1522.6000000000001

In [75]:
NUM_EPOCHS = 50
PATIENCE = 4

In [76]:
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
last_scores = []
for epoch in range(NUM_EPOCHS):
    random.shuffle(train_data)
    val_data = train_data[:1522]
    train_batches = train_data[1522:]
    batches = minibatch(train_batches, size=10)
    for batch in batches:
        for text, label in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, label)
            nlp.update([example], sgd=optimizer, losses=losses)
    val_docs = [nlp.tokenizer(data[0]) for data in val_data]
    textcat = nlp.get_pipe('textcat')
    probs = textcat.predict(val_docs)
    predicted_labels = probs.argmax(axis=1)
    score = f1_score([1 if data[1]["cats"]["disaster"] else 0 for data in val_data], predicted_labels)
    last_scores.append(score)
    print(f"Loss: {losses['textcat']}, Score: {score}")
    if len(last_scores) == PATIENCE:
        count = 0
        for i in range(PATIENCE - 1):
            if last_scores[0] > last_scores[i + 1]:
                count += 1
        if count == PATIENCE - 1:
            break
        last_scores.pop(0)


[0.6150907354345749]
Loss: 1358.025030974966, Score: 0.6150907354345749
[0.6150907354345749, 0.7142857142857143]
Loss: 2404.181656655929, Score: 0.7142857142857143
[0.6150907354345749, 0.7142857142857143, 0.7752053771471247]
Loss: 3242.1701252296034, Score: 0.7752053771471247
[0.6150907354345749, 0.7142857142857143, 0.7752053771471247, 0.8235294117647058]
Loss: 3955.957848990022, Score: 0.8235294117647058
[0.7142857142857143, 0.7752053771471247, 0.8235294117647058, 0.8546465448768864]
Loss: 4506.599670834137, Score: 0.8546465448768864
[0.7752053771471247, 0.8235294117647058, 0.8546465448768864, 0.8835725677830941]
Loss: 4959.281331146276, Score: 0.8835725677830941
[0.8235294117647058, 0.8546465448768864, 0.8835725677830941, 0.9084139985107967]
Loss: 5328.507772018813, Score: 0.9084139985107967
[0.8546465448768864, 0.8835725677830941, 0.9084139985107967, 0.9179834462001505]
Loss: 5663.874517079755, Score: 0.9179834462001505
[0.8835725677830941, 0.9084139985107967, 0.9179834462001505, 0.

In [77]:
test_docs = [nlp.tokenizer(text) for text in test["text"]]
textcat = nlp.get_pipe('textcat')
probs = textcat.predict(test_docs)
predicted_labels = probs.argmax(axis=1)

In [78]:
predicted_labels

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [79]:
submission = pd.concat([test["id"], pd.Series(predicted_labels)], axis=1)
submission.columns = ["id", "target"]

In [80]:
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,0
3,9,0
4,11,1


In [81]:
submission["target"].value_counts()

target
0    2042
1    1221
Name: count, dtype: int64

In [82]:
submission.to_csv("data/submission.csv", index=False)

In [87]:
nlp = spacy.load('en_core_web_lg')

In [93]:
with nlp.disable_pipes():
    text_embeddings = np.array([nlp(text).vector for text in train["text"]])
text_embeddings.shape

(7613, 300)

In [94]:
with nlp.disable_pipes():
    keyword_embeddings = np.array([nlp(str(text)).vector for text in train["keyword"]])
keyword_embeddings.shape

(7613, 300)

In [92]:
with nlp.disable_pipes():
    location_embeddings = np.array([nlp(str(text)).vector for text in train["location"]])
location_embeddings.shape

(7613, 300)

In [112]:
embeddings = np.concatenate([text_embeddings, keyword_embeddings, location_embeddings], axis=1)
# embeddings = text_embeddings

In [113]:
embeddings.shape

(7613, 900)

In [114]:
X_train, X_val, y_train, y_val = train_test_split(embeddings, train["target"], test_size=0.1, random_state=1)

In [115]:
cv = StratifiedKFold(5, shuffle=True, random_state=1)
target = train["target"].to_numpy()

In [117]:
run_optimization = False

def objective(trial):
    
    params = {
        'grow_policy': trial.suggest_categorical('grow_policy', ["depthwise", "lossguide"]),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'gamma' : trial.suggest_float('gamma', 1e-9, 0.5),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'max_depth': trial.suggest_int('max_depth', 0, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 100.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 100.0, log=True),
        
    }
    
    params['booster'] = 'gbtree'
    params['objective'] = 'binary:logistic'
    params["device"] = "cpu"
    params["verbosity"] = 0
    params['tree_method'] = "hist"

    cv_splits = cv.split(embeddings, y=train["target"])
    cv_scores = list()

    for train_idx, val_idx in cv_splits:
    
        xgb_model = xgb.XGBClassifier(**params)

        X_train_fold, X_val_fold = embeddings[train_idx], embeddings[val_idx]
        y_train_fold, y_val_fold = target[train_idx], target[val_idx]

        xgb_model.fit(X_train_fold, y_train_fold)

        y_val_pred = xgb_model.predict(X_val_fold)
        cv_scores.append(f1_score(train["target"].iloc[val_idx], y_val_pred))

    cv_evaluation = np.mean(cv_scores)
    
    return cv_evaluation

if run_optimization:
    
    sqlite_db = "sqlite:///sqlite.db"
    study_name = "disaster_tweets_classification"
    study = optuna.create_study(storage=sqlite_db, study_name=study_name, 
                                sampler=TPESampler(n_startup_trials=20, multivariate=False, seed=0),
                                direction="maximize", load_if_exists=True)

    study.optimize(objective, n_trials=100)
    best_cls_params = study.best_params
    best_value = study.best_value

else:
    
    best_value = 0.7543939986952639
    best_cls_params = {'grow_policy': 'depthwise', 
                       'n_estimators': 599, 
                       'learning_rate': 0.028803128737724382, 
                       'gamma': 0.22688396307743608, 
                       'subsample': 0.47210935600035175, 
                       'colsample_bytree': 0.6575932611006123, 
                       'max_depth': 6, 
                       'min_child_weight': 3, 
                       'reg_lambda': 1.690636802393145e-08, 
                       'reg_alpha': 3.50367253905031}

print(f"best optmized accuracy: {best_value:0.5f}")
print(f"best hyperparameters: {best_cls_params}")

best_cls_params['objective'] = 'binary:logistic'
best_cls_params['tree_method'] = "hist"
best_cls_params["device"] = "cpu"
best_cls_params["verbosity"] = 0

best optmized accuracy: 0.75439
best hyperparameters: {'grow_policy': 'depthwise', 'n_estimators': 599, 'learning_rate': 0.028803128737724382, 'gamma': 0.22688396307743608, 'subsample': 0.47210935600035175, 'colsample_bytree': 0.6575932611006123, 'max_depth': 6, 'min_child_weight': 3, 'reg_lambda': 1.690636802393145e-08, 'reg_alpha': 3.50367253905031}


In [118]:
xgb_model = xgb.XGBClassifier(**best_cls_params)
xgb_model.fit(X_train, y_train)

In [119]:
f1_score(y_val, xgb_model.predict(X_val))

0.7416107382550335

In [123]:
with nlp.disable_pipes():
    text_embeddings = np.array([nlp(text).vector for text in test["text"]])
    keyword_embeddings = np.array([nlp(str(text)).vector for text in test["keyword"]])
    location_embeddings = np.array([nlp(str(text)).vector for text in test["location"]])

In [127]:
print(text_embeddings.shape)
print(keyword_embeddings.shape)
print(location_embeddings.shape)

(3263, 300)
(3263, 300)
(3263, 300)


In [124]:
test_embeddings = np.concatenate([text_embeddings, keyword_embeddings, location_embeddings], axis=1)

In [125]:
submission = pd.concat([test["id"], pd.Series(xgb_model.predict(test_embeddings))], axis=1)
submission.columns = ["id", "target"]

In [126]:
submission.to_csv("data/submission.csv", index=False)