# Real or Not real

Predict whether a tweet describes a real disaster or not.

In [39]:
import pandas as pd
import os
import spacy
from spacy.util import minibatch, compounding
import random

# Raise pandas' display limits (rows, columns and console width) for this notebook.
DISPLAY_OPTIONS = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

## Utility Functions

In [34]:
def evaluate(tokenizer, textcat, texts, cats):
    """Compute precision, recall and F1 of a text classifier on held-out data.

    Only labels other than "NEGATIVE" that also appear in the gold annotation
    are scored; a score or gold value of at least 0.5 counts as positive.

    Parameters
    ----------
    tokenizer : callable
        Called once per text; its result is what ``textcat.pipe`` receives.
    textcat : object
        Must provide ``pipe(docs)`` yielding objects whose ``cats`` attribute
        maps label -> predicted score.
    texts : iterable
        Texts to classify.
    cats : sequence of dict
        Gold annotations, positionally aligned with ``texts``; each maps
        label -> truth value.

    Returns
    -------
    dict
        ``{"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}``.
    """
    # False positives / negatives start at a tiny epsilon so that neither
    # ratio below can divide by zero.
    true_pos, false_pos, false_neg, true_neg = 0.0, 1e-8, 1e-8, 0.0

    tokenized = (tokenizer(text) for text in texts)
    for index, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[index]
        for label, score in doc.cats.items():
            if label not in expected or label == "NEGATIVE":
                continue
            truth = expected[label]
            if score >= 0.5:
                if truth >= 0.5:
                    true_pos += 1.0
                elif truth < 0.5:
                    false_pos += 1.0
            elif score < 0.5:
                if truth < 0.5:
                    true_neg += 1
                elif truth >= 0.5:
                    false_neg += 1

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    denominator = precision + recall
    f_score = 0.0 if denominator == 0 else 2 * (precision * recall) / denominator
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

def prep_data(df_train, split=0.7, seed=None):
    """Shuffle labelled tweets and split them into a train and a held-out set.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a ``'text'`` column and a ``'target'`` column; each target
        is converted with ``bool()``.
    split : float, default 0.7
        Fraction of the rows placed in the first (training) partition.
    seed : int or None, default None
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. When None, the module-level ``random`` state is
        used, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``((train_texts, train_cats), (dev_texts, dev_cats))`` where the texts
        are tuples and the cats are lists of
        ``{"POSITIVE": bool, "NEGATIVE": bool}`` dicts aligned with the texts.
        An empty frame yields ``((), []), ((), [])``.
    """
    # Work on plain (text, label) pairs rather than on the dataframe.
    records = list(zip(df_train['text'].tolist(), df_train['target'].tolist()))

    # zip(*records) cannot be unpacked for zero rows, so answer directly.
    if not records:
        return ((), []), ((), [])

    # Shuffle before splitting so both partitions are drawn from the whole set.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(records)

    # Separate text and label
    texts, labels = zip(*records)
    cats = [{"POSITIVE": bool(label), "NEGATIVE": not bool(label)} for label in labels]

    # Index of the first held-out row; kept separate from the `split` fraction.
    cutoff = int(len(records) * split)
    return (texts[:cutoff], cats[:cutoff]), (texts[cutoff:], cats[cutoff:])

## Load Data and model

In [2]:
# Load the 'en_core_web_lg' spaCy pipeline; later cells attach the text
# classifier to this `nlp` object and use it for prediction.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Read the labelled training tweets; prep_data later uses the 'text' and
# 'target' columns of this frame.
df_train = pd.read_csv('train.csv')

In [9]:
# Report the size of the training set, then preview its first rows.
print('Total number of rows in train set:{}'.format(len(df_train)))
df_train.head()

Total number of rows in train set:7613


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [10]:
# Class balance: number of tweets per target value.
df_train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

## Classification with spaCy TextCategorizer

In [38]:
# Reuse the pipeline's text classifier when it already has one, so labels can
# be added to it; otherwise create one and append it as the last pipe.
# nlp.create_pipe works for built-ins that are registered with spaCy
if "textcat" in nlp.pipe_names:
    textcat = nlp.get_pipe("textcat")
else:
    textcat_config = {"exclusive_classes": True, "architecture": "simple_cnn"}
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    nlp.add_pipe(textcat, last=True)

In [37]:
# Register the two class labels on the text classifier. The names match the
# keys of the gold dicts built by prep_data and the 'POSITIVE' key read in
# calculate_target.
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

1

In [41]:
 # Shuffle the training tweets and split them into a training part and a
# held-out dev part (prep_data's default split fraction is 0.7).
print("Loading Tweets...")
(train_texts, train_cats), (dev_texts, dev_cats) = prep_data(df_train)

Loading Tweets...


In [42]:
# Pair every training text with its annotation wrapped as {"cats": ...}; the
# training cell unzips these pairs into the texts/annotations given to nlp.update.
train_data = [(text, {"cats": cats}) for text, cats in zip(train_texts, train_cats)]

In [44]:
# No tok2vec weights file is supplied: the training cell only opens and loads
# one when this value is not None.
init_tok2vec = None

In [45]:
# Train the text classifier and print loss / precision / recall / F1 on the
# dev split after every epoch.
#
# get names of other pipes to disable them during training
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  # only train textcat
    # The optimizer returned here is passed to every nlp.update call below and
    # its `averages` are reused for evaluation and when saving the model.
    optimizer = nlp.begin_training()
    # Optional warm start: skipped while init_tok2vec is None (set in an earlier cell).
    if init_tok2vec is not None:
        with init_tok2vec.open("rb") as file_:
            textcat.model.tok2vec.from_bytes(file_.read())
    print("Training the model...")
    print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
    # Created once, outside the epoch loop, and handed to minibatch every epoch.
    # NOTE(review): presumably yields batch sizes growing from 4 towards 32 by a
    # factor of 1.001 per batch — confirm against the spaCy docs.
    batch_sizes = compounding(4.0, 32.0, 1.001)
    # NOTE(review): the epoch count (100) is hard-coded and there is no early
    # stopping; the recorded output shows the dev F-score peaking in the second
    # epoch while the loss keeps shrinking, which looks like overfitting.
    for i in range(100):
        # Fresh dict each epoch, so the value printed below covers this epoch only.
        losses = {}
        # batch up the examples using spaCy's minibatch
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev split produced by prep_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
        )

Training the model...
LOSS 	  P  	  R  	  F  
10.853	0.803	0.732	0.766
0.845	0.800	0.749	0.774
0.188	0.781	0.745	0.762
0.058	0.773	0.755	0.764
0.026	0.769	0.761	0.765
0.012	0.753	0.755	0.754
0.008	0.750	0.752	0.751
0.008	0.742	0.748	0.745
0.006	0.737	0.744	0.740
0.006	0.742	0.733	0.738
0.004	0.735	0.741	0.738
0.004	0.738	0.737	0.738
0.004	0.743	0.738	0.741
0.003	0.735	0.738	0.737
0.004	0.729	0.733	0.731
0.003	0.730	0.732	0.731
0.003	0.732	0.732	0.732
0.003	0.729	0.726	0.727
0.002	0.726	0.722	0.724
0.003	0.725	0.713	0.719
0.002	0.730	0.717	0.723
0.002	0.731	0.712	0.721
0.003	0.735	0.720	0.727
0.004	0.740	0.723	0.731
0.003	0.734	0.724	0.729
0.002	0.736	0.724	0.730
0.002	0.732	0.722	0.727
0.002	0.730	0.719	0.724
0.002	0.733	0.717	0.725
0.003	0.734	0.715	0.725
0.003	0.738	0.717	0.727
0.002	0.741	0.720	0.730
0.002	0.738	0.719	0.728
0.002	0.736	0.723	0.729
0.002	0.732	0.725	0.728
0.002	0.732	0.715	0.724
0.002	0.735	0.714	0.725
0.002	0.736	0.713	0.725
0.002	0.732	0.713	0.723
0.002	0.733	0.708

In [46]:
# Persist the pipeline to the path 'first_model' with optimizer.averages
# applied — the same parameter set the training loop evaluated with.
with nlp.use_params(optimizer.averages):
    nlp.to_disk('first_model')

## Test the model

In [47]:
# Read the test tweets; a 'target' column is added to this frame further down.
df_test = pd.read_csv('test.csv')

In [48]:
# Preview the first rows of the test set.
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [54]:
def calculate_target(text, threshold=0.5):
    """Predict the binary label for one tweet with the notebook's ``nlp`` pipeline.

    Parameters
    ----------
    text : str
        Raw tweet text; it is passed to the module-level ``nlp`` object.
    threshold : float, default 0.5
        Minimum 'POSITIVE' score that counts as a positive prediction.

    Returns
    -------
    int
        1 when the 'POSITIVE' score is at least ``threshold``, otherwise 0.

    Raises
    ------
    KeyError
        If the processed document has no 'POSITIVE' category.
    """
    doc = nlp(text)

    # `>=` (not `>`) so a score exactly on the threshold is labelled the same
    # way `evaluate` counts it when computing precision and recall.
    return 1 if doc.cats['POSITIVE'] >= threshold else 0

In [56]:
# Predict a label for every test tweet and store it in a new 'target' column.
df_test['target'] = df_test['text'].apply(calculate_target)

In [64]:
# Inspect up to the first 100 test tweets together with their predicted target.
df_test.head(100)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1
5,12,,,We're shaking...It's an earthquake,0
6,21,,,They'd probably still show more life than Arse...,0
7,22,,,Hey! How are you?,0
8,27,,,What a nice hat?,0
9,29,,,Fuck off!,0


In [75]:
# Write output
df_submission = df_test[['id','target']]