# Zero Shot Text Classification Model
**Model Description** - Bart with a classification head trained on MNLI.

Sequences are posed as NLI premises and topic labels are turned into hypotheses, e.g. business -> "This text is about business."

In [1]:
import string
from pathlib import Path

import eland as ed
from eland.conftest import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import preprocessor as prep
import seaborn as sns
from tqdm.auto import tqdm
from transformers import BartForSequenceClassification, BartTokenizer

# Never truncate cell contents when a frame is displayed, so whole tweets stay readable.
# `None` is the supported "no limit" value; the legacy `-1` is deprecated since pandas 1.0
# and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

## Importing the Data from Elasticsearch

In [14]:
# Eland frame over the `twitter` index on localhost; only the `full_text_trans`
# column is requested.
ed_df = ed.DataFrame('localhost', 'twitter', columns=['full_text_trans'])

# Restrict to original tweets: a document has to satisfy both term clauses,
# i.e. is_retweet=false AND is_quote_status=false.
not_retweet = {"term": {"is_retweet": "false"}}
not_quote = {"term": {"is_quote_status": "false"}}
query_unique = {"bool": {"must": not_retweet, "filter": not_quote}}

# Run the query through Eland, pull the matching records into pandas and
# discard rows with missing values.
df_ed = ed_df.es_query(query_unique)
df_tweets = df_ed.to_pandas().dropna()

In [15]:
# Sanity check: (rows, columns) of the frame after dropping rows with missing values.
df_tweets.shape

(108220, 1)

## Basic Tweet Preprocessing
- Remove URLs, reserved words (e.g. RT), emojis and smileys
- Remove # and @ symbols
- Remove tweets that contain no tokens after cleaning


In [16]:
## Set options for the tweet-preprocessor: strip URLs, reserved words, emojis and smileys
prep.set_options(prep.OPT.URL, prep.OPT.RESERVED, prep.OPT.EMOJI, prep.OPT.SMILEY)

## Translation table that deletes '#' and '@'; built once instead of on every call
_SYMBOLS_TO_STRIP = str.maketrans('', '', '#@')


def clean_tweet(text):
    """Clean a raw tweet and remove the '#' and '@' symbols.

    The text is first passed through ``prep.clean`` (configured by the
    ``prep.set_options`` call above); afterwards every '#' and '@' character
    is deleted, so the hashtag / mention word itself is kept.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text as a string.
    """
    return prep.clean(text).translate(_SYMBOLS_TO_STRIP)

In [17]:
# Cleaned copy of every tweet, stored next to the original text.
df_tweets['full_text_processed'] = df_tweets['full_text_trans'].apply(clean_tweet)

In [18]:
# Token count per cleaned tweet (tokens = whitespace-separated pieces).
df_tweets['length'] = df_tweets['full_text_processed'].apply(lambda text: len(text.split()))

# Keep only the tweets that still contain at least one token after cleaning.
has_tokens = df_tweets['length'] != 0
df_tweets = df_tweets[has_tokens]

In [19]:
# Rows left after removing zero-token tweets; the columns now also include
# `full_text_processed` and `length`.
df_tweets.shape

(106937, 3)

In [20]:
# Preview: original text, cleaned text and token count for the first rows.
df_tweets.head()

Unnamed: 0,full_text_trans,full_text_processed,length
1263793772833505280,"#Covid19 #MigrantLabourers #Amphan and now the #planecrash near Karachi. This Eid turning out to be a perfect storm of tragedies. Also shows how nothing matters and people's deaths are mere statistics, one set of numbers replaces another. Human race to become more psychopathic.","Covid19 MigrantLabourers Amphan and now the planecrash near Karachi. This Eid turning out to be a perfect storm of tragedies. Also shows how nothing matters and people's deaths are mere statistics, one set of numbers replaces another. Human race to become more psychopathic.",43
1263793968963149824,"22 parties call upon Centre, seek declaration of cyclone Amphan as natural calamity | India News \nhttps://t.co/LykdBolAal","22 parties call upon Centre, seek declaration of cyclone Amphan as natural calamity | India News",16
1263794044691415040,@narendramodi Sir\nIt is nice aspect of u visted Amphan cyclone affected states of odissa/W Bengal and expressed problems of people.\nUr moral support to them will remembered ever.\n Praying God to be with people affected for thier up coming early.the people will recognize ur visit and help.\n🙏,narendramodi Sir It is nice aspect of u visted Amphan cyclone affected states of odissa/W Bengal and expressed problems of people. Ur moral support to them will remembered ever. Praying God to be with people affected for thier up coming early.the people will recognize ur visit and help.,48
1263793570474934272,"The famous South Asian out-of-the-box thinking kicking in as one tries to talk to people in Amphan-ravaged areas with zero or poor connectivity . No power, no internet, no phone connection but still one tries and sometimes succeed.","The famous South Asian out-of-the-box thinking kicking in as one tries to talk to people in Amphan-ravaged areas with zero or poor connectivity . No power, no internet, no phone connection but still one tries and sometimes succeed.",38
1263793365843431424,"News: Cyclone Amphan (satellite image shown) impacts eastern India and Bangladesh, killing over one hundred people and forcing the evacuation of more than four million others https://t.co/I7sFGNx738","News: Cyclone Amphan (satellite image shown) impacts eastern India and Bangladesh, killing over one hundred people and forcing the evacuation of more than four million others",26


## GPU approach using Transformers Pipeline

In [21]:
from transformers import pipeline

In [49]:
# Zero-shot classification pipeline placed on GPU 0 (`device=0`).
# The checkpoint is named explicitly: it is the one the pipeline resolved to by default
# (see the config log below), so results stay the same if the library default changes.
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', device=0)

loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/facebook/bart-large-mnli/config.json from cache at /home/ubuntu/.cache/torch/transformers/a35b79dc26c2f371a0e19eae44d91c0a0281a5db09044517d2675703791ee3c5.746d7ef19ade685cd3ee03f131a96fab513947c26179546289ddf02a6ac683ce
Model config BartConfig {
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std":

In [22]:
# Candidate topic labels for zero-shot classification.
# Kept as a plain list: building it from a set literal gave an arbitrary order that
# could differ between kernel sessions. All entries are unique.
TERMS = ['sympathy', 'criticism', 'hope', 'job', 'relief measures', 'compensation', 'evacuation', 'ecosystem', 'government',
         'corruption', 'news updates', 'volunteers', 'donation', 'cellular network', 'housing', 'farm', 'utilities',
         'water supply', 'power supply', 'food supply', 'medical assistance', 'coronavirus', 'petition', 'poverty']

In [51]:
%timeit classifier(df_tweets['full_text_processed'][0], TERMS, multi_class=True)

98.8 ms ± 167 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
def get_all_labels(x, terms=TERMS):
    """Score one text against every candidate topic with the zero-shot classifier.

    Args:
        x: Text to classify (a single cleaned tweet).
        terms: Candidate topic labels; defaults to the module-level ``TERMS``
            as it was when this function was defined.

    Returns:
        A list of ``(label, score)`` tuples, one per entry of the classifier's
        ``labels`` / ``scores`` output and in that order, with each score
        rounded to two decimals. No threshold is applied: every label is kept.
    """
    # NOTE(review): newer transformers releases rename `multi_class` to
    # `multi_label` — confirm against the installed version before upgrading.
    result = classifier(x, terms, multi_class=True)
    return [
        (label, np.round(score, 2))
        for label, score in zip(result['labels'], result['scores'])
    ]

In [None]:
# Register `progress_apply` on pandas objects so the long run shows a progress bar.
tqdm.pandas()

# Score every cleaned tweet against all candidate topics. The result is bound to a
# variable first, so the slow model pass does not have to be repeated if the export fails.
zstc_labels = df_tweets['full_text_processed'].dropna().progress_apply(lambda x: get_all_labels(x, TERMS))

# Make sure the target directory exists, then write one JSON entry per index value.
labels_path = Path('../models/zstc_labels.json')
labels_path.parent.mkdir(parents=True, exist_ok=True)
zstc_labels.to_json(labels_path, orient='index')

HBox(children=(FloatProgress(value=0.0, max=106953.0), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed