In [None]:
import pandas as pd
import numpy as np
import torch

In [None]:
# dataset WNUT 2016
# data from https://autonlp.ai/datasets/wnut-2016
wnut_2016_train = pd.read_csv("./NER_data/WNUT 2016 train.txt", sep="\t", header=None, names=["words", "labels"])
wnut_2016_test = pd.read_csv("./NER_data/WNUT 2016 test.txt", sep="\t", header=None, names=["words", "labels"])

In [None]:
wnut_2016 = pd.concat([wnut_2016_train, wnut_2016_test], axis=0)
wnut_2016.head()

Unnamed: 0,words,labels
0,@SammieLynnsMom,O
1,@tg10781,O
2,they,O
3,will,O
4,be,O


In [None]:
wnut_2016.head(20)

Unnamed: 0,words,labels
0,@SammieLynnsMom,O
1,@tg10781,O
2,they,O
3,will,O
4,be,O
5,all,O
6,done,O
7,by,O
8,Sunday,O
9,trust,O


In [None]:
len(wnut_2016)

86401

In [None]:
wnut_2016["labels"].unique()

array(['O', 'B-geo-loc', 'B-facility', 'I-facility', 'B-movie', 'I-movie',
       'B-company', 'B-product', 'B-person', 'B-sportsteam',
       'I-sportsteam', 'I-product', 'B-other', 'I-other', 'I-company',
       'I-person', 'I-geo-loc', 'B-tvshow', 'B-musicartist',
       'I-musicartist', 'I-tvshow'], dtype=object)

In [None]:
# adding a sentence_id-column
m = wnut_2016["words"].str.contains("@").cumsum()
wnut_2016["sentence_id"] = wnut_2016.groupby(m).ngroup() + 0

wnut_2016.head()

Unnamed: 0,words,labels,sentence_id
0,@SammieLynnsMom,O,0.0
1,@tg10781,O,1.0
2,they,O,1.0
3,will,O,1.0
4,be,O,1.0


In [None]:
labels_2016 = wnut_2016["labels"].unique()
print(labels_2016)

['O' 'B-geo-loc' 'B-facility' 'I-facility' 'B-movie' 'I-movie' 'B-company'
 'B-product' 'B-person' 'B-sportsteam' 'I-sportsteam' 'I-product'
 'B-other' 'I-other' 'I-company' 'I-person' 'I-geo-loc' 'B-tvshow'
 'B-musicartist' 'I-musicartist' 'I-tvshow']


In [None]:
#add sentence-ID column, increment at every "@-symbol"
# ner_csv.loc[0, "sentence-ID"] = 1
# for row in ner_csv.iterrows():
#     if "@" in ner_csv.loc[row, "words"]:
#         ner_csv.loc[row, "sentence-ID"] = ner_csv.loc[row-1, "words"] +1
    #else:
      #  ner_csv["words"][row] = ner_csv["words"][row-1]

In [None]:
# loading WNUT 2017 dataset
# https://github.com/juand-r/entity-recognition-datasets/blob/master/data/WNUT17/CONLL-format/data/train/wnut17train.conll
#wnut_2017 = pd.read_csv("./wnut17train.conll")

def read_conll(filename):
    df = pd.read_csv(filename,
                    sep = '\t', header = None, keep_default_na = False,
                    names = ['TOKEN', 'POS', 'CHUNK', 'NE'],
                    quoting = 3, skip_blank_lines = False)
    df['SENTENCE'] = (df.TOKEN == '').cumsum()
    return df[df.TOKEN != '']

wnut_2017_train = read_conll("./NER_data/wnut17train.conll")
wnut_2017_test = read_conll("./NER_data/wnut17test.conll")

In [None]:
wnut_2017 = pd.concat([wnut_2017_train, wnut_2017_test], axis=0)
wnut_2017 = wnut_2017[["TOKEN", "POS", "SENTENCE"]]
wnut_2017 = wnut_2017.rename(columns={"TOKEN" : "words", "POS" : "labels", "SENTENCE" : "sentence_id"})

In [None]:
labels_2017 = wnut_2017["labels"].unique()
print(labels_2017)

['O' 'B-location' 'I-location' 'B-group' 'B-corporation' 'B-person'
 'B-creative-work' 'B-product' 'I-person' 'I-creative-work'
 'I-corporation' 'I-group' 'I-product']


In [None]:
wnut_2017.head()

Unnamed: 0,words,labels,sentence_id
0,@paulwalk,O,0
1,It,O,0
2,'s,O,0
3,the,O,0
4,view,O,0


In [None]:
wnut_all = pd.concat([wnut_2016, wnut_2017], axis=0)
wnut_all.head()

Unnamed: 0,words,labels,sentence_id
0,@SammieLynnsMom,O,0.0
1,@tg10781,O,1.0
2,they,O,1.0
3,will,O,1.0
4,be,O,1.0


In [None]:
wnut_all = wnut_all.dropna()

In [None]:
len(wnut_all)

172503

In [None]:
wnut_all.tail()

Unnamed: 0,words,labels,sentence_id
66118,39,O,3393
66119,GMT+0000,O,3393
66120,(,O,3393
66121,UTC,O,3393
66122,),O,3393


In [None]:
# read in the synthetically created data
#df_syn = pd.read_csv("./NER_data/synthetic_tweet_data3.csv")
df_syn = pd.read_csv("/content/synthetic_tweet_data3.csv")
df_syn.head()

Unnamed: 0,token,label,sentence_id
0,Is,O,0
1,it,O,0
2,just,O,0
3,me,O,0
4,or,O,0


In [None]:
df_syn["label"].unique()

array(['O', 'B-PRODUCT', 'B-COMPANY', 'B-PERSON'], dtype=object)

In [None]:
df_syn = df_syn.rename(columns={"token" : "words", "label" : "labels"})
df_syn["labels"] = df_syn["labels"].replace({"B-PRODUCT" : "B-product", "B-COMPANY" : "B-company", "B-PERSON" : "B-person"})

In [None]:
df_syn.head()

Unnamed: 0,words,labels,sentence_id
0,Is,O,0
1,it,O,0
2,just,O,0
3,me,O,0
4,or,O,0


In [None]:
# loading tweetner17 dataset from huggingface
# https://huggingface.co/datasets/tner/tweetner7
from datasets import load_dataset

ds = load_dataset("tner/tweetner7")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
print(ds)

DatasetDict({
    test_2020: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 576
    })
    test_2021: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 2807
    })
    validation_2020: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 576
    })
    validation_2021: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 310
    })
    train_2020: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 4616
    })
    train_2021: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 2495
    })
    train_all: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 7111
    })
    validation_random: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 576
    })
    train_random: Dataset({
        features: ['tokens', 'tags', 'id', 'date'],
        num_rows: 4616
    })
  

In [None]:
tweetner7_train = ds["train_2020"].to_pandas()
tweetner7_test = ds["test_2020"].to_pandas()

In [None]:
tweetner7_train.head()

Unnamed: 0,tokens,tags,id,date
0,"[Morning, 5km, run, with, {{USERNAME}}, for, b...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 2, 14...",1183344337016381440,2019-10-13
1,"[President, Trump, Arrives, at, UFC, 244, in, ...","[5, 12, 14, 14, 0, 7, 14, 4, 11, 14, 14, 14, 1...",1190961319538765824,2019-11-03
2,"["", I, 've, been, in, law, enforcement, for, 2...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1267032593339486209,2020-05-31
3,"[I, got, mine, yesterday, !, ****, Doctors, sa...","[14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 1...",1180717545935925248,2019-10-06
4,"[Mayo, Breast, Cancer, Vaccine, Could, Be, Ava...","[6, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14...",1183251744601587712,2019-10-13


## bert

In [None]:
# tutorial with bert
# https://github.com/karndeepsingh/Named-Entity-Recognition/blob/main/NAMED%20ENTITY%20RECOGNITION.ipynb

In [None]:
#pip install simpletransformers

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
df_syn["sentence_id"] = LabelEncoder().fit_transform(df_syn["sentence_id"])

In [None]:
X = df_syn[["sentence_id","words"]]
Y = df_syn["labels"]

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
x_train.head()

Unnamed: 0,sentence_id,words
14220,1014,Honestly
91125,6504,be
426021,30483,I
104037,7420,new
349990,25017,great


In [None]:
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [None]:
train_data

Unnamed: 0,sentence_id,words,labels
14220,1014,Honestly,O
91125,6504,be,O
426021,30483,I,O
104037,7420,new,O
349990,25017,great,O
...,...,...,...
301789,21562,with,O
175253,12528,be,O
464050,33210,overrated,O
161667,11543,Jobs,I-person


In [None]:
#!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.39.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets-

In [None]:
from simpletransformers.ner import NERModel,NERArgs

In [None]:
label = df_syn["labels"].unique().tolist()
label

['O', 'B-product', 'B-company', 'B-person', 'I-person', 'I-product']

In [None]:
args = NERArgs()
# epochs = times the training data is ran through the network
args.num_train_epochs = 3
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

In [None]:
model = NERModel('bert', 'bert-base-cased', labels=label, args=args, use_cuda=False)
# set use_cuda to true if gpu is available

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [None]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/2 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1054 [00:00<?, ?it/s]

In [None]:
result

{'eval_loss': 3.4305521306537448,
 'precision': 0.0015437152079337607,
 'recall': 0.017818574514038878,
 'f1_score': 0.002841275991217874}

In [None]:
prediction, model_output = model.predict(["Tesla just came out with a new model, the Tesla-7b, which is electric. I love that!"])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
prediction

[[{'Tesla': 'B-product'},
  {'just': 'B-product'},
  {'came': 'I-product'},
  {'out': 'I-product'},
  {'with': 'B-product'},
  {'a': 'B-person'},
  {'new': 'I-product'},
  {'model,': 'B-company'},
  {'the': 'B-person'},
  {'Tesla-7b,': 'O'},
  {'which': 'I-product'},
  {'is': 'I-product'},
  {'electric.': 'B-product'},
  {'I': 'B-person'},
  {'love': 'I-product'},
  {'that!': 'I-product'}]]

In [2]:
# trying out model that is RoBERTa version trained with tweetner (includes the wanted entities)
# tner/roberta-base-tweetner7-all

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("tner/roberta-base-tweetner7-all")

model = AutoModelForSequenceClassification.from_pretrained("tner/roberta-base-tweetner7-all")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at tner/roberta-base-tweetner7-all and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
#pip install tner

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

model_name = "tner/roberta-base-tweetner7-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

text = "Apple just launched the new iPhone, and Microsoft released a new version of Windows."

entities = ner_pipeline(text)
print(entities)



[{'entity': 'B-corporation', 'score': 0.5353077, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}, {'entity': 'B-product', 'score': 0.65631783, 'index': 6, 'word': 'ĠiPhone', 'start': 28, 'end': 34}, {'entity': 'B-corporation', 'score': 0.7064, 'index': 9, 'word': 'ĠMicrosoft', 'start': 40, 'end': 49}, {'entity': 'B-product', 'score': 0.62058973, 'index': 15, 'word': 'ĠWindows', 'start': 76, 'end': 83}]


In [5]:
filtered_entities = [{"word": entity["word"], "entity": entity["entity"]} for entity in entities]
print(filtered_entities)

[{'word': 'Apple', 'entity': 'B-corporation'}, {'word': 'ĠiPhone', 'entity': 'B-product'}, {'word': 'ĠMicrosoft', 'entity': 'B-corporation'}, {'word': 'ĠWindows', 'entity': 'B-product'}]


In [6]:
# testing NER on Brand Sentiment Analysis data
import pandas as pd
df_brd_sa = pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Train.csv")
df_brd_sa.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [7]:
# applying pre-processing function before model application from https://huggingface.co/tner/roberta-base-tweetner7-all
import re
from urlextract import URLExtract

extractor = URLExtract()

def format_tweet(tweet):
    # mask web urls
    urls = extractor.find_urls(tweet)
    for url in urls:
        tweet = tweet.replace(url, "{{URL}}")
    # format twitter account
    tweet = re.sub(r"\b(\s*)(@[\S]+)\b", r'\1{\2@}', tweet)
    return tweet

In [8]:
df_brd_sa_sm = df_brd_sa[:100]

df_brd_sa_sm.dropna()

df_brd_sa_sm["tweet_text"] = df_brd_sa_sm.apply(lambda x: format_tweet(str(x["tweet_text"])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brd_sa_sm["tweet_text"] = df_brd_sa_sm.apply(lambda x: format_tweet(str(x["tweet_text"])), axis=1)


In [9]:
def ner(txt):
    entities = ner_pipeline(txt)
    entities = [{"word": entity["word"], "entity": entity["entity"]} for entity in entities]
    return entities

In [14]:
df_brd_sa["entities"] = df_brd_sa.apply(lambda x: ner(str(x["tweet_text"])), axis=1)

In [None]:
df_brd_sa

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,entities
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,"[{'word': '.', 'entity': 'B-person'}, {'word':..."
1,@jessedee Know about {@fludapp@} ? Awesome iPa...,iPad or iPhone App,Positive emotion,"[{'word': '@', 'entity': 'B-person'}, {'word':..."
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,"[{'word': '@', 'entity': 'B-person'}, {'word':..."
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,"[{'word': 'Ġfestival', 'entity': 'B-event'}, {..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,"[{'word': 'xt', 'entity': 'I-event'}, {'word':..."
...,...,...,...,...
95,GSD&amp;M &amp; Google's Industry Party Tonigh...,,No emotion toward brand or product,"[{'word': 'ĠGoogle', 'entity': 'B-corporation'..."
96,New buzz? &quot;@mention Google to Launch Majo...,,No emotion toward brand or product,"[{'word': 'ment', 'entity': 'I-corporation'}, ..."
97,Headline: &quot;#iPad 2 is the Must-Have Gadge...,iPad,Positive emotion,"[{'word': 'i', 'entity': 'B-product'}, {'word'..."
98,.@mention &quot;Google launched checkins a mon...,Google,Positive emotion,"[{'word': '.', 'entity': 'B-person'}, {'word':..."


In [None]:
reslist = list(df_brd_sa[["tweet_text", "entities"]].values)
print(reslist)

[array(['.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.',
       list([{'word': '.', 'entity': 'B-person'}, {'word': '@', 'entity': 'I-person'}, {'word': 'w', 'entity': 'I-person'}, {'word': 'esley', 'entity': 'I-person'}, {'word': '83', 'entity': 'I-person'}, {'word': 'Ġ3', 'entity': 'B-product'}, {'word': 'G', 'entity': 'I-product'}, {'word': 'ĠiPhone', 'entity': 'B-product'}, {'word': 'Ġ#', 'entity': 'B-location'}, {'word': 'ISE', 'entity': 'I-location'}, {'word': '_', 'entity': 'I-location'}, {'word': 'Austin', 'entity': 'I-location'}, {'word': 'Ġ#', 'entity': 'B-location'}, {'word': 'S', 'entity': 'I-location'}, {'word': 'X', 'entity': 'I-location'}, {'word': 'SW', 'entity': 'I-location'}])],
      dtype=object), array(["@jessedee Know about {@fludapp@} ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",
       list([{'word': '@', 'entity': '

In [None]:
# json allows entities-column to be stored in the CSV
import json
df_brd_sa["entities"] = df_brd_sa["entities"].apply(json.dumps)

In [16]:
df_brd_sa.to_csv("./data after NER.csv")