# Data Analysis and Processing for Sentiment Analysis

## 1.6 million random tweets

In [2]:
import pandas as pd

# 1.6 million random tweets, downloaded from Kaggle:
# https://www.kaggle.com/datasets/i191796majid/tweets
df_rd_tweets = pd.read_csv(
    "./analysis_data/1.6 million random tweets/train.csv/train.csv"
)
df_rd_tweets.head()

In [3]:
# number of rows per value of the "sentiment" column
df_rd_tweets["sentiment"].value_counts()

In [4]:
# search terms that are counted in every candidate dataset below
words = [
    "Tesla", "Apple", "iPhone", "MacBook",
    "AirPods", "Microsoft", "Windows", "Meta",
    "Twitter", "X", "Facebook", "Instagram",
    "Elon Musk", "Bill Gates", "Steve Jobs", "Tim Cook",
]

In [5]:
# for every search term: number of tweets whose text contains it
# (case-sensitive substring match, so e.g. "X" also counts any word containing a capital X)
d_words = {
    term: df_rd_tweets["tweet"].str.contains(term).sum()
    for term in words
}

print(d_words)

## Reddit r/Technology

In [6]:
# comments from the subreddit r/Technology (Kaggle)
# https://www.kaggle.com/datasets/thedevastator/uncovering-technology-insights-through-reddit-di
df_rtechnology = pd.read_csv(
    "./analysis_data/reddit technology.csv"
)
df_rtechnology.head()

In [7]:
# number of rows in the r/Technology dataset
len(df_rtechnology)

In [8]:
# histogram of the "score" column
df_rtechnology["score"].hist()

In [9]:
# share of rows with a score below 1000
len(df_rtechnology[df_rtechnology["score"]<1000])/len(df_rtechnology)

In [10]:
# for every search term: number of post titles that contain it
# (case-sensitive substring match)
d_words2 = {
    term: df_rtechnology["title"].str.contains(term).sum()
    for term in words
}

print(d_words2)

## Big Tech Companies - Tweet Sentiment

In [11]:
# Big Tech Companies - Tweet Sentiment (Kaggle)
# https://www.kaggle.com/datasets/wjia26/big-tech-companies-tweet-sentiment/data
import pandas as pd

# load, then remove rows with missing values and exact duplicate rows
df_tweets_bigtech = (
    pd.read_csv("./analysis_data/Big Tech Companies - Tweet Sentiment/Bigtech - 12-07-2020 till 19-09-2020/Bigtech - 12-07-2020 till 19-09-2020.csv")
    .dropna()
    .drop_duplicates()
)
df_tweets_bigtech.head()

In [12]:
# number of rows left after dropping missing values and duplicates
len(df_tweets_bigtech)

In [13]:
# number of rows per distinct "created_at" value
df_tweets_bigtech["created_at"].value_counts()

In [14]:
# value range of the polarity score
min_polarity, max_polarity = (
    df_tweets_bigtech["polarity"].min(),
    df_tweets_bigtech["polarity"].max(),
)
print(min_polarity, max_polarity)

In [15]:
# plotting polarity to figure out thresholds for the sentiment column
df_tweets_bigtech["polarity"].hist()

In [16]:
# 0 negative
# 1 neutral
# 2 positive
# function converting polarity to sentiment
def polarity_to_sentiment(polarity):
    """Map a polarity score to a sentiment class.

    Returns 2 (positive) for scores above 0.2, 1 (neutral) for scores in
    the closed interval [-0.2, 0.2] and 0 (negative) for everything else.
    A NaN score fails both comparisons and therefore ends up as 0.
    """
    neutral_band = 0.2
    if polarity > neutral_band:
        return 2
    if -neutral_band <= polarity <= neutral_band:
        return 1
    return 0
    
# derive the numeric sentiment class (0/1/2) from the polarity score of every tweet
df_tweets_bigtech["labels"] = df_tweets_bigtech["polarity"].apply(polarity_to_sentiment)

In [17]:
# class distribution of the derived sentiment labels
df_tweets_bigtech["labels"].value_counts()

In [18]:
# balanced sample: 4700 tweets per sentiment class (0 = negative, 1 = neutral, 2 = positive)
bigtech_negative, bigtech_neutral, bigtech_positive = (
    df_tweets_bigtech[df_tweets_bigtech["labels"] == label].sample(4700, random_state=42)
    for label in (0, 1, 2)
)

# stack the three classes, shuffle the rows (.sample with frac=1) and keep only text + label
df_tweets_bigtech_sample = (
    pd.concat([bigtech_negative, bigtech_neutral, bigtech_positive], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index()
)[["text", "labels"]]
df_tweets_bigtech_sample.head()

In [19]:
# size of the balanced sample
len(df_tweets_bigtech_sample)

In [20]:
# write the balanced sample as a JSON array with one object per row
df_tweets_bigtech_sample.to_json("tweets_bigtech_sample.json", orient="records")

In [21]:
# creating data for the final application that does not overlap with the training data

# texts already used in the training sample
corpus_texts = set(df_tweets_bigtech_sample["text"].unique())
all_texts = df_tweets_bigtech["text"].unique()
remaining_texts = [text for text in all_texts if text not in corpus_texts]

# keep only the rows whose text never occurs in the training sample
unseen_rows = df_tweets_bigtech["text"].isin(remaining_texts)
remaining_df = df_tweets_bigtech[unseen_rows]

# draw 10000 of the unseen rows; .sample also shuffles them
df_tweets_bigtech_app = remaining_df.sample(10000, random_state=42).reset_index()

# whitespace tokenisation of every tweet
df_tweets_bigtech_app["tokens"] = df_tweets_bigtech_app["text"].map(str.split)
df_tweets_bigtech_app = df_tweets_bigtech_app[["text", "tokens", "labels"]]

df_tweets_bigtech_app.head()

In [22]:
# write the 10k application tweets as a JSON array with one object per row
df_tweets_bigtech_app.to_json("tweets_bigtech_10k_application.json", orient="records")

In [23]:
# for every search term: number of Big Tech tweets that contain it
# (case-sensitive substring match)
d_words3 = {
    term: df_tweets_bigtech["text"].str.contains(term).sum()
    for term in words
}

print(d_words3)

## Brand Sentiment Analysis

In [24]:
# Brand Sentiment Analysis Dataset (Kaggle)
# https://www.kaggle.com/datasets/tusharpaul2001/brand-sentiment-analysis-dataset?select=Dataset+-+Train.csv
# keep only the tweet text and its annotation, then drop incomplete and duplicate rows
df_brd_sa = (
    pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Train.csv")
    [["tweet_text", "is_there_an_emotion_directed_at_a_brand_or_product"]]
    .dropna()
    .drop_duplicates()
)
df_brd_sa.head()

In [25]:
# number of usable rows in the brand sentiment training file
len(df_brd_sa)

In [26]:
# load the test file of the same dataset and show its number of rows
df_brd_sa_test = pd.read_csv("./analysis_data/Brand Sentiment Analysis Dataset/Dataset - Test.csv")
len(df_brd_sa_test)

In [27]:
# the test file has no labels, so it is not used further
df_brd_sa_test.head()

In [28]:
import re
from urlextract import URLExtract

extractor = URLExtract()

def format_tweet(tweet):
    """Mask URLs and wrap @-mentions in a tweet.

    Every URL found by the module-level ``extractor`` is replaced with the
    placeholder ``{{URL}}``. Afterwards mentions are rewritten from
    ``@name`` to ``{@name@}``.

    Args:
        tweet: The tweet text as a string.

    Returns:
        The formatted tweet text.
    """
    # mask web urls
    # Longest URL first: if one extracted URL is a prefix of another
    # ("x.com" and "x.com/abc"), replacing the short one first would leave
    # "{{URL}}/abc" behind and the long URL would never be found again.
    for url in sorted(set(extractor.find_urls(tweet)), key=len, reverse=True):
        tweet = tweet.replace(url, "{{URL}}")
    # format twitter account
    # The pattern needs a word boundary before the optional whitespace, so a
    # mention at the very start of the text is left unchanged.
    tweet = re.sub(r"\b(\s*)(@[\S]+)\b", r'\1{\2@}', tweet)
    return tweet

In [29]:
#df_brd_sa["tweet_text"] = df_brd_sa.apply(lambda x: format_tweet(str(x["tweet_text"])), axis=1)

In [30]:
# distribution of the original annotation values
df_brd_sa["is_there_an_emotion_directed_at_a_brand_or_product"].value_counts()

In [31]:
# converting labels to numeric (0 = negative, 1 = neutral, 2 = positive);
# "I can't tell" is folded into the neutral class
brand_label_map = {
    "Negative emotion": 0,
    "Positive emotion": 2,
    "No emotion toward brand or product": 1,
    "I can't tell": 1,
}
# .map instead of .replace: replacing every string of an object column with
# ints relies on pandas' deprecated implicit downcasting in replace().
# With .map an annotation that is missing from the mapping becomes NaN
# instead of silently staying a string in the label column.
df_brd_sa["labels"] = df_brd_sa["is_there_an_emotion_directed_at_a_brand_or_product"].map(brand_label_map)

In [32]:
# rename the text column and reduce the frame to text + numeric label
df_brd_sa = df_brd_sa.rename(columns={"tweet_text" : "text"})[["text", "labels"]]

In [33]:
# class distribution after converting the annotations to numeric labels
df_brd_sa["labels"].value_counts()

In [34]:
# creating a sample with equal sentiment distribution: 500 tweets per class
brd_sa_negative, brd_sa_neutral, brd_sa_positive = (
    df_brd_sa[df_brd_sa["labels"] == label].sample(500, random_state=42)
    for label in (0, 1, 2)
)

# stack the classes, shuffle the rows (.sample with frac=1) and keep only text + label
df_brd_sa_sample = (
    pd.concat([brd_sa_negative, brd_sa_neutral, brd_sa_positive], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index()
)[["text", "labels"]]
df_brd_sa_sample.head()

In [35]:
# size of the balanced brand sentiment sample
len(df_brd_sa_sample)

In [36]:
#df_brd_sa.to_csv("./SA_data/brand_sentiment_analysis_preprocessed.csv", index=False)

In [37]:
# for every search term: number of brand sentiment tweets that contain it
# (case-sensitive substring match)
d_words4 = {
    term: df_brd_sa["text"].str.contains(term).sum()
    for term in words
}

print(d_words4)

## Tweets Sentiment Classification

In [38]:
# Tweets Sentiment Classification 
# https://www.kaggle.com/datasets/bhrt97/tweets-sentiment-classification?resource=download
# only the labelled training file is loaded; its "id" column becomes the index
df_tweets_sc = pd.read_csv("./SA_data/Tweets Sentiment Classification/train.csv", index_col="id")

# test-dataset has no labels 
# (the two disabled lines below would append it; note that they refer to
#  df_tweets_sc_tr, a name that is not defined in the visible cells)
#df_tweets_sc_ts = pd.read_csv("./SA_data/Tweets Sentiment Classification/test.csv", index_col="id")
#df_tweets_sc = pd.concat([df_tweets_sc_tr, df_tweets_sc_ts])
df_tweets_sc.head()

In [39]:
# remove rows with missing values, then show how many rows remain
df_tweets_sc = df_tweets_sc.dropna()
len(df_tweets_sc)

In [40]:
# cast the labels to int and recode 1 -> 2, because in the other datasets
# 2 stands for positive sentiment
df_tweets_sc["label"] = df_tweets_sc["label"].astype(int).replace({1:2})
df_tweets_sc.head()

In [41]:
# class distribution after recoding
df_tweets_sc["label"].value_counts()

In [42]:
# mask URLs and mentions in every tweet (values are converted to str first)
df_tweets_sc["tweet"] = df_tweets_sc["tweet"].astype(str).map(format_tweet)

In [43]:
# inspect the formatted tweets
df_tweets_sc.head()

In [44]:
#df_tweets_sc.to_csv("./SA_data/Tweets Sentiment Classification/tweets_sentiment_classification_preprocessed.csv", index=False)

## Twitter US Airline Sentiment

In [45]:
# tweets directed at US airlines (Kaggle)
# https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment
import pandas as pd

# only the tweet text and its sentiment annotation are needed
df_airline = pd.read_csv(
    "./SA_data/Twitter US Airline Sentiment.csv"
)[["text", "airline_sentiment"]]
df_airline.head()

In [46]:
# row count before and after removing incomplete and duplicate rows
print(len(df_airline))
df_airline = df_airline.dropna().drop_duplicates()
print(len(df_airline))

In [47]:
# distribution of the original sentiment annotations
df_airline["airline_sentiment"].value_counts()

In [48]:
# convert sentiment column to numeric structure (0 = negative, 1 = neutral, 2 = positive)
sentiment_to_label = {"negative" : 0, "neutral" : 1, "positive" : 2}
# The result is assigned back to the column. Calling
# df["col"].replace(..., inplace=True) works on a temporary Series; with
# pandas Copy-on-Write the DataFrame is not modified at all, the labels stay
# strings and the per-class sampling afterwards finds no rows.
# An annotation that is missing from the mapping becomes NaN.
df_airline["airline_sentiment"] = df_airline["airline_sentiment"].map(sentiment_to_label)
df_airline = df_airline.rename(columns={"airline_sentiment" : "labels"})
df_airline.head()

In [49]:
# creating a sample with (almost) equal sentiment distribution, 5002 tweets in total
airline_sample_sizes = {0: 1667, 1: 1668, 2: 1667}
df_airline_negative, df_airline_neutral, df_airline_positive = (
    df_airline[df_airline["labels"] == label].sample(size, random_state=42)
    for label, size in airline_sample_sizes.items()
)

# stack the classes, shuffle the rows (.sample with frac=1) and keep only text + label
df_airline_sample = (
    pd.concat([df_airline_negative, df_airline_neutral, df_airline_positive], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index()
)[["text", "labels"]]
df_airline_sample.head()

In [50]:
# size of the balanced airline sample
len(df_airline_sample)

## Sentiment and Emotions labeled tweets - Dell

In [51]:
# Sentiment and Emotions labelled tweets - Dell (Kaggle)
# https://www.kaggle.com/datasets/ankitkumar2635/sentiment-and-emotions-of-tweets
import pandas as pd

# load, then remove rows with missing values and exact duplicate rows
df_dell = (
    pd.read_csv("./SA_data/sentiment-emotion-labelled_Dell_tweets.csv")
    .dropna()
    .drop_duplicates()
)
df_dell.head()

In [52]:
# number of rows in the cleaned Dell dataset
len(df_dell)

In [53]:
# keep only the timestamp, the tweet text and the sentiment annotation
df_dell = df_dell[["Datetime", "Text", "sentiment"]]
df_dell.head()

In [54]:
# distribution of the sentiment annotations
df_dell["sentiment"].value_counts()

In [55]:
# display the "Datetime" column
df_dell["Datetime"]

In [56]:
import pandas as pd
import re
from collections import Counter

# counting the most frequent mentions
# every whitespace-free run that starts with "@" counts as a mention
mention_pattern = re.compile(r'@\S+')
df_dell["mentions"] = df_dell["Text"].apply(
    lambda x: mention_pattern.findall(str(x)) if pd.notna(x) else []
)

# flatten the per-tweet lists into one list of mentions
all_mentions = []
for mentions in df_dell["mentions"]:
    all_mentions.extend(mentions)

mention_counts = pd.Series(Counter(all_mentions)).sort_values(ascending=False)
print(mention_counts[:20])

In [57]:
# replacing urls and user mentions while keeping company mentions

# Account handles (without "@") that are kept as plain words instead of being
# masked. "Dell," covers the handle followed directly by a comma.
mentions_to_keep = ["Dell", "Delltech", "Dellcares", "Dell,", "HP", "Microsoft", "Apple", "Logitech", "Google", "Lenovo", "Tesla", "Intel", "Alienware", "Emc"]

def format_tweet_new_keep(tweet):
    """Mask URLs and user mentions while keeping selected company mentions.

    A token of the form ``@handle`` whose handle matches an entry of
    ``mentions_to_keep`` (ignoring case) is replaced by that entry's
    spelling, e.g. ``@DellTech`` -> ``Delltech`` and ``@hp`` -> ``HP``.
    Afterwards URLs found by the module-level ``extractor`` become
    ``{{URL}}`` and every remaining ``@...`` run becomes ``{{MENTION}}``.
    Runs of whitespace are collapsed to single spaces.

    Args:
        tweet: The tweet text.

    Returns:
        The formatted text, or "" if ``tweet`` is not a string.
    """
    if not isinstance(tweet, str):  
        return ""

    # Case-insensitive lookup: lower-cased handle -> spelling from the list.
    # Comparing against token.capitalize() could never match "HP", because
    # "HP".capitalize() is "Hp".
    kept_spelling = {name.lower(): name for name in mentions_to_keep}

    tokens = []
    for token in tweet.split():
        handle = token[1:].lower()
        if token.startswith("@") and handle in kept_spelling:
            tokens.append(kept_spelling[handle])
        else:
            tokens.append(token)

    tweet = " ".join(tokens)

    # Longest URL first, so a URL that is a prefix of another one cannot
    # break the longer URL before it is replaced.
    for url in sorted(set(extractor.find_urls(tweet)), key=len, reverse=True):
        tweet = tweet.replace(url, "{{URL}}")

    tweet = re.sub(r'@\S+', "{{MENTION}}", tweet)

    return tweet

# formatted copy of every tweet (values are converted to str first)
df_dell["text"] = df_dell["Text"].astype(str).map(format_tweet_new_keep)
df_dell.head()

In [58]:
# whitespace tokenisation of the formatted text, then reduce to the columns that are exported
df_dell["tokens"] = df_dell["text"].map(str.split)
df_dell = df_dell[["Datetime", "text", "tokens", "sentiment"]]
df_dell.head()

In [59]:
# write the processed Dell tweets as a JSON array with one object per row
df_dell.to_json("./SA_data/sentiment_dell_processed.json", orient="records")

## Creating a corpus from multiple Sentiment-datasets

In [60]:
# Big Tech tweets, Brand Sentiment Analysis data and US Airline Sentiment combined
# into one corpus, shuffled (.sample with frac=1) and reduced to text + label
df_sa_corpus = (
    pd.concat(
        [df_tweets_bigtech_sample, df_brd_sa_sample, df_airline_sample],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index()
)[["text", "labels"]]
df_sa_corpus.head()

In [61]:
# row count before and after removing incomplete and duplicate rows
print(len(df_sa_corpus))
df_sa_corpus = df_sa_corpus.dropna().drop_duplicates()
print(len(df_sa_corpus))

In [62]:
# class distribution of the combined sentiment corpus
df_sa_corpus["labels"].value_counts()

In [63]:
# replacing urls and user mentions 
def format_tweet_new(tweet):
    """Replace URLs and user mentions in a tweet with placeholders.

    URLs found by the module-level ``extractor`` become ``{{URL}}``. After
    that every whitespace-free run that contains an "@" at any position
    (mentions, but also e-mail addresses) becomes ``{{MENTION}}``.

    Args:
        tweet: The tweet text as a string.

    Returns:
        The tweet text with placeholders.
    """
    # mask web urls
    # Longest URL first: if one extracted URL is a prefix of another
    # ("x.com" and "x.com/abc"), replacing the short one first would leave
    # "{{URL}}/abc" behind and the long URL would never be found again.
    for url in sorted(set(extractor.find_urls(tweet)), key=len, reverse=True):
        tweet = tweet.replace(url, "{{URL}}")
    # format twitter account
    # recognizes tokens including @ at any place
    tweet = re.sub(r'\S*@\S*', "{{MENTION}}", tweet)    
    return tweet

# mask URLs and mentions in every text of the corpus (values are converted to str first)
df_sa_corpus["text"] = df_sa_corpus["text"].astype(str).map(format_tweet_new)
df_sa_corpus.head()

In [64]:
# write the sentiment corpus as a JSON array with one object per row
df_sa_corpus.to_json("./SA_data/sentiment_corpus.json", orient="records")

# Data Analysis and Processing for NER

## WNUT

In [65]:
# dataset WNUT 2016
# data from https://autonlp.ai/datasets/wnut-2016
import pandas as pd

# Read options shared by both splits (one "token<TAB>label" pair per line):
# - skip_blank_lines=False keeps the blank lines that separate sentences
# - quoting=3 (csv.QUOTE_NONE) reads a quote character as an ordinary token
#   instead of as the start of a quoted field spanning several lines
# - keep_default_na=False with na_values=[""] turns only empty fields into
#   NaN, so tokens such as "NA", "null" or "nan" stay words instead of being
#   mistaken for sentence separators and dropped
wnut_2016_read_options = dict(
    sep="\t", header=None, names=["words", "labels"], skip_blank_lines=False,
    quoting=3, keep_default_na=False, na_values=[""],
)
wnut_2016_train = pd.read_csv("./NER_data/WNUT 2016 train.txt", **wnut_2016_read_options)
wnut_2016_test = pd.read_csv("./NER_data/WNUT 2016 test.txt", **wnut_2016_read_options)

In [66]:
# stack train and test split; the rows stay in file order and the original
# row indices are kept (no ignore_index)
wnut_2016 = pd.concat([wnut_2016_train, wnut_2016_test], axis=0)
wnut_2016.head()

In [67]:
# every row without a word marks a sentence boundary; the running count of
# these rows numbers the sentences
wnut_2016["sentence_id"] = wnut_2016["words"].isna().cumsum()
wnut_2016.head(10)

In [68]:
# remove every row with a missing value, which includes the separator rows
wnut_2016.dropna(inplace=True)

In [69]:
# reworked version to fix list mismatches between columns:
# words and labels are collected per sentence in a single aggregation
grouped_data = (
    wnut_2016.groupby("sentence_id")
    .agg({"words": list, "labels": list})
    .reset_index()
    .rename(columns={"words": "tokens", "labels": "label_list"})
)

# .copy() makes this an independent frame; adding the helper columns to a
# plain column selection triggers pandas' SettingWithCopyWarning
wnut_2016_final = grouped_data[["tokens", "label_list"]].copy()

# checking if there are mismatches in list length
wnut_2016_final["tokens_len"] = wnut_2016_final["tokens"].apply(len)
wnut_2016_final["labels_len"] = wnut_2016_final["label_list"].apply(len)
mismatched = wnut_2016_final[wnut_2016_final["tokens_len"] != wnut_2016_final["labels_len"]]

print(f"Rows with mismatched lengths: {len(mismatched)}")

In [70]:
# number of sentences in WNUT 2016
len(wnut_2016_final)

In [71]:
# loading WNUT 2017 dataset
# https://github.com/juand-r/entity-recognition-datasets/blob/master/data/WNUT17/CONLL-format/data/train/wnut17train.conll

# function for creating a clean df from .conll-file
def read_conll(filename):
    """Read a tab-separated .conll file into a token-level DataFrame.

    The file is read without a header into the columns "words", "labels",
    "chunk" and "ne". Nothing is converted to NaN by default and quote
    characters are read literally. Rows whose word is the empty string
    mark sentence boundaries: their running count becomes "sentence_id"
    and they are removed from the result.

    Args:
        filename: Path of the .conll file.

    Returns:
        DataFrame with one row per token and an added "sentence_id" column.
    """
    column_names = ["words", "labels", "chunk", "ne"]
    frame = pd.read_csv(
        filename,
        sep="\t",
        header=None,
        names=column_names,
        keep_default_na=False,
        quoting=3,
        skip_blank_lines=False,
    )
    is_separator = frame["words"] == ''
    frame["sentence_id"] = is_separator.cumsum()
    return frame[~is_separator]

# read both WNUT 2017 splits; read_conll numbers the sentences per file,
# so sentence_id starts at 0 in each of the two frames
wnut_2017_train = read_conll("./NER_data/wnut17train.conll")
wnut_2017_test = read_conll("./NER_data/wnut17test.conll")

In [72]:
# read_conll numbers the sentences per file, so sentence_id starts at 0 in
# both splits. Shift the test ids behind the train ids before stacking;
# otherwise grouping by sentence_id would merge train sentence k with test
# sentence k into one sentence.
test_id_offset = wnut_2017_train["sentence_id"].max() + 1
wnut_2017 = pd.concat(
    [
        wnut_2017_train,
        # .assign returns a new frame, wnut_2017_test itself stays untouched
        wnut_2017_test.assign(sentence_id=wnut_2017_test["sentence_id"] + test_id_offset),
    ],
    axis=0,
)
wnut_2017 = wnut_2017[["words", "labels", "sentence_id"]]
wnut_2017.head()

In [73]:
# reworked version to fix list mismatches between columns:
# words and labels are collected per sentence in a single aggregation
grouped_data = (
    wnut_2017.groupby("sentence_id")
    .agg({"words": list, "labels": list})
    .reset_index()
    .rename(columns={"words": "tokens", "labels": "label_list"})
)

# .copy() makes this an independent frame; adding the helper columns to a
# plain column selection triggers pandas' SettingWithCopyWarning
wnut_2017_final = grouped_data[["tokens", "label_list"]].copy()

# checking if there are mismatches in list length
wnut_2017_final["tokens_len"] = wnut_2017_final["tokens"].apply(len)
wnut_2017_final["labels_len"] = wnut_2017_final["label_list"].apply(len)
mismatched = wnut_2017_final[wnut_2017_final["tokens_len"] != wnut_2017_final["labels_len"]]
print(f"Rows with mismatched lengths: {len(mismatched)}")

In [74]:
# number of sentences in WNUT 2017
len(wnut_2017_final)

In [75]:
# frequency of every label over all tokens of WNUT 2017
wnut_2017_final["label_list"].explode().value_counts()

In [76]:
# stack the 2016 and 2017 sentences; the row indices of both frames are kept
# (no ignore_index), so index values repeat in the result
wnut_complete = pd.concat([wnut_2016_final, wnut_2017_final])
len(wnut_complete)

In [77]:
# ratio of tokens labeled as "Outside"
all_wnut_labels = wnut_complete["label_list"].explode().tolist()
outside_wnut = all_wnut_labels.count("O")
# the ratio is a fraction between 0 and 1; ".2%" prints it as the percentage
# that the message announces
print(f"Percentage of labels that are O: {outside_wnut/len(all_wnut_labels):.2%}")

In [78]:
# frequency of every label over all WNUT tokens, before any relabeling
wnut_complete["label_list"].explode().value_counts()

In [79]:
# changing the "company" tag to "corporation" and the "geo-loc" tag to
# "location" to match TweetNER
label_renames = (
    ("B-company", "B-corporation"),
    ("I-company", "I-corporation"),
    ("B-geo-loc", "B-location"),
    ("I-geo-loc", "I-location"),
)

def _rename_wnut_labels(labels):
    """Apply all tag renames to one label list; non-string items pass through."""
    renamed = []
    for item in labels:
        if isinstance(item, str):
            for old_tag, new_tag in label_renames:
                item = item.replace(old_tag, new_tag)
        renamed.append(item)
    return renamed

wnut_complete["label_list"] = wnut_complete["label_list"].apply(_rename_wnut_labels)

In [80]:
# label frequencies after renaming the company / geo-loc tags
wnut_complete["label_list"].explode().value_counts()

In [81]:
# number of sentences in the combined WNUT data
len(wnut_complete)

In [82]:
# inspect the combined WNUT data
wnut_complete.head()

In [83]:
def convert_unwanted_entities(label_list):
    """Replace the labels of unwanted entity types with "O".

    Args:
        label_list: List of labels of one sentence.

    Returns:
        A new list of the same length in which every B-/I- label of an
        unwanted entity type is "O" and all other labels are unchanged.
    """
    unwanted_types = (
        "musicartist", "movie", "sportsteam", "facility", "group",
        "creative-work", "creative_work", "tvshow", "other",
    )
    unwanted_entities = {
        f"{prefix}-{entity_type}"
        for prefix in ("B", "I")
        for entity_type in unwanted_types
    }
    converted = []
    for label in label_list:
        converted.append("O" if label in unwanted_entities else label)
    return converted

# relabel the tokens of all unwanted entity types as "O"
wnut_complete["label_list"] = wnut_complete["label_list"].apply(convert_unwanted_entities)

In [84]:
# tag set with only the wanted entities; the position in this list is the tag id
label_list = [
    "B-corporation",
    "B-event",
    "B-location",
    "B-person",
    "B-product",
    "I-corporation",
    "I-event",
    "I-location",
    "I-person",
    "I-product",
    "O",
]

# id -> label, label -> id and again id -> label
entity_dict = dict(enumerate(label_list))
label_to_id = {label: tag_id for tag_id, label in entity_dict.items()}
id_to_label = dict(entity_dict)

In [85]:
# adding a tag column: every label is translated to its self-defined tag id
# (a label that is missing from label_to_id raises a KeyError)
wnut_complete["tags"] = wnut_complete["label_list"].map(
    lambda labels: [label_to_id[label] for label in labels]
)
wnut_complete = wnut_complete.rename(columns={"label_list" : "labels"})
wnut_complete.head()

In [86]:
# saving processed data as csv
# NOTE(review): the list-valued columns are presumably written as their text
# representation and need to be parsed again after loading — confirm that
# downstream readers handle this
wnut_complete.to_csv("./NER_data/wnut_complete_processed.csv", index=False)

## TweetNER7

In [87]:
# loading the TweetNER7 dataset from huggingface 
# https://huggingface.co/datasets/tner/tweetner7
from datasets import load_dataset

ds = load_dataset("tner/tweetner7")

In [88]:
# convert the splits that are used to DataFrames and drop incomplete rows
tweetner7_train = ds["train_all"].to_pandas().dropna()
tweetner7_test21 = ds["test_2021"].to_pandas().dropna()
tweetner7_test20 = ds["test_2020"].to_pandas().dropna()
tweetner7_val20 = ds["validation_2020"].to_pandas().dropna()
tweetner7_val21 = ds["validation_2021"].to_pandas().dropna()

In [89]:
import pandas as pd

# all splits stacked into one frame, reduced to the tokens and their tag ids
tweetner7_all = pd.concat(
    [tweetner7_train, tweetner7_test21, tweetner7_test20, tweetner7_val20, tweetner7_val21]
)[["tokens", "tags"]]
tweetner7_all.head()

In [None]:
# adding a label column from the tags
# TweetNER7 tag set; the position in this list is the tag id
label_list_tweetner = [
    "B-corporation",
    "B-creative_work",
    "B-event",
    "B-group",
    "B-location",
    "B-person",
    "B-product",
    "I-corporation",
    "I-creative_work",
    "I-event",
    "I-group",
    "I-location",
    "I-person",
    "I-product",
    "O",
]

# id -> label, label -> id and again id -> label
entity_dict_tweetner = dict(enumerate(label_list_tweetner))
label_to_id_tweetner = {label: tag_id for tag_id, label in entity_dict_tweetner.items()}
id_to_label_tweetner = dict(entity_dict_tweetner)

# translate the tag ids of every sentence into their label strings
tweetner7_all["labels"] = tweetner7_all["tags"].apply(
    lambda tag_ids: [id_to_label_tweetner[tag_id] for tag_id in tag_ids]
)
tweetner7_all.head()

In [91]:
# number of sentences in the combined TweetNER7 data
len(tweetner7_all)

## Synthetic data

In [92]:
# Candidate values for the slots of the synthetic sentence templates.
# One value per slot is drawn at random for every generated sentence.

# values for the {person} slot: full names and a few single surnames
persons = [
        "John Smith", "Emily Chen", "Michael Johnson", "Sarah Williams",
        "David Lee", "Maria Rodriguez", "James Brown", "Davis",
        "Robert Kim", "Jennifer Lopez", "Thomas Wilson", "Jessica Taylor", "Cook",
        "Carlos Vega", "Aisha Patel", "Daniel Park", "Olivia Nguyen", "Musk", "Smith"
    ]

# values for the {corporation} slot: one-word and two-word company names
corporations = [
        "Google", "Microsoft", "Apple", "Amazon", "Meta", "BlackBerry",
        "IBM", "Tesla", "Netflix", "Walmart", "JP Morgan",
        "Acme Corp", "TechSolutions", "Global Systems", "DataWorks",
        "Quantum Industries", "NexGen", "FutureSpace", "EcoSystems", "Nokia", "Motorola"
    ]

# values for the {product} slot: names of one to three words
products = [
        "iPhone 13", "Galaxy S22", "Surface Pro", "PlayStation 5", "Xbox Series X",
        "MacBook Air", "Echo Dot", "AirPods Pro", "Tesla Model 3", "iPad Mini",
        "Dyson V11", "Fitbit Charge", "Nintendo Switch", "Kindle Paperwhite",
        "Roomba i7", "GoPro Hero", "Bose QuietComfort", "Instant Pot", "Echo", "AirTag", "ThinkPad"
    ]

# values for the {event} slot: names of one to four words, some of them with
# punctuation inside ("Google I/O", "AWS re:Invent")
events = [
        "CES 2023", "Web Summit", "SXSW", "TechCrunch Disrupt", "E3 Expo",
        "Google I/O", "WWDC", "Consumer Electronics Show", "Mobile World Congress",
        "Black Hat Conference", "DEF CON", "AWS re:Invent", "Game Developers Conference",
        "Dreamforce", "Comic-Con", "Coachella", "New York Fashion Week", "GamesCom", "AI-Con"
    ]

# values for the {location} slot: one-word and two-word city names
locations = [
        "New York", "San Francisco", "London", "Tokyo", "Berlin",
        "Paris", "Sydney", "Toronto", "Chicago", "Seattle",
        "Los Angeles", "Miami", "Singapore", "Hong Kong", "Milwaukee",
        "Dubai", "Barcelona", "Austin", "Stockholm", "Seoul", "Vienna"
    ]

# Sentence templates for the synthetic NER data. Each template is filled with
# str.format using the slots {person}, {corporation}, {product}, {event} and
# {location}; a template may use any subset of the slots and may repeat one.
templates = [
        "{person} from {corporation} announced that {product} will be showcased at {event}.",
        "At {event}, {person} demonstrated how {product} is revolutionizing {corporation}'s approach in {location}.",
        "{corporation} has selected {location} as the venue for {event}, where {person} will launch {product}.",
        "The new {product} developed by {corporation} will be presented by {person} during {event} in {location}.",
        "{person} confirmed that {corporation} will be expanding its {product} line.",
        "According to {person}, {corporation}'s latest {product} has been well-received at {event} in {location}.",
        "Reviews from {event} suggest that {person} made a strong case for {corporation}'s new {product} in the {location} market.",
        "{corporation} is planning to open a {product} store in {location}, announced {person} at {event}.",
        "The collaboration between {corporation} and {person} resulted in {product}, which will finally debut at {event} in {location}.",
        "Attendees at {event} in {location} were very impressed when {person} revealed {corporation}'s innovative {product}! The clapping didnt stop",
        "{person} traveled to {location} to promote {product} at {event} on behalf of {corporation}.",
        "The {product} team from {corporation}, led by {person}, won first prize at {event} in {location}. Let's go!",
        "Consumers in {location} can now purchase {product} after {corporation}'s expansion announcement by {person} at {event}.",
        "{product} is the must-have gadget of the year.",
        "I hate the new {product}, the older ones are much better.",
        "Less than 2 hours until they announce the details on the {product} giveaway!",
        "All eyes are on {corporation} after the announcement of their new {product}.",
        "It's time for {person} to leave {corporation}. What is he even doing.",
        "{corporation} has been selected as the top AI startup in {location}, wow!",
        "I am having so many issues with the {product}. {corporation} needs to fix this!",
        "Can not wait for {product} also. They should sell them down at {event}.",
        "Whats happening at {corporation}? {person} really needs to step up.",
        "{corporation} is giving free {product} to open source coders who are attending this meet-up.",
        "{person} was right! The {product} from {corporation} is revolutionary!",
        "Less than 2 hours until we announce the details on the {product} giveaway!",
        "{corporation} CEO {person}: Newest {product} rollout will begin next month!",
        "{corporation} has a temporary Retail Store in {location} for the {product} release today. Opens at 5pm.",
        "{person} said that {corporation} is working on something big.",
        "It's time for {person} to leave {corporation}. What is he even doing?",
        "{corporation} just keeps raising the bar with every {product} they launch. Crazy!",
        "{person} just hinted at new features in {corporation}'s upcoming {product}. I am hyped!",
        "Rumors say {corporation} is releasing {product} soon.",
        "Just watched the {corporation} keynote. {product} looks impressive.",
        "Is it just me, or does {corporation}'s {product} feel rushed and unfinished?",
        "The {product} is making me rethink my loyalty to {corporation}. Its not good.",
        "Who else is gonna get the new {product} next month?",
        "{corporation}'s industry party tonight was great for the launch of {product}.",
        "Attending {event} this week! Can't wait to see what {corporation} unveils about their upcoming {product}. Anyone else going?",
        "Is anyone else experiencing issues with the new {product} update? {corporation}'s support hasn't been helpful. #TechSupport",
        "Just switched from {product} to {product} and the difference is incredible. {corporation} really cooked with this one!",
        "Hot take: {corporation}'s approach to development is outdated. They need to focus more on usability if they want to compete with {corporation}",
        "The new update to {product} completely revolutionized my workflow. Thanks {corporation} for fixing the issue! #ProductivityTech",
        "Arrived at {event} in {location}! The {corporation} booth is already packed with people trying the new {product}. #TechConference",
        "Just spotted {person} from {corporation} at a restaurant in {location} right after {event}. Tried to ask about {product} rumors but no comment!",
        "I snuck into the VIP section at {event} in {location} and got a selfie with {person}! Check my Insta! #Winning",
        "PSA: Free {product} giveaways at {corporation}'s booth at {event} in {location}! Run don't walk, peeps! I got the last blue one",
        "This {product} launch line at {corporation}'s store in {location} is ridiculous. Been here 3hrs and moved like 10 feet, But I NEED it today! #TechAddict",
        "Shoutout to the nice {corporation} rep at {event} in {location} who gave me an extra {product} for my kid! Some tech people are actually decent humans",
        "My {product} just updated itself and now I can't find ANYTHING. Hey {corporation}, stop 'fixing' stuff that ain't broken! {person} needs to chill with these changes",
        "Omg {person} just liked my tweet criticizing {corporation}'s {product}! Screenshot this before they realize and unlike!",
        "The way {person} casually uses {product} in interviews makes it seem so cool, but when I bought it from {corporation} it's just... meh. Marketing wins again",
        "new CEO {person} has really not done much yet at {corporation}, hasnt he?"
    ]

In [None]:
import random
from string import Formatter


def _tokenize_text(text):
    """Split text into word and punctuation tokens.

    Letters, digits, hyphens and apostrophes are collected into one token,
    whitespace only separates tokens and every other character becomes a
    single-character token of its own.
    """
    tokens = []
    current_word = ""
    for char in text:
        if char.isalnum() or char in "-'":
            current_word += char
            continue
        if current_word:
            tokens.append(current_word)
            current_word = ""
        if not char.isspace():
            tokens.append(char)
    if current_word:
        tokens.append(current_word)
    return tokens


def _tokenize_and_label(template, slot_values):
    """Tokenize a filled template and derive the BIO labels from its slots."""
    tokens = []
    labels = []
    for literal_text, slot, _format_spec, _conversion in Formatter().parse(template):
        # fixed template text never belongs to an entity
        literal_tokens = _tokenize_text(literal_text)
        tokens.extend(literal_tokens)
        labels.extend(["O"] * len(literal_tokens))
        if slot is None:
            continue
        # the slot name is the entity type: first token B-, all others I-
        entity_tokens = _tokenize_text(str(slot_values[slot]))
        if entity_tokens:
            tokens.extend(entity_tokens)
            labels.append(f"B-{slot}")
            labels.extend([f"I-{slot}"] * (len(entity_tokens) - 1))
    return tokens, labels


def generate_ner_dataset(num_examples, output_file="new_synthetic_ner_dataset.csv", templates=templates,
                         persons=persons, corporations=corporations, products=products, events=events,
                         locations=locations, seed=None):
    """Generate synthetic, BIO-labelled sentences from templates.

    For every example a template and one value per slot are drawn at
    random and the template is filled with them. The labels are derived
    from the slots themselves: all tokens of an inserted value get
    B-/I-<slot name>, all tokens of the fixed template text get "O".
    This labels multi-word values of any length (e.g. "JP Morgan",
    "New York Fashion Week"), values with punctuation ("Google I/O") and
    values followed by "'s" consistently, which a lookup of the tokens in
    the value lists cannot do.

    Args:
        num_examples: Number of sentences to generate.
        output_file: Not used; nothing is written to disk. Kept so that
            existing calls keep working.
        templates: Format strings with the slots {person}, {corporation},
            {product}, {event} and {location}.
        persons, corporations, products, events, locations: Candidate
            values for the slot of the same kind.
        seed: Optional seed for a private random generator. With None
            (default) the global ``random`` state is used.

    Returns:
        DataFrame with the columns "tokens" (list of str), "labels" (list
        of str with the same length) and "sentence" (the filled template).

    Raises:
        KeyError: If a template uses a slot other than the five above.
    """
    rng = random.Random(seed) if seed is not None else random
    records = []

    for _ in range(num_examples):
        template = rng.choice(templates)
        # same drawing order as the slots are listed here
        slot_values = {
            "person": rng.choice(persons),
            "corporation": rng.choice(corporations),
            "product": rng.choice(products),
            "event": rng.choice(events),
            "location": rng.choice(locations),
        }
        sentence = template.format(**slot_values)
        tokens, labels = _tokenize_and_label(template, slot_values)
        records.append({"tokens": tokens, "labels": labels, "sentence": sentence})

    # fixed columns, so that an empty result still has the expected schema
    return pd.DataFrame(records, columns=["tokens", "labels", "sentence"])

In [94]:
# 5000 synthetic sentences; the raw sentence text is not needed afterwards
df_syn = generate_ner_dataset(5000)[["tokens", "labels"]]
df_syn.head()

In [95]:
# adding tag-column from label-column
def convert_to_numeric(label_list):
    """Translate a list of labels into tag ids.

    Every label is looked up in the module-level ``label_to_id`` mapping;
    a label that is missing there is encoded as -100.
    """
    tag_ids = []
    for label in label_list:
        tag_ids.append(label_to_id.get(label, -100))
    return tag_ids

# tag ids of the synthetic sentences; labels that are missing from
# label_to_id are encoded as -100 by convert_to_numeric
df_syn["tags"] = df_syn["labels"].apply(convert_to_numeric)
df_syn.head()

## Combining all NER-datasets

In [96]:
# WNUT, TweetNER7 and the synthetic sentences stacked into one corpus,
# shuffled (.sample with frac=1) and reduced to the three shared columns
df_ner_corpus = (
    pd.concat([wnut_complete, tweetner7_all, df_syn], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index()
)[["tokens", "labels", "tags"]]
df_ner_corpus.head()

In [97]:
# row count before and after removing incomplete rows
print(len(df_ner_corpus))
df_ner_corpus = df_ner_corpus.dropna()
print(len(df_ner_corpus))

In [98]:
# label frequencies over all tokens of the combined corpus
df_ner_corpus["labels"].explode().value_counts()

In [99]:
# the TweetNER7 tag set contains group and creative_work labels;
# relabel all unwanted entity types as "O" for the whole corpus
df_ner_corpus["labels"] = df_ner_corpus["labels"].apply(convert_unwanted_entities)

In [100]:
# label frequencies after removing the unwanted entity types
df_ner_corpus["labels"].explode().value_counts()

In [101]:
# share of tokens labeled "O" in the combined corpus, computed from the data.
# This replaces the literal 557394, which was presumably the "O" count copied
# from an earlier run's output and goes stale when the corpus changes —
# confirm that this is the intended quantity.
ner_labels_flat = df_ner_corpus["labels"].explode()
int((ner_labels_flat == "O").sum()) / len(ner_labels_flat)

In [102]:
# overwriting the tags column (TweetNER7 ids) with the self-defined tag ids,
# so that it fits the cleaned label column
df_ner_corpus["tags"] = df_ner_corpus["labels"].map(
    lambda labels: [label_to_id[label] for label in labels]
)
df_ner_corpus.head()

In [103]:
# replacing urls and user mentions in tokens-column
def format_token_list(token_list):
    """Replace URL and mention tokens of one sentence with placeholders.

    A token in which the module-level ``extractor`` finds a URL becomes
    "{{URL}}"; otherwise a token that contains an "@" becomes
    "{{MENTION}}"; every other token is kept unchanged.

    Returns:
        A new list with the same length as ``token_list``.
    """
    def mask_token(token):
        # mask web urls
        if extractor.find_urls(token):
            return "{{URL}}"
        # format twitter mentions
        if re.search(r'\S*@\S*', token):
            return "{{MENTION}}"
        return token

    return [mask_token(token) for token in token_list]

# mask URL and mention tokens in every sentence; the label and tag lists are
# not changed by this step
df_ner_corpus["tokens"] = df_ner_corpus["tokens"].apply(format_token_list)
df_ner_corpus.head()

In [104]:
# saving to json to preserve python data structures
# (one JSON object per sentence)
df_ner_corpus.to_json("./NER_data/NER_corpus.json", orient="records")