In [103]:
# Torch + HF Transformers
import torch
import transformers
from transformers import AutoTokenizer

# Data Handling
import pandas as pd
import datasets
from datasets import Dataset

# Text Cleaning
import spacy 

# OS Utils
import os
import shutil

# Data Pipeline

## Checking if GPU is enabled

In [2]:
# Shell escape: run the NVIDIA command-line tool and show its report as the cell output.
!nvidia-smi

Mon Mar  7 14:17:49 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 496.13       Driver Version: 496.13       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:25:00.0  On |                  N/A |
| 28%   30C    P8    10W / 120W |   3010MiB /  3072MiB |      8%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [99]:
# Report whether PyTorch can see a CUDA device, and if so which one.
# The device name is only queried when CUDA is available, because
# torch.cuda.get_device_name(0) raises on a machine without a usable GPU
# and would abort a "Restart & Run All" on CPU-only hosts.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available - running on CPU")

True
NVIDIA GeForce GTX 1060 3GB


## Load local Data

In [4]:

# Read the labelled tweets from the local CSV file into a DataFrame
# and preview the first rows as the cell output.
raw_csv_path = "../data/labeled_data.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [100]:
# Wrap the pandas DataFrame in a Hugging Face `Dataset`; the following cells
# call `.map`, `.rename_column`, `.remove_columns` and `.train_test_split` on it.
# (Original author's note: intended for later use with the Trainer API.)
dataset = Dataset.from_pandas(df)

## Preprocess Data

In [89]:
# Load the small English spaCy pipeline and append a rule-based entity ruler.
nlp = spacy.load("en_core_web_sm")

# overwrite_ents=True lets ruler matches replace entities that an earlier
# pipeline component already assigned to the same tokens. Entities on tokens
# that no rule matches are presumably left as they are; preprocess_tweet keys
# on the pattern "id" (token.ent_id_), which only the rules below provide.
ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True})

# General structure of a spaCy pattern:
#   {"label": LABEL_NAME, "pattern": [{attr: matcher}, ...], "id": ID_STRING}
# The regexes are raw strings so that `\(`, `\w` and `\s` reach the regex
# engine unchanged instead of relying on Python tolerating invalid string
# escapes (DeprecationWarning / SyntaxWarning on newer interpreters).
patterns = [
    {
        "label": "[URL]",
        "pattern": [{"TEXT": {"REGEX": r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'}}],
        "id": "url",
    },
    {
        "label": "[MENTION]",
        "pattern": [{"TEXT": {"REGEX": r"@[\w\-]+"}}],
        "id": "ment",
    },
    {
        "label": "[WHITESPACE]",
        "pattern": [{"TEXT": {"REGEX": r"\s+"}}],
        "id": "ws",
    },
]

ruler.add_patterns(patterns)  # register the patterns with the ruler
print("ENTITY IDS = ",ruler.ent_ids)

ENTITY IDS =  ('url', 'ment', 'ws')


In [90]:
def preprocess_tweet(text_string):
    """Normalise a raw tweet into a space-separated string of lemmas and placeholders.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    token is handled according to the entity id assigned to it:

    * empty entity id (``""``): the token's lemma is kept;
    * entity id ``"ws"``: the token is dropped;
    * any other entity id: the token is replaced by its entity label
      (``token.ent_type_``), e.g. ``[MENTION]`` or ``[URL]``.

    Args:
        text_string: Raw tweet text.

    Returns:
        The kept pieces joined by single spaces. The result has no leading
        space; if no piece is kept, the empty string is returned.
    """
    pieces = []
    for token in nlp(text_string):
        entity_id = token.ent_id_
        if entity_id == "":
            # Ordinary token: keep its lemma.
            pieces.append(token.lemma_)
        elif entity_id != "ws":
            # Rule-matched token that is not whitespace: keep only the label.
            pieces.append(token.ent_type_)
        # Tokens with entity id "ws" are skipped on purpose.
    # Join once at the end instead of growing a string inside the loop.
    return " ".join(pieces)

In [94]:
# Smoke test for preprocess_tweet on a string containing a long whitespace run,
# a line break, an @-mention and a URL.
test_str = """this is a beautiful teststring                               hey @yourlocaltwitteruser 
look at all this whitespace, i hate it, can you please google how to remove it? Use https://www.google.de """

processed_str = preprocess_tweet(test_str)

print("TEST_STR:\n",test_str)
print("\n")
print("PROCESSED:\n",processed_str)

# Make the check executable instead of relying on reading the printed output:
# the mention and the URL must be replaced by their placeholder labels.
assert "[MENTION]" in processed_str, "mention was not replaced by its placeholder"
assert "[URL]" in processed_str, "URL was not replaced by its placeholder"
assert "@yourlocaltwitteruser" not in processed_str
assert "https://" not in processed_str

TEST_STR:
 this is a beautiful teststring                               hey @yourlocaltwitteruser 
look at all this whitespace, i hate it, can you please google how to remove it? Use https://www.google.de 


PROCESSED:
  this be a beautiful teststring hey [MENTION] look at all this whitespace , I hate it , can you please google how to remove it ? use [URL]


In [None]:
# Maximum number of tokens per tweet (original author's note: tweets are
# rarely longer than 128 tokens).
MAX_SEQ_LEN = 128

# Tokenizer belonging to the uncased DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Vocabulary ids of the tokenizer's padding and unknown-word tokens.
PAD_INDEX = tokenizer.pad_token_id
UNK_INDEX = tokenizer.unk_token_id


def preprocess_function(data):
    """Clean and tokenize one batch of tweets.

    Args:
        data: Mapping with a ``"tweet"`` entry holding an iterable of raw
            tweet strings (the cell below passes batches via
            ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the cleaned
        tweets, padded and truncated to ``MAX_SEQ_LEN`` tokens.
    """
    processed_tweets = [preprocess_tweet(tweet) for tweet in data["tweet"]]
    # Use the shared MAX_SEQ_LEN constant rather than a second hard-coded 128,
    # so changing the sequence length in one place actually takes effect.
    return tokenizer(
        processed_tweets,
        padding="max_length",
        max_length=MAX_SEQ_LEN,
        truncation=True,
    )

In [None]:
# Columns dropped after tokenization.
unused_columns = ["hate_speech","offensive_language","neither","count","Unnamed: 0"]

# 1. clean + tokenize every tweet in batches,
# 2. rename the target column "class" to "labels" (the name the original
#    author notes the Hugging Face Trainer expects),
# 3. drop the columns listed above.
tokenized_dataset = (
    dataset
    .map(preprocess_function, batched=True)
    .rename_column("class", "labels")
    .remove_columns(unused_columns)
)


100%|██████████| 25/25 [02:58<00:00,  7.13s/ba]


In [101]:
# Bare expression: the notebook displays the repr of `tokenized_dataset`.
# It requires the tokenization cell above to have been run in the current kernel.
tokenized_dataset

NameError: name 'tokenized_dataset' is not defined

## Train Test Splitting

In [102]:
# Hold out 10% of the rows as a test set. The shuffle is seeded so that a
# fresh "Restart & Run All" produces the same train/test partition every time.
SPLIT_SEED = 42
train_test_ds = tokenized_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
print(train_test_ds)

NameError: name 'tokenized_dataset' is not defined

## Saving to local directory

In [96]:

# Target directory for the processed train/test splits.
DATA_PATH = r"../data/processed_data"

# Remove a previous export first: per the original author's note, saving into
# an existing directory fails with an OSError instead of overwriting it.
if os.path.isdir(DATA_PATH):
    print("File already exists ,overwriting file...")
    shutil.rmtree(DATA_PATH)

# Single save/report path for both cases, using DATA_PATH instead of a
# repeated literal so the location is defined in exactly one place.
train_test_ds.save_to_disk(DATA_PATH)
print("File saved at " + os.path.abspath(DATA_PATH))


False


TypeError: 'str' object is not callable

'c:\\Users\\MEGAPORT\\Documents\\Uni\\WiSe2021_22\\ProjektSeminar\\Project\\H8-Detect\\data\\processed_daa'