# Text Mining Project (5980UE)
# Hate-Speech Detection using HurtLex lexicon

University of Passau

Text Mining Project (5980UE)

Project Topic 3.16

by 

Salim Fares (fares01@ads.uni-passau.de)

Miran Mohammed Rashed (mohamm11@ads.uni-passau.de)


Supervised by 

Dr. Jelena Mitrović (jelena.mitrovic@uni-passau.de)


This file contains the data pre-processing functions.

In [None]:
import pandas as pd
import re
from nltk import word_tokenize
from nltk.stem.isri import ISRIStemmer
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string

In [None]:
import tashaphyne.arabic_const as arabconst

In [None]:
# Load the Arabic tweets (a 'tweet' column is read from this frame further down).
df = pd.read_csv('Arabic_data.csv')
# Load the Arabic lexicon (a 'lemma' column is read from this frame further down).
# NOTE(review): this path has no file extension, unlike the other inputs —
# confirm it matches the file name on disk.
New_Arabic_Lexicon = pd.read_csv('New_Arabic_Lexicon')

### For English


In [None]:
# Load the English tweets (a 'tweet' column is read from this frame further down).
# NOTE(review): the file name is spelled 'Englis_data.csv' — confirm this is
# the actual name on disk and not a typo for 'English_data.csv'.
df_En= pd.read_csv('Englis_data.csv')
# Load the English lexicon from a tab-separated file (a 'lemma' column is
# read from this frame further down).
New_English_lexicon = pd.read_csv('New_English_hurtlex.tsv', sep='\t')

In [None]:
def normalizeArabic(text):
    """Return *text* with diacritics removed and letter variants unified.

    Steps, in order:
      1. delete everything matched by tashaphyne's ``HARAKAT_PAT`` (Tashkeel);
      2. collapse every run of one repeated character (newlines excepted,
         because ``.`` does not match them) into a single occurrence;
      3. rewrite letter variants to one canonical letter each, using the
         substitution table below.
    """
    without_tashkeel = arabconst.HARAKAT_PAT.sub('', text)
    result = re.sub(r'(.)\1+', r'\1', without_tashkeel)

    # (pattern, replacement) pairs, applied top to bottom.
    letter_rules = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
    )
    for pattern, canonical in letter_rules:
        result = re.sub(pattern, canonical, result)
    return result

In [None]:
def remove_single_chars(text):
    """Return *text* without its space-separated tokens of length 0 or 1.

    The input is split on the single space character only, so tabs and
    newlines stay inside their tokens. Surviving tokens are re-joined with
    one space each.
    """
    kept = []
    for token in text.split(" "):
        if len(token) > 1:
            kept.append(token)
    return " ".join(kept)

In [None]:
def clean(text):
    """Clean one Arabic text (tweet or lexicon lemma) for lexicon matching.

    Steps, in order:
      1. remove URLs and @-mentions;
      2. normalise Arabic letters via ``normalizeArabic``;
      3. turn hashtag signs into spaces;
      4. delete Arabic and ASCII punctuation;
      5. remove digits and Latin letters;
      6. drop single-character tokens;
      7. drop stop words.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned text as a ``str`` with tokens separated by single spaces.

    NOTE(review): ``stop_words`` is read as a module-level name, but no
    definition of it is visible in this file — it must be defined before this
    function is called, otherwise the last step raises ``NameError``.
    """
    # URLs and mentions have to go first. normalizeArabic() collapses repeated
    # characters ("https://" becomes "htps:/") and the punctuation step
    # deletes "@", ":" and "/", so running this later could never match.
    text = re.sub(r"(?:\@|https?\://)\S+", " ", text)
    text = normalizeArabic(text)
    # Remove Hashtag Signs (before punctuation stripping, which would
    # otherwise delete "#" and glue adjacent hashtags together).
    text = re.sub(r"#", " ", text)
    # Remove Punctuations
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations
    text = text.translate(str.maketrans('', '', punctuations_list))
    # Remove Numbers
    text = re.sub(r"\d+", " ", text)
    # Remove English Characters
    text = re.sub(r"[A-Za-z]+", " ", text)
    # Remove Single Characters
    text = remove_single_chars(text)
    # Remove Stop Words
    text = " ".join([word for word in word_tokenize(text) if word not in stop_words])
    return str(text)
    

## Clean function for English

In [None]:
# Bind the English stop-word list to the name `stopwords`, which the English
# clean() reads. The corpus reader is imported under a private alias so that
# re-running this cell still works: previously the reader itself was
# overwritten by the list, and a second run failed because a list has no
# `.words`.
from nltk.corpus import stopwords as _stopwords_corpus
stopwords = list(_stopwords_corpus.words('english'))
# Stemmer instance used by stem().
st = ISRIStemmer()

In [None]:
def clean(text):
    """Clean one English text (tweet or lexicon lemma) for lexicon matching.

    Steps, in order:
      1. remove URLs and @-mentions;
      2. turn hashtag signs into spaces;
      3. delete Arabic and ASCII punctuation;
      4. remove emoji in the listed Unicode ranges;
      5. drop tokens found in the module-level ``stopwords`` list.

    Digits are NOT removed by this function.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned text as a ``str`` with tokens separated by single spaces.

    NOTE: this definition reuses the name ``clean`` of the Arabic cleaner
    defined earlier in the file, so whichever of the two was executed last is
    the one that later cells call.
    """
    # Raw strings: the patterns contain "\(", "\w" and "\-", which are invalid
    # escape sequences in ordinary string literals.
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'

    # Remove URLs and mentions before punctuation stripping deletes the
    # "@", ":" and "/" characters these patterns rely on.
    text = re.sub(giant_url_regex, '', text)
    text = re.sub(mention_regex, '', text)

    # Remove Hashtag Signs (before punctuation stripping, which would
    # otherwise delete "#" and glue adjacent hashtags together).
    text = re.sub(r"#", " ", text)

    # Remove Punctuations
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations_list = arabic_punctuations + english_punctuations
    text = text.translate(str.maketrans('', '', punctuations_list))

    # Remove emoji
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Remove Stop Words
    text = " ".join([word for word in word_tokenize(text) if word not in stopwords])
    return str(text)

In [None]:
def stem(text, stemmer=None):
    """Stem every token of *text* and re-join the stems with single spaces.

    Args:
        text: the string to stem; it is tokenised with ``word_tokenize``.
        stemmer: optional object exposing ``stem(word) -> str``. When omitted,
            the module-level ``st`` instance (an ``ISRIStemmer``, NLTK's
            Arabic stemmer) is used, exactly as before. Pass a different
            stemmer (e.g. ``PorterStemmer()``) for non-Arabic text.

    Returns:
        The space-joined stems; an empty string for input without tokens.
    """
    if stemmer is None:
        stemmer = st
    return " ".join(stemmer.stem(word) for word in word_tokenize(text))

In [None]:
# Clean every Arabic tweet and keep the result in a new 'clean' column.
# NOTE(review): two functions named `clean` are defined in this file; confirm
# the Arabic one is the active definition when this cell runs.
df['clean'] = df['tweet'].apply(clean)

In [None]:
# Stem the cleaned Arabic tweets into a new 'stemmed' column.
df['stemmed'] = df['clean'].apply(stem)

In [None]:
# Clean every lemma of the Arabic lexicon into a new 'clean' column.
New_Arabic_Lexicon['clean'] = New_Arabic_Lexicon['lemma'].apply(clean)

In [None]:
# Stem the cleaned Arabic lemmas into a new 'stem' column.
New_Arabic_Lexicon['stem'] = New_Arabic_Lexicon['clean'].apply(stem)

In [None]:
# Recompute the 'clean' and 'stem' columns of the Arabic lexicon in one cell
# (the same two assignments as the two preceding cells).
New_Arabic_Lexicon['clean'] = New_Arabic_Lexicon['lemma'].apply(clean)
New_Arabic_Lexicon['stem'] = New_Arabic_Lexicon['clean'].apply(stem)

In [None]:
# NaN marker; also reused by the tweet-export cell below.
nan_value = float("NaN")
# Turn empty strings anywhere in the frame into NaN so dropna can see them.
New_Arabic_Lexicon.replace("", nan_value, inplace=True)
# Discard lexicon entries whose cleaned form is missing.
New_Arabic_Lexicon.dropna(subset = ["clean"], inplace=True)
# Write the result to disk. reset_index() keeps the old index as a column and
# to_csv() writes the new index as well (pandas defaults), so the file
# carries both.
New_Arabic_Lexicon.reset_index().to_csv('old_lexicon.csv')

In [None]:
# Turn empty strings anywhere in the frame into NaN so dropna can see them.
# `nan_value` comes from the lexicon-export cell above.
df.replace("", nan_value, inplace=True)
# Discard tweets whose cleaned form is missing.
df.dropna(subset = ["clean"], inplace=True)
# Write the result to disk. reset_index() keeps the old index as a column and
# to_csv() writes the new index as well (pandas defaults), so the file
# carries both.
df.reset_index().to_csv('data.csv')

## For English

In [None]:
# Clean every English tweet and keep the result in a new 'clean' column.
df_En['clean'] = df_En['tweet'].apply(clean)

In [None]:
# Stem the cleaned English tweets into a new 'stemmed' column.
# NOTE(review): stem() uses the module-level `st`, an ISRIStemmer instance —
# confirm that stemmer is intended for English text.
df_En['stemmed'] = df_En['clean'].apply(stem)

In [None]:
# Clean every lemma of the English lexicon into a new 'clean' column.
New_English_lexicon['clean'] = New_English_lexicon['lemma'].apply(clean)

In [None]:
# Stem the cleaned English lemmas into a new 'stem' column.
# NOTE(review): stem() uses the module-level `st`, an ISRIStemmer instance —
# confirm that stemmer is intended for English text.
New_English_lexicon['stem'] = New_English_lexicon['clean'].apply(stem)