In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Directory prefix prepended to every input filename read in this notebook.
# Hard-coded absolute path: change it here when running in another environment.
Path = "/content/"

In [None]:
# Load the expanded lexicon: tab-separated, no header row.
# Column 1 holds "<lemma>_<pos>", column 2 holds the value that is binarized in a later cell.
expanded_lexicon = pd.read_csv(Path+"expandedLexicon.txt",sep="\t", header = None, names=["lemma", "off_nonoff"] )
# Split on the LAST underscore only, so a lemma that itself contains "_" still
# yields exactly two columns (a plain split would produce extra columns and the
# two-column assignment would raise).
expanded_lexicon[['lemma', 'pos']] = expanded_lexicon['lemma'].str.rsplit('_', n=1, expand=True)

In [None]:
def convert_to_binary(value):
    """Return 1 when ``value`` is strictly greater than zero, otherwise 0."""
    return 1 if value > 0 else 0

In [None]:
# Binarize the "off_nonoff" values: positive -> 1, everything else -> 0.
expanded_lexicon['off_nonoff'] = expanded_lexicon['off_nonoff'].apply(convert_to_binary)
expanded_lexicon.head()

Unnamed: 0,lemma,off_nonoff,pos
0,horrible,1,noun
1,disgusting,1,adj
2,moron,1,noun
3,bastard,1,noun
4,stupid,1,noun


In [None]:
# Upper-case the POS tags so they match the upper-case tag names used for the
# other lexicons in this notebook.
expanded_lexicon['pos'] = expanded_lexicon['pos'].str.upper()
# Show the distinct tags as a sanity check.
expanded_lexicon['pos'].unique()

array(['NOUN', 'ADJ', 'VERB'], dtype=object)

In [None]:
# Load the HurtLex lexicon (tab-separated; first row is used as the header).
hurtlex = pd.read_csv(Path+"hurtlex_EN.tsv", sep='\t')
# Every HurtLex entry is labelled offensive (1); no entry from this file gets label 0.
hurtlex['off_nonoff'] = 1
hurtlex.head()
# hurtlex.to_csv("hurtlex.csv")

Unnamed: 0,id,pos,category,stereotype,lemma,level,off_nonoff
0,EN1382,n,qas,no,gag reel,inclusive,1
1,EN7077,a,cds,no,snotty,conservative,1
2,EN6856,n,is,yes,mendicant,conservative,1
3,EN5485,n,re,no,maffias,conservative,1
4,EN5024,n,cds,no,lying in trade,conservative,1


In [None]:
# Translate HurtLex's short POS codes into the upper-case tag names used for
# the other lexicons.
pos_mapping = {
    'n': 'NOUN',
    'a': 'ADJ',
    'v': 'VERB',
    'av': 'ADV'
}

# Codes missing from the mapping (including tags that were already translated)
# pass through unchanged. A plain `.map(pos_mapping)` would turn them into NaN,
# so running this cell a second time would wipe the whole column.
hurtlex['pos'] = hurtlex['pos'].map(lambda code: pos_mapping.get(code, code))

# Show the distinct tags to check the mapping.
hurtlex['pos'].unique()

array(['NOUN', 'ADJ', 'VERB', 'ADV'], dtype=object)

In [None]:
# Load the MOL lexicon, keep the five columns of interest, rename the English
# term column to "lemma", and drop rows with a missing value in any kept column.
# Built as one chain instead of in-place edits on a column-sliced frame, so the
# cell always yields a fresh frame and can be re-run safely.
mol = (
    pd.read_csv(Path +"mol.csv")
    .loc[:, ['term-or-expression', 'explicit-or-implicit','en-american-english', 'en-contextual-label', 'en-hate-label']]
    .rename(columns={'en-american-english': 'lemma'})
    .dropna()
)
print(mol.shape)
mol.head()

(999, 5)


Unnamed: 0,term-or-expression,explicit-or-implicit,lemma,en-contextual-label,en-hate-label
0,term,explicit,rotten,0.0,0
1,term,explicit,fuckfest,1.0,0
2,term,explicit,asshole,0.0,0
3,expression,explicit,out on your ass,1.0,0
4,term,explicit,degenerate,1.0,0


In [None]:
# Derive the binary label from "en-hate-label": the literal string '0' means
# non-offensive (0); any other value means offensive (1).
# NOTE(review): the comparison is against the string '0', so a numeric column
# would label every row 1 — confirm the column is read as text.
mol['off_nonoff'] = mol['en-hate-label'].ne('0').astype('int64')

In [None]:
# Download the NLTK resources needed for tokenizing and POS-tagging.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
# NLTK 3.9 and later look up these renamed resources instead of the pickled
# ones above; without them word_tokenize / pos_tag raise LookupError.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

# Function to extract POS-tag for a sentence
def pos_tag_sentence(sentence):
    """Return the universal POS tag of the first token in ``sentence``.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag`` using the universal tagset. Only the first token's tag is
    returned; ``None`` is returned when tokenization yields no tokens.
    """
    tagged = nltk.pos_tag(word_tokenize(sentence), tagset='universal')
    if not tagged:
        return None
    # Each entry is a (token, tag) pair; keep the tag of the leading token.
    return tagged[0][1]

# Tag each lemma and store the result in a new "pos" column. For multi-word
# expressions only the first token's tag is stored (e.g. "out on your ass" is
# tagged ADP in the output below).
mol['pos'] = mol['lemma'].apply(pos_tag_sentence)
mol.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


Unnamed: 0,term-or-expression,explicit-or-implicit,lemma,en-contextual-label,en-hate-label,off_nonoff,pos
0,term,explicit,rotten,0.0,0,0,VERB
1,term,explicit,fuckfest,1.0,0,0,NOUN
2,term,explicit,asshole,0.0,0,0,NOUN
3,expression,explicit,out on your ass,1.0,0,0,ADP
4,term,explicit,degenerate,1.0,0,0,NOUN


In [None]:
# Inspect the distinct tags assigned to the MOL entries.
mol['pos'].unique()

array(['VERB', 'NOUN', 'ADP', 'ADJ', 'NUM', 'PRT', '.', 'PRON', 'ADV',
       'DET'], dtype=object)

In [None]:
# Reduce each lexicon to the shared (lemma, pos, off_nonoff) schema, then stack them.
expanded_lexicon = expanded_lexicon[['lemma','pos','off_nonoff']]
hurtlex = hurtlex[['lemma','pos','off_nonoff']]
mol = mol[['lemma','pos','off_nonoff']]
merged_lexicon = pd.concat([expanded_lexicon, hurtlex, mol], axis=0, ignore_index=True)
# Keep one row per lemma, preferring an offensive (1) row when the sources disagree.
# kind='stable' preserves the concat order among rows with the same label, so the
# surviving duplicate is well defined (expanded_lexicon, then hurtlex, then mol)
# instead of depending on the default non-stable sort.
merged_lexicon = merged_lexicon.sort_values(by='off_nonoff', ascending=False, kind='stable').drop_duplicates(subset='lemma', keep='first')
merged_lexicon.to_csv("merged_lexicon.csv")

In [None]:
def classify_text(text, lex, lexicon_df):
    """Classify a text as offensive (1) or not offensive (0) by lexicon lookup.

    Parameters
    ----------
    text : str
        Text to classify. It is lower-cased and split on whitespace, so only
        single-token lemmas can match and punctuation attached to a word
        prevents a match. Non-string values (e.g. NaN from a missing cell)
        are classified as 0.
    lex : object
        Unused; kept so existing callers that pass it keep working.
    lexicon_df : pandas.DataFrame
        Lexicon with a 'lemma' column and an 'off_nonoff' column
        (1 = offensive, 0 = non-offensive).

    Returns
    -------
    int
        1 if more tokens match offensive lemmas than non-offensive lemmas,
        otherwise 0 (ties and texts with no matches are 0).
    """
    # A missing or non-text value has no tokens to look up.
    if not isinstance(text, str):
        return 0

    labels = lexicon_df['off_nonoff']
    # Lower-case the lemmas so the lookup is case-insensitive; drop missing lemmas.
    # NOTE: the sets are rebuilt on every call; callers classifying many texts
    # against one lexicon pay that cost per text.
    offensive_lexicon = set(lexicon_df.loc[labels == 1, 'lemma'].str.lower().dropna())
    nonoffensive_lexicon = set(lexicon_df.loc[labels == 0, 'lemma'].str.lower().dropna())

    words = text.lower().split()  # whitespace tokenization, lower-cased

    # Count how many tokens appear in each lexicon.
    offensive_count = sum(word in offensive_lexicon for word in words)
    nonoffensive_count = sum(word in nonoffensive_lexicon for word in words)

    # Classification rule: offensive only on a strict majority of offensive hits.
    return 1 if offensive_count > nonoffensive_count else 0


# Lexicons to evaluate, keyed by the name printed above each report.
lexion_list = {"Expanded": expanded_lexicon ,"MOL": mol ,"Hurtlex": hurtlex, "Merged" : merged_lexicon}

# Load the OLID test set once: it does not depend on the lexicon being evaluated.
# The code below reads its 'text' column as input and its 'labels' column as the gold labels.
olid_df = pd.read_csv(Path +'olid-test.csv')

for name, lex in lexion_list.items():
    print(name)
    # Alias kept because the loop leaves this name in the notebook namespace.
    lexicon_df = lex

    # Predict a label for every text with the current lexicon; the column is
    # overwritten on each iteration.
    olid_df['Predicted'] = olid_df['text'].apply(lambda x: classify_text(x, lex,lexicon_df))

    # Evaluate the predictions against the gold labels.
    accuracy = accuracy_score(olid_df['labels'], olid_df['Predicted'])
    report = classification_report(olid_df['labels'], olid_df['Predicted'])
    c_m = confusion_matrix(olid_df['labels'], olid_df['Predicted'])

    print(f"Accuracy: {accuracy}")
    print(report)
    print(f"Confusion Matrix:\n {c_m}")


Expanded
Accuracy: 0.7662790697674419
              precision    recall  f1-score   support

           0       0.77      0.96      0.86       620
           1       0.72      0.27      0.39       240

    accuracy                           0.77       860
   macro avg       0.75      0.61      0.62       860
weighted avg       0.76      0.77      0.73       860

Confusion Matrix:
 [[595  25]
 [176  64]]
MOL
Accuracy: 0.7267441860465116
              precision    recall  f1-score   support

           0       0.74      0.97      0.84       620
           1       0.56      0.10      0.16       240

    accuracy                           0.73       860
   macro avg       0.65      0.53      0.50       860
weighted avg       0.69      0.73      0.65       860

Confusion Matrix:
 [[602  18]
 [217  23]]
Hurtlex
Accuracy: 0.5930232558139535
              precision    recall  f1-score   support

           0       0.81      0.57      0.67       620
           1       0.37      0.64      0.47  