In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("merged_lexicon.csv")
df

Unnamed: 0,unit,hate-label
0,%e2%80%99ndrangheta,0
1,'ndrangheta,0
2,"(0, 0, 0)",1
3,(to get) dumped,0
4,(white) trash,0
...,...,...
12465,👮,0
12466,👶,1
12467,💢,1
12468,🚻,1


In [3]:
# Function to calculate statistics
def calculate_statistics(df):
    total_count = len(df)
    abusive_count = df[df["hate-label"] == 1].shape[0]
    non_abusive_count = df[df["hate-label"] == 0].shape[0]

    # Calculate the percentage of abusive words
    abusive_percentage = (abusive_count / total_count) * 100

    # Print statistics
    print(f"Total words in lexicon: {total_count}")
    print(f"Abusive words: {abusive_count}")
    print(f"Non-abusive words: {non_abusive_count}")
    print(f"Percentage of abusive words: {abusive_percentage:.2f}%")

# Calculate statistics for the base lexicon
calculate_statistics(df)

Total words in lexicon: 12470
Abusive words: 7648
Non-abusive words: 4822
Percentage of abusive words: 61.33%


# POS tags with Spacy

In [4]:
#  !pip install spacy
# python -m spacy download en_core_web_sm

In [5]:
import spacy

# Load the spaCy language model
nlp = spacy.load("en_core_web_sm")

# Function to extract POS tags
def extract_pos(text):
    doc = nlp(text)
    pos_tags = [token.pos_ for token in doc]
    return pos_tags

# Apply the function to your DataFrame and create a new column
df["pos_tags"] = df["unit"].apply(extract_pos)

# Display the DataFrame with POS tags
# print(df)


In [6]:
df.head()

Unnamed: 0,unit,hate-label,pos_tags
0,%e2%80%99ndrangheta,0,"[NOUN, NUM]"
1,'ndrangheta,0,"[PUNCT, NOUN]"
2,"(0, 0, 0)",1,"[PUNCT, NUM, PUNCT, NUM, PUNCT, NUM, PUNCT]"
3,(to get) dumped,0,"[PUNCT, PART, AUX, PUNCT, VERB]"
4,(white) trash,0,"[PUNCT, ADJ, PUNCT, NOUN]"


In [7]:
# Filter the DataFrame to keep only rows with a single pos_tag
df_single_pos = df[df['pos_tags'].apply(lambda x: len(x) == 1)]

# Display the new dataset
df_single_pos

Unnamed: 0,unit,hate-label,pos_tags
5,0,1,[PUNCT]
6,187,1,[NUM]
7,1984,0,[NUM]
9,2b1ask1,1,[NUM]
12,419,1,[NUM]
...,...,...,...
12465,👮,0,[X]
12466,👶,1,[ADP]
12467,💢,1,[NOUN]
12468,🚻,1,[X]


In [8]:
import pandas as pd
# df_single_pos = df
# Create a new DataFrame with 'pos_tags' as a string
df_single_pos['pos_tags_str'] = df_single_pos['pos_tags'].apply(lambda x: x[0])

# Group the data by 'hate-label' and 'pos_tags_str' and count the occurrences
pos_tag_counts = df_single_pos.groupby(['hate-label', 'pos_tags_str']).size().unstack(fill_value=0)

# Calculate the total count for each 'hate-label' category
total_counts = df_single_pos['hate-label'].value_counts()

# Calculate the percentage for each 'hate-label' category
percentage_table = pos_tag_counts.divide(total_counts, axis=0) * 100

# Reset the index for a clean table
percentage_table.reset_index(inplace=True)

# Print the percentage table
# percentage_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single_pos['pos_tags_str'] = df_single_pos['pos_tags'].apply(lambda x: x[0])


In [9]:
# Format the DataFrame to display percentages with two decimal places
percentage_table_formatted = percentage_table.copy()
percentage_table_formatted = percentage_table_formatted.round(2)
display(percentage_table_formatted)
latex_table = percentage_table_formatted.to_latex(index=False)
latex_table

pos_tags_str,index,ADJ,ADP,ADV,AUX,CCONJ,INTJ,NOUN,NUM,PRON,PROPN,PUNCT,VERB,X
0,0,23.14,0.2,2.27,0.07,0.0,0.11,37.23,0.09,0.02,7.53,0.02,29.17,0.15
1,1,12.72,0.19,4.26,0.0,0.03,0.24,47.28,0.11,0.05,15.02,0.11,19.77,0.22


  latex_table = percentage_table_formatted.to_latex(index=False)


'\\begin{tabular}{rrrrrrrrrrrrrr}\n\\toprule\n index &   ADJ &  ADP &  ADV &  AUX &  CCONJ &  INTJ &  NOUN &  NUM &  PRON &  PROPN &  PUNCT &  VERB &    X \\\\\n\\midrule\n     0 & 23.14 & 0.20 & 2.27 & 0.07 &   0.00 &  0.11 & 37.23 & 0.09 &  0.02 &   7.53 &   0.02 & 29.17 & 0.15 \\\\\n     1 & 12.72 & 0.19 & 4.26 & 0.00 &   0.03 &  0.24 & 47.28 & 0.11 &  0.05 &  15.02 &   0.11 & 19.77 & 0.22 \\\\\n\\bottomrule\n\\end{tabular}\n'

# POS tags with NLTK

In [10]:
import nltk
from nltk import word_tokenize, pos_tag

In [11]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
# Function to extract POS tags using NLTK
def extract_pos_nltk(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    pos_tags = [tag for _, tag in pos_tags]
    return pos_tags

# Apply the function to your DataFrame and create a new column
df["pos_tags"] = df["unit"].apply(extract_pos_nltk)
df

Unnamed: 0,unit,hate-label,pos_tags
0,%e2%80%99ndrangheta,0,"[NN, CC, NN, CD, NN, CD]"
1,'ndrangheta,0,[NNS]
2,"(0, 0, 0)",1,"[(, CD, ,, CD, ,, CD, )]"
3,(to get) dumped,0,"[(, TO, VB, ), VBD]"
4,(white) trash,0,"[(, JJ, ), NN]"
...,...,...,...
12465,👮,0,[NN]
12466,👶,1,[NN]
12467,💢,1,[NN]
12468,🚻,1,[NN]


In [13]:
# Filter the DataFrame to keep only rows with a single pos_tag
df_single_pos = df[df['pos_tags'].apply(lambda x: len(x) == 1)]

# Display the new dataset
df_single_pos

Unnamed: 0,unit,hate-label,pos_tags
1,'ndrangheta,0,[NNS]
5,0,1,[CD]
6,187,1,[CD]
7,1984,0,[CD]
9,2b1ask1,1,[CD]
...,...,...,...
12465,👮,0,[NN]
12466,👶,1,[NN]
12467,💢,1,[NN]
12468,🚻,1,[NN]


In [14]:
import pandas as pd
# df_single_pos = df
# Create a new DataFrame with 'pos_tags' as a string
df_single_pos['pos_tags_str'] = df_single_pos['pos_tags'].apply(lambda x: x[0])

# Group the data by 'hate-label' and 'pos_tags_str' and count the occurrences
pos_tag_counts = df_single_pos.groupby(['hate-label', 'pos_tags_str']).size().unstack(fill_value=0)

# Calculate the total count for each 'hate-label' category
total_counts = df_single_pos['hate-label'].value_counts()

# Calculate the percentage for each 'hate-label' category
percentage_table = pos_tag_counts.divide(total_counts, axis=0) * 100

# Reset the index for a clean table
percentage_table.reset_index(inplace=True)

# Print the percentage table
# percentage_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_single_pos['pos_tags_str'] = df_single_pos['pos_tags'].apply(lambda x: x[0])


In [15]:
# Format the DataFrame to display percentages with two decimal places
percentage_table_formatted = percentage_table.copy()
percentage_table_formatted = percentage_table_formatted.round(2)
display(percentage_table_formatted)
latex_table = percentage_table_formatted.to_latex(index=False)
latex_table

pos_tags_str,index,CD,IN,JJ,JJR,JJS,MD,NN,NNS,RB,VB,VBD,VBG,VBN
0,0,0.04,0.07,15.42,0.04,0.02,0.02,71.96,1.83,1.07,0.76,0.39,4.04,4.32
1,1,0.06,0.02,7.66,0.0,0.17,0.0,73.71,10.0,2.48,0.26,0.18,3.31,2.17


  latex_table = percentage_table_formatted.to_latex(index=False)


'\\begin{tabular}{rrrrrrrrrrrrrr}\n\\toprule\n index &   CD &   IN &    JJ &  JJR &  JJS &   MD &    NN &   NNS &   RB &   VB &  VBD &  VBG &  VBN \\\\\n\\midrule\n     0 & 0.04 & 0.07 & 15.42 & 0.04 & 0.02 & 0.02 & 71.96 &  1.83 & 1.07 & 0.76 & 0.39 & 4.04 & 4.32 \\\\\n     1 & 0.06 & 0.02 &  7.66 & 0.00 & 0.17 & 0.00 & 73.71 & 10.00 & 2.48 & 0.26 & 0.18 & 3.31 & 2.17 \\\\\n\\bottomrule\n\\end{tabular}\n'