In [2]:
import pandas as pd
import os


## Base lexicon

In [5]:
# Directory holding the lexicon-of-abusive-words files, and the base lexicon inside it
path = r"../Assignment 3 - Group 11/Data/lexicon-of-abusive-words-master/Lexicons/"
base_lexicon_path = os.path.join(path, "baseLexicon.txt")

# Parsed as tab-separated text with no header row, so the two column names are supplied here
df_base_lexicon = pd.read_csv(
    base_lexicon_path,
    sep='\t',
    header=None,
    names=["word", "label"],
    encoding="utf-8",
)

# Preview the first rows
df_base_lexicon.head()

Unnamed: 0,word,label
0,Hun_noun,True
1,Jap_noun,True
2,Jihadi_noun,True
3,Yardie_noun,True
4,abasement_noun,False


In [4]:
# Function to calculate statistics
def calculate_statistics(df: pd.DataFrame) -> None:
    """Print how many lexicon entries are labelled abusive vs. non-abusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column. Rows whose label equals ``True`` are
        counted as abusive, rows whose label equals ``False`` as non-abusive.

    Returns
    -------
    None
        The four statistics lines are printed, nothing is returned.

    Raises
    ------
    KeyError
        If ``df`` has no ``label`` column.
    """
    labels = df["label"]
    total_count = len(df)
    abusive_count = int(labels.eq(True).sum())
    non_abusive_count = int(labels.eq(False).sum())

    # An empty lexicon has no meaningful ratio; report 0% instead of raising
    # ZeroDivisionError.
    if total_count:
        abusive_percentage = (abusive_count / total_count) * 100
    else:
        abusive_percentage = 0.0

    # Print statistics
    print(f"Total words in lexicon: {total_count}")
    print(f"Abusive words: {abusive_count}")
    print(f"Non-abusive words: {non_abusive_count}")
    print(f"Percentage of abusive words: {abusive_percentage:.2f}%")

# Print the total, abusive and non-abusive word counts (and the abusive share)
# for the base lexicon loaded above
calculate_statistics(df_base_lexicon)

Total words in lexicon: 1650
Abusive words: 551
Non-abusive words: 1099
Percentage of abusive words: 33.39%


## Feature-based (expanded) lexicon

In [6]:
# Build the path from the lexicon directory explicitly instead of the shared
# `path` variable: later cells reassign `path` to the parent Data/ directory, so
# re-running this cell after them would look for the file in the wrong folder.
lexicon_dir = r"../Assignment 3 - Group 11/Data/lexicon-of-abusive-words-master/Lexicons/"
featur_base_lexicon_path = os.path.join(lexicon_dir, "expandedLexicon.txt")

# Parsed as tab-separated text with no header row.
# NOTE(review): the second column is named "label", but the preview below shows
# numeric values rather than True/False flags. The name is kept so that any
# code reading the "label" column keeps working -- confirm and rename if safe.
df_featur_base_lexicon = pd.read_csv(
    featur_base_lexicon_path,
    sep='\t',
    header=None,
    names=["word", "label"],
    encoding="utf-8",
)
df_featur_base_lexicon.head()

Unnamed: 0,word,label
0,horrible_noun,3.679601
1,disgusting_adj,3.493682
2,moron_noun,3.469677
3,bastard_noun,3.399238
4,stupid_noun,3.323882


## hurtlex_EN

In [4]:
# hurtlex_EN.tsv sits directly in the assignment's Data directory
path = r"../Assignment 3 - Group 11/Data/"
hurtlex_EN = os.path.join(path, "hurtlex_EN.tsv")

# Load the tab-separated file and preview its first rows
df_hurtlex_EN = pd.read_csv(hurtlex_EN, encoding='utf-8', sep='\t')
df_hurtlex_EN.head()

Unnamed: 0,id,pos,category,stereotype,lemma,level
0,EN1382,n,qas,no,gag reel,inclusive
1,EN7077,a,cds,no,snotty,conservative
2,EN6856,n,is,yes,mendicant,conservative
3,EN5485,n,re,no,maffias,conservative
4,EN5024,n,cds,no,lying in trade,conservative


In [6]:
def report_statistics(df: pd.DataFrame, include=None) -> None:
    """Print a structural overview of a DataFrame.

    The report lists, in order: the number of rows and columns, the column
    names, each column's dtype, the ``DataFrame.describe`` summary, the number
    of unique values per column and the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to report on.
    include : optional
        Forwarded unchanged to ``DataFrame.describe(include=...)``. The default
        ``None`` is the same as calling ``describe()`` with no arguments; pass
        ``"all"`` to summarise every column of a mixed-dtype frame.

    Returns
    -------
    None
        Everything is printed, nothing is returned.
    """
    # Basic statistics
    num_rows = len(df)
    column_names = df.columns.tolist()
    num_columns = len(column_names)

    # describe() raises ValueError on a frame without columns, so only call it
    # when there is something to summarise.
    summary_stats = df.describe(include=include) if num_columns else None

    # Unique and missing values per column
    unique_values = df.nunique()
    missing_values = df.isnull().sum()

    # Display statistics in a more organized format
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_columns}\n")

    print("Column names:")
    for column in column_names:
        print(f"  - {column}")

    print("\nData types of columns:")
    for column, data_type in df.dtypes.items():
        print(f"  - {column}: {data_type}")

    print("\nSummary statistics:")
    if summary_stats is None:
        print("(no columns to summarise)")
    else:
        print(summary_stats)

    print("\nUnique values in each column:")
    for column, unique_count in unique_values.items():
        print(f"  - {column}: {unique_count} unique values")

    print("\nMissing values in each column:")
    for column, missing_count in missing_values.items():
        print(f"  - {column}: {missing_count} missing values")

# Print the overview (shape, column names, dtypes, describe() summary, unique
# and missing counts) for the HurtLex frame loaded above
report_statistics(df_hurtlex_EN)


Number of rows: 8228
Number of columns: 6

Column names:
  - id
  - pos
  - category
  - stereotype
  - lemma
  - level

Data types of columns:
  - id: object
  - pos: object
  - category: object
  - stereotype: object
  - lemma: object
  - level: object

Summary statistics:
           id   pos category stereotype lemma      level
count    8228  8228     8228       8228  8228       8228
unique   6072     4       17          2  5963          2
top     EN513     n      cds         no  fool  inclusive
freq       11  7401     2204       6248    11       4868

Unique values in each column:
  - id: 6072 unique values
  - pos: 4 unique values
  - category: 17 unique values
  - stereotype: 2 unique values
  - lemma: 5963 unique values
  - level: 2 unique values

Missing values in each column:
  - id: 0 missing values
  - pos: 0 missing values
  - category: 0 missing values
  - stereotype: 0 missing values
  - lemma: 0 missing values
  - level: 0 missing values


## mol.csv


In [7]:
# mol.csv sits directly in the assignment's Data directory
path = r"../Assignment 3 - Group 11/Data/"
mol = os.path.join(path, "mol.csv")

# Parse the CSV with pandas' default settings, then preview the first rows
df_mol = pd.read_csv(mol)
df_mol.head()


Unnamed: 0,term-or-expression,explicit-or-implicit,pt-brazilian-portuguese,pt-contextual-label,pt-hate-label,pt-deeply-culture-rooted,en-american-english,en-contextual-label,en-hate-label,es-latin-spanish,...,fr-african-french,fr-contextual-label,fr-hate-Label,ge-german,ge-contextual-label,ge-hate-Label,tu-turkish,tu-contextual-label,tu-hate-Label,Meaning Sources
0,term,explicit,chorume,1.0,0,0.0,rotten,0.0,0,0,...,0,0.0,0,Scheiße,0.0,0,bulamaç,0.0,0,https://michaelis.uol.com.br/moderno-portugues...
1,term,explicit,baixaria,1.0,0,0.0,fuckfest,1.0,0,vileza,...,0,0.0,0,Betrüger,1.0,0,değersiz,1.0,0,https://michaelis.uol.com.br/moderno-portugues...
2,term,explicit,cu,0.0,0,0.0,asshole,0.0,0,culo,...,cul,0.0,0,Arsch,0.0,0,büzük,0.0,0,https://michaelis.uol.com.br/moderno-portugues...
3,expression,explicit,cu pra tomar,1.0,0,0.0,out on your ass,1.0,0,culo para coger,...,cul à prendre,1.0,0,Schieb es dir sonst wohin,1.0,0,götüne sok,1.0,0,https://dictionary.cambridge.org/pt/dicionario...
4,term,explicit,vagabundo,1.0,0,0.0,degenerate,1.0,0,vagabundo,...,clochard,1.0,0,Penner,1.0,0,serseri,1.0,0,https://michaelis.uol.com.br/moderno-portugues...


In [8]:
# NOTE(review): report_statistics is also defined in the hurtlex_EN section of
# this notebook; this later definition shadows it once the cell runs.
# TODO: keep a single definition and only call it here.
def report_statistics(df: pd.DataFrame, include=None) -> None:
    """Print a structural overview of a DataFrame.

    The report lists, in order: the number of rows and columns, the column
    names, each column's dtype, the ``DataFrame.describe`` summary, the number
    of unique values per column and the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to report on.
    include : optional
        Forwarded unchanged to ``DataFrame.describe(include=...)``. The default
        ``None`` is the same as calling ``describe()`` with no arguments; pass
        ``"all"`` to summarise every column of a mixed-dtype frame.

    Returns
    -------
    None
        Everything is printed, nothing is returned.
    """
    # Basic statistics
    num_rows = len(df)
    column_names = df.columns.tolist()
    num_columns = len(column_names)

    # describe() raises ValueError on a frame without columns, so only call it
    # when there is something to summarise.
    summary_stats = df.describe(include=include) if num_columns else None

    # Unique and missing values per column
    unique_values = df.nunique()
    missing_values = df.isnull().sum()

    # Display statistics in a more organized format
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_columns}\n")

    print("Column names:")
    for column in column_names:
        print(f"  - {column}")

    print("\nData types of columns:")
    for column, data_type in df.dtypes.items():
        print(f"  - {column}: {data_type}")

    print("\nSummary statistics:")
    if summary_stats is None:
        print("(no columns to summarise)")
    else:
        print(summary_stats)

    print("\nUnique values in each column:")
    for column, unique_count in unique_values.items():
        print(f"  - {column}: {unique_count} unique values")

    print("\nMissing values in each column:")
    for column, missing_count in missing_values.items():
        print(f"  - {column}: {missing_count} missing values")
        
# Print the overview (shape, column names, dtypes, describe() summary, unique
# and missing counts) for the MOL frame loaded above
report_statistics(df_mol)

Number of rows: 1011
Number of columns: 22

Column names:
  - term-or-expression
  - explicit-or-implicit
  - pt-brazilian-portuguese
  - pt-contextual-label
  - pt-hate-label
  - pt-deeply-culture-rooted
  - en-american-english
  - en-contextual-label
  - en-hate-label
  - es-latin-spanish
  - es-contextual-label
  - es-hate-label
  - fr-african-french
  - fr-contextual-label
  - fr-hate-Label
  - ge-german
  - ge-contextual-label
  - ge-hate-Label
  - tu-turkish
  - tu-contextual-label
  - tu-hate-Label
  - Meaning Sources

Data types of columns:
  - term-or-expression: object
  - explicit-or-implicit: object
  - pt-brazilian-portuguese: object
  - pt-contextual-label: float64
  - pt-hate-label: object
  - pt-deeply-culture-rooted: float64
  - en-american-english: object
  - en-contextual-label: float64
  - en-hate-label: object
  - es-latin-spanish: object
  - es-contextual-label: float64
  - es-hate-label: object
  - fr-african-french: object
  - fr-contextual-label: float64
  - fr