In [None]:
import pandas as pd

# Only the tone-classification output is loaded for now; the specificity results
# are not available yet.
llama, mistral, aya = map(
    pd.read_csv,
    ('classification_LLama.csv', 'classification_Mistral.csv', 'classification_Aya.csv'),
)

# Stack the three per-model tables into one frame (each table keeps its own row index)
data = pd.concat([llama, mistral, aya])

# Descriptor table
descriptors_path = 'Descriptors Translated - Descriptors.csv'
descriptors_df = pd.read_csv(descriptors_path)



In [None]:
import itertools
# Template-ID layout, exactly as the hand-written ranges encoded it: IDs come in
# blocks of 1680, each block holds four consecutive language slices of 420 IDs
# (English, Dutch, Chinese, Italian), and there are three such blocks.
# NOTE(review): presumably one 1680-ID block per model -- confirm against the CSVs.
LANGUAGE_ORDER = ['English', 'Dutch', 'Chinese', 'Italian']
IDS_PER_LANGUAGE = 420
IDS_PER_BLOCK = IDS_PER_LANGUAGE * len(LANGUAGE_ORDER)  # 1680
NUM_BLOCKS = 3

# Lists instead of itertools.chain objects: a chain is a one-shot iterator, so a
# second use of ranges[...] would be empty and silently select no rows at all.
ranges = {
    language: [
        block * IDS_PER_BLOCK + position * IDS_PER_LANGUAGE + offset
        for block in range(NUM_BLOCKS)
        for offset in range(IDS_PER_LANGUAGE)
    ]
    for position, language in enumerate(LANGUAGE_ORDER)
}

# Split the main dataframe into language-specific dataframes
english_df = data[data['template_id'].isin(ranges['English'])]
dutch_df = data[data['template_id'].isin(ranges['Dutch'])]
chinese_df = data[data['template_id'].isin(ranges['Chinese'])]
italian_df = data[data['template_id'].isin(ranges['Italian'])]


# Individual results (wrong colours)

In [None]:
# Colour assigned to each tone-classification label
color_map = dict([
    ('Care', 'orchid'),
    ('Sympathy', 'deepskyblue'),
    ('Patronising', 'tomato'),
    ('Disbelief', 'darkorange'),
    ('Curiosity', 'limegreen'),
    ('None of the above', 'gold'),
])

import pandas as pd
import matplotlib.pyplot as plt

# Load the descriptors (read again in this cell, so the 'axis' column below is
# always added to a freshly loaded frame)
descriptors_path = 'Descriptors Translated - Descriptors.csv'
descriptors_df = pd.read_csv(descriptors_path)

# Number of descriptors on each axis.
# NOTE(review): assumes the rows of the descriptor file are grouped by axis in
# exactly this order -- confirm against the spreadsheet.
num_descriptors = {
    'Sexuality': 10,
    'Religion': 11,
    'Gender': 10,
    'Age': 17,
    'Disability': 12,
    'Nonce': 10
}

# Build the 'axis' column from num_descriptors instead of repeating the counts by
# hand, so the labels and the counts cannot drift apart.
axis_values = [axis for axis, count in num_descriptors.items() for _ in range(count)]
if len(axis_values) != len(descriptors_df):
    raise ValueError(
        f"num_descriptors describes {len(axis_values)} descriptors but the descriptor "
        f"file has {len(descriptors_df)} rows"
    )
descriptors_df['axis'] = axis_values

# Display the column names and the first few rows of the descriptors file to verify the structure
print(descriptors_df.columns)
print(descriptors_df.head())

# Function to map descriptors to template IDs
def map_descriptors_to_ids(base_id, num_descriptors, templates_per_descriptor=6, descriptors=None):
    """Assign a consecutive run of template IDs to every descriptor.

    Axes are walked in the order of ``num_descriptors``; within an axis the
    descriptors are taken in the row order of the descriptor table. Each
    descriptor receives ``templates_per_descriptor`` consecutive IDs, starting
    at ``base_id`` for the first descriptor of the first axis.

    Args:
        base_id: First template ID to hand out.
        num_descriptors: Mapping of axis name -> expected number of descriptors.
        templates_per_descriptor: How many consecutive IDs each descriptor owns
            (defaults to 6, the value that used to be hard-coded).
        descriptors: DataFrame with 'axis' and 'English' columns. Defaults to
            the notebook-level ``descriptors_df``.

    Returns:
        dict mapping the English descriptor text to its list of template IDs.
        If the same English text occurs more than once, the later occurrence
        overwrites the earlier one.

    Raises:
        ValueError: If an axis holds a different number of descriptors than
            ``num_descriptors`` declares; the IDs of every following axis
            would otherwise be shifted without any warning.
    """
    if descriptors is None:
        descriptors = descriptors_df

    mapping = {}
    next_id = base_id
    for axis, count in num_descriptors.items():
        names = descriptors.loc[descriptors['axis'] == axis, 'English'].tolist()
        if len(names) != count:
            raise ValueError(
                f"Axis {axis!r}: expected {count} descriptors, found {len(names)}"
            )
        for name in names:
            mapping[name] = list(range(next_id, next_id + templates_per_descriptor))
            next_id += templates_per_descriptor
    return mapping

# One descriptor -> template-ID map per language; each language starts at its own first ID
english_descriptor_map, dutch_descriptor_map, chinese_descriptor_map, italian_descriptor_map = (
    map_descriptors_to_ids(first_id, num_descriptors)
    for first_id in (0, 420, 840, 1260)
)

# Show the first five (descriptor, template IDs) pairs of the English map
sample_items = list(english_descriptor_map.items())[:5]
print("English Descriptor Map Sample:", sample_items)

# Function to analyze classification distribution for each descriptor grouped by axis
def analyze_descriptor_distribution_by_axis(descriptor_map, df, descriptors=None, axes=None):
    """Count classification labels per descriptor, grouped by axis.

    Args:
        descriptor_map: Mapping of English descriptor text -> list of template IDs.
        df: DataFrame with 'template_id' and 'classification' columns.
        descriptors: DataFrame with 'axis' and 'English' columns used to decide
            which descriptor belongs to which axis. Defaults to the
            notebook-level ``descriptors_df``.
        axes: Iterable of axis names to report. Defaults to the keys of the
            notebook-level ``num_descriptors``.

    Returns:
        dict of axis -> dict of descriptor -> ``value_counts()`` Series of the
        'classification' column for the rows whose template ID belongs to that
        descriptor. Descriptors keep the order they have in ``descriptor_map``.

    NOTE(review): rows are matched on the raw template ID, so only IDs listed in
    ``descriptor_map`` are counted -- confirm that this covers every row of ``df``.
    """
    if descriptors is None:
        descriptors = descriptors_df
    if axes is None:
        axes = num_descriptors.keys()

    axis_analysis = {}
    for axis in axes:
        # Look the axis membership up once per axis instead of once per map entry
        axis_names = set(descriptors.loc[descriptors['axis'] == axis, 'English'])

        descriptor_analysis = {}
        for descriptor, template_ids in descriptor_map.items():
            if descriptor not in axis_names:
                continue
            rows = df[df['template_id'].isin(template_ids)]
            descriptor_analysis[descriptor] = rows['classification'].value_counts()
        axis_analysis[axis] = descriptor_analysis
    return axis_analysis

# Run the per-descriptor analysis once per language, pairing each descriptor map
# with the matching language frame
(
    english_axis_analysis,
    dutch_axis_analysis,
    chinese_axis_analysis,
    italian_axis_analysis,
) = (
    analyze_descriptor_distribution_by_axis(descriptor_map, language_df)
    for descriptor_map, language_df in (
        (english_descriptor_map, english_df),
        (dutch_descriptor_map, dutch_df),
        (chinese_descriptor_map, chinese_df),
        (italian_descriptor_map, italian_df),
    )
)

# Example: print the English counts per axis as a table (one row per descriptor,
# one column per classification, missing combinations shown as 0)
for axis, analysis in english_axis_analysis.items():
    counts_table = pd.DataFrame(analysis)
    english_axis_analysis_df = counts_table.fillna(0).T
    print(f"English - {axis}:\n", english_axis_analysis_df.head())

# Function to create a grid of pie charts for descriptor distributions by axis
def plot_descriptor_distribution(language, descriptor_analysis):
    """Draw one figure per axis with one pie chart per descriptor.

    Args:
        language: Language name, used in the figure title.
        descriptor_analysis: Mapping of axis -> mapping of descriptor ->
            Series of classification counts (index = classification label).

    Every classification label is drawn in the colour given by the
    notebook-level ``color_map`` so that the same label has the same colour in
    every pie; labels missing from ``color_map`` are drawn in light grey.
    """
    n_cols = 3
    for axis, descriptors in descriptor_analysis.items():
        if not descriptors:
            # Nothing to draw for this axis, and a subplot grid needs at least one row
            continue

        n_rows = (len(descriptors) + n_cols - 1) // n_cols
        # squeeze=False keeps `axes` two-dimensional even when there is a single row
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 12), squeeze=False)
        fig.suptitle(f'Classification Distribution for {language} - Axis: {axis}', fontsize=16)
        panels = axes.ravel()

        for ax, (descriptor, counts) in zip(panels, descriptors.items()):
            if counts.empty:
                ax.set_visible(False)
                continue
            # Colour by label, not by slice position
            colors = [color_map.get(label, 'lightgrey') for label in counts.index]
            counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, colors=colors, ax=ax)
            ax.set_title(descriptor)
            ax.set_ylabel('Count')
            ax.set_xlabel('Classification')

        # Hide the grid cells left over when the descriptor count is not a multiple of 3
        for ax in panels[len(descriptors):]:
            ax.set_visible(False)

        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()
        # Release the figure; one is created per axis and per language
        plt.close(fig)

# Draw the per-descriptor pie charts for every language, English first
for language_name, language_analysis in (
    ("English", english_axis_analysis),
    ("Dutch", dutch_axis_analysis),
    ("Chinese", chinese_axis_analysis),
    ("Italian", italian_axis_analysis),
):
    plot_descriptor_distribution(language_name, language_analysis)


# Average per language per axis (doesn't work)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Function to analyze classifications for each axis
def analyze_classifications(axes):
    """Count how often each classification label occurs on every axis.

    Args:
        axes: Mapping of axis name -> DataFrame with a 'classification' column.

    Returns:
        dict of axis name -> ``value_counts()`` Series of that column.
    """
    counts_by_axis = {}
    for axis_name, axis_df in axes.items():
        counts_by_axis[axis_name] = axis_df['classification'].value_counts()
    return counts_by_axis

# The per-axis frames (english_axes, ...) were never created anywhere in the
# notebook, so this cell failed with a NameError on a fresh kernel. They are
# rebuilt here from the descriptor -> template-ID maps.
# NOTE(review): this reconstructs what the *_axes dicts appear to have been
# (axis name -> rows of that language belonging to the axis) -- confirm.
def split_by_axis(descriptor_map, language_df):
    """Split one language's rows into one DataFrame per axis.

    Args:
        descriptor_map: Mapping of English descriptor text -> list of template IDs.
        language_df: DataFrame with a 'template_id' column.

    Returns:
        dict of axis name -> rows of ``language_df`` whose template ID belongs
        to any descriptor of that axis. Axes follow the order of ``num_descriptors``.
    """
    axis_of_descriptor = dict(zip(descriptors_df['English'], descriptors_df['axis']))
    ids_by_axis = {axis: [] for axis in num_descriptors}
    for descriptor, template_ids in descriptor_map.items():
        ids_by_axis[axis_of_descriptor[descriptor]].extend(template_ids)
    return {
        axis: language_df[language_df['template_id'].isin(ids)]
        for axis, ids in ids_by_axis.items()
    }

english_axes = split_by_axis(english_descriptor_map, english_df)
dutch_axes = split_by_axis(dutch_descriptor_map, dutch_df)
chinese_axes = split_by_axis(chinese_descriptor_map, chinese_df)
italian_axes = split_by_axis(italian_descriptor_map, italian_df)

# Perform the analysis for each language
english_analysis = analyze_classifications(english_axes)
dutch_analysis = analyze_classifications(dutch_axes)
chinese_analysis = analyze_classifications(chinese_axes)
italian_analysis = analyze_classifications(italian_axes)

# Convert analysis results to DataFrames for better readability
# (rows = classification labels, columns = axes; NaN where a label never occurs)
english_analysis_df = pd.DataFrame(english_analysis)
dutch_analysis_df = pd.DataFrame(dutch_analysis)
chinese_analysis_df = pd.DataFrame(chinese_analysis)
italian_analysis_df = pd.DataFrame(italian_analysis)

# Function to display DataFrame
def display_analysis(df, title):
    """Print an analysis table under a heading and draw it as a bar chart.

    Args:
        df: DataFrame to print and plot.
        title: Text used in the printed heading and as the chart title.
    """
    heading = f"\n{title} Analysis:\n"
    print(heading)
    print(df)
    df.plot.bar(figsize=(14, 8), colormap='viridis', title=title)
    plt.show()

# Print and plot the per-axis classification table of every language
for table, table_title in (
    (english_analysis_df, "English"),
    (dutch_analysis_df, "Dutch"),
    (chinese_analysis_df, "Chinese"),
    (italian_analysis_df, "Italian"),
):
    display_analysis(table, table_title)


In [None]:
import matplotlib.pyplot as plt

# Function to create one pie chart per axis for a language
def plot_classification_distribution(language, analysis):
    """Draw one pie chart per axis showing its classification counts.

    Args:
        language: Language name, used in the figure title.
        analysis: Mapping of axis name -> Series of classification counts
            (index = classification label).

    The grid has three columns and as many rows as the number of axes needs
    (six axes give the previous fixed 2 x 3 layout). Every label is drawn in
    the colour given by the notebook-level ``color_map``; labels missing from
    it are drawn in light grey.
    """
    if not analysis:
        return  # nothing to plot, and a subplot grid needs at least one row

    n_cols = 3
    n_rows = (len(analysis) + n_cols - 1) // n_cols
    # squeeze=False keeps `axes` two-dimensional even when there is a single row
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
    fig.suptitle(f'Classification Distribution for {language}', fontsize=16)
    panels = axes.ravel()

    for ax, (axis, counts) in zip(panels, analysis.items()):
        if counts.empty:
            ax.set_visible(False)
            continue
        # Colour by label so the pies of different axes can be compared directly
        colors = [color_map.get(label, 'lightgrey') for label in counts.index]
        counts.plot(kind='pie', colors=colors, ax=ax)
        ax.set_title(axis)
        ax.set_ylabel('Count')
        ax.set_xlabel('Classification')

    # Hide grid cells that have no axis assigned
    for ax in panels[len(analysis):]:
        ax.set_visible(False)

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

# Draw the per-axis classification pies for every language
for language_label, language_counts in (
    ("English", english_analysis),
    ("Dutch", dutch_analysis),
    ("Chinese", chinese_analysis),
    ("Italian", italian_analysis),
):
    plot_classification_distribution(language_label, language_counts)


# Statistical test (doesn't work)

In [None]:
import pandas as pd
from scipy.stats import friedmanchisquare

# Ensure the data is already split by axes and languages as previously done

# Function to prepare data for the Friedman test
def prepare_data_for_friedman(analysis):
    """Arrange per-axis classification counts in one rectangular table.

    Args:
        analysis: Mapping of axis name -> Series of classification counts
            (index = classification label).

    Returns:
        DataFrame with one row per classification label seen on any axis and
        one column per axis. A label that never occurs on an axis gets 0.

    The previous version cut every axis down to its first ``min_count`` labels.
    Those are different labels on different axes, so aligning them produced NaN
    holes and discarded the rarer labels; it also failed on an empty mapping.
    """
    # DataFrame aligns the Series on the union of their labels; fill the gaps with 0
    return pd.DataFrame(analysis).fillna(0)

# Build the Friedman input table of every language from its per-axis counts
english_data, dutch_data, chinese_data, italian_data = map(
    prepare_data_for_friedman,
    (english_analysis, dutch_analysis, chinese_analysis, italian_analysis),
)

# Combine data across languages for each axis
def combine_data_for_axis(axis):
    """Collect one axis from every language table into a single DataFrame.

    Args:
        axis: Column name to pull from the notebook-level ``english_data``,
            ``dutch_data``, ``chinese_data`` and ``italian_data`` tables.

    Returns:
        DataFrame with the columns 'English', 'Dutch', 'Chinese' and 'Italian'.
    """
    per_language = (
        ('English', english_data),
        ('Dutch', dutch_data),
        ('Chinese', chinese_data),
        ('Italian', italian_data),
    )
    return pd.DataFrame({language: table[axis] for language, table in per_language})

# Example for the Gender axis: one column per language, each taken from the
# 'Gender' column of that language's prepared table
gender_data = combine_data_for_axis('Gender')


### Friedman test per language and axis (open question: why do the results not cover all axes?)

In [None]:
import pandas as pd
from scipy.stats import friedmanchisquare

# Function to prepare data for Friedman test
def prepare_friedman_data(df):
    """Group template IDs by classification label into equal-length columns.

    Args:
        df: DataFrame with 'classification' and 'template_id' columns.

    Returns:
        DataFrame with one column per label, in the fixed order listed below.
        Each column holds the template IDs of the rows carrying that label,
        padded at the end with NaN up to the length of the longest column.
    """
    labels = ['Care', 'Sympathy', 'Patronising', 'Disbelief', 'Curiosity', 'None of the above']

    ids_by_label = {}
    for label in labels:
        has_label = df['classification'] == label
        ids_by_label[label] = df.loc[has_label, 'template_id'].tolist()

    # Pad the shorter groups so that every column has the same number of rows
    longest = max(map(len, ids_by_label.values()))
    padded = {
        label: ids + [float('nan')] * (longest - len(ids))
        for label, ids in ids_by_label.items()
    }
    return pd.DataFrame(padded)

# Function to perform the Friedman test
def perform_friedman_test(df):
    """Run the Friedman test across the columns of ``df``.

    Args:
        df: DataFrame whose columns are the groups to compare; NaN marks
            missing values.

    Returns:
        ``(statistic, p_value)`` from ``scipy.stats.friedmanchisquare``, or
        ``(None, None)`` when too little data is left after removing NaNs.
    """
    # Columns that are entirely NaN carry no information
    df = df.dropna(axis=1, how='all')
    # friedmanchisquare raises ValueError for fewer than three groups, so two
    # remaining columns are not enough (the old guard only rejected fewer than two)
    if df.shape[1] < 3:
        return None, None  # Not enough data to perform the test

    # Keep only the rows that have a value in every remaining column
    df = df.dropna(axis=0, how='any')
    if df.shape[0] < 2:
        return None, None  # Not enough data to perform the test

    stat, p = friedmanchisquare(*(df[column] for column in df.columns))
    return stat, p

# Prepare and perform the test for each axis in each language.
# Expects english_axes, dutch_axes, chinese_axes and italian_axes (axis name ->
# DataFrame) to already exist in the kernel; they are not created in this cell.
# NOTE(review): prepare_friedman_data feeds the test with template IDs grouped
# by classification -- confirm that this is the intended measurement.
results = {}
skipped = []  # (language, axis) pairs that did not have enough data to be tested

for language, axes_data in {'English': english_axes, 'Dutch': dutch_axes, 'Chinese': chinese_axes, 'Italian': italian_axes}.items():
    for axis, df in axes_data.items():
        friedman_data = prepare_friedman_data(df)
        stat, p = perform_friedman_test(friedman_data)
        if stat is None or p is None:
            # Remember the pair instead of dropping it silently
            skipped.append((language, axis))
            continue
        results[(language, axis)] = {'statistic': stat, 'p-value': p}

# Convert results to DataFrame for better readability
results_df = pd.DataFrame(results).T
print(results_df)

# Say which combinations are missing from the table above, and why
if skipped:
    print("Skipped (not enough complete data for a Friedman test):", skipped)

# Interpretation of results
def interpret_results(results):
    """Print the Friedman statistic, p-value and a verdict for every entry.

    Args:
        results: Mapping of ``(language, axis)`` -> dict with the keys
            'statistic' and 'p-value'.

    A p-value below 0.05 is reported as a significant difference.
    """
    significant = "  Significant differences found between classifications."
    not_significant = "  No significant differences found between classifications."

    for key, result in results.items():
        language, axis = key
        print(f"Language: {language}, Axis: {axis}")
        print(f"  Friedman statistic: {result['statistic']}")
        p_value = result['p-value']
        print(f"  p-value: {p_value}")
        print(significant if p_value < 0.05 else not_significant)
        print()

# Print the verdict for every (language, axis) pair stored in `results`
interpret_results(results)

