In [None]:
# #extract triples from explanations


# import pandas as pd
# import numpy as np
# from collections import Counter

# # Load the dataset
# df = pd.read_csv('generated.csv')

# import pandas as pd
# import spacy
# from spacy.matcher import Matcher

# # Load spaCy's English model
# nlp = spacy.load("en_core_web_sm")

# # Define a list of causal verbs/phrases
# causal_verbs = [
#     "cause", "causes", "causing",
#     "lead to", "leads to", "leading to",
#     "result in", "results in", "resulting in",
#     "trigger", "triggers", "triggering",
#     "stimulate", "stimulates", "stimulating",
#     "disrupt", "disrupts", "disrupting",
#     "release", "releases", "releasing",
#     "produce", "produces", "producing",
#     "increase", "increases", "increasing",
#     "decrease", "decreases", "decreasing",
#     "promote", "promotes", "promoting",
#     "exacerbate", "exacerbates", "exacerbating",
#     "trigger", "triggers", "triggering"
# ]

# # Initialize the Matcher with the shared vocabulary
# matcher = Matcher(nlp.vocab)

# # Add patterns to the matcher for each causal verb
# for verb in causal_verbs:
#     pattern = [{"LOWER": token} for token in verb.split()]
#     matcher.add(verb, [pattern])

# def extract_causal_relationships(text):
#     """
#     Extracts all causal relationships from a given text.
#     Returns a list of tuples (T1, T2, T3).
#     """
#     doc = nlp(text)
#     matches = matcher(doc)
#     causal_relationships = []

#     for match_id, start, end in matches:
#         span = doc[start:end]
#         verb = span.text

#         # Find the subject (nsubj) and object (dobj or pobj) of the verb
#         subject = None
#         obj = None

#         for token in span:
#             # Find the subject
#             for child in token.children:
#                 if child.dep_ in ("nsubj", "nsubjpass"):
#                     subject = child.text
#                     break

#             # Find the object
#             for child in token.children:
#                 if child.dep_ in ("dobj", "pobj"):
#                     obj = child.text
#                     break

#         # If subject or object is not directly connected, attempt to find via subtree
#         if not subject:
#             for tok in span.lefts:
#                 if tok.dep_ == "nsubj":
#                     subject = tok.text
#                     break

#         if not obj:
#             for tok in span.rights:
#                 if tok.dep_ in ("dobj", "pobj"):
#                     obj = tok.text
#                     break

#         # Append the relationship if both subject and object are found
#         if subject and obj:
#             causal_relationships.append((subject, verb, obj))

#     return causal_relationships if causal_relationships else [(None, None, None)]

# # Load the CSV file
# input_csv_path = 'generated.csv'  # Replace with your input CSV file path
# data = pd.read_csv(input_csv_path)

# # Initialize lists to store the extracted relationships
# extracted_data = {
#     "generated": [],
#     "T1": [],
#     "T2": [],
#     "T3": []
# }

# # Iterate over each generated explanation
# for idx, row in data.iterrows():
#     explanation = row['generated']
#     relationships = extract_causal_relationships(explanation)

#     for rel in relationships:
#         extracted_data["generated"].append(explanation)
#         extracted_data["T1"].append(rel[0])
#         extracted_data["T2"].append(rel[1])
#         extracted_data["T3"].append(rel[2])

# # Create a new DataFrame with the extracted relationships
# extracted_df = pd.DataFrame(extracted_data)

# # Optionally, drop rows where all T1, T2, T3 are None
# extracted_df.dropna(subset=["T1", "T2", "T3"], how='all', inplace=True)

# # Save the extracted relationships to a new CSV file
# output_csv_path = 'extracted_causal_relationships.csv'  # Replace with your desired output path
# extracted_df.to_csv(output_csv_path, index=False)

# print(f"Extraction complete. The results have been saved to '{output_csv_path}'.")


In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# Load the dataset
df = pd.read_csv('/content/variations.csv')

# Function to calculate token count
def get_token_count(column):
    """Return the whitespace-token count of every entry in ``column``.

    Entries that are missing according to ``pd.notna`` count as 0 tokens.
    """
    def _count(value):
        if pd.notna(value):
            return len(str(value).split())
        return 0

    return column.apply(_count)

# Function to calculate token-type ratio (TTR)
def token_type_ratio(column):
    """Return (sum of per-row unique tokens) / (sum of per-row tokens).

    Uniqueness is judged within each row, not across the whole column.
    Missing rows contribute nothing; the result is 0 when there are no tokens.
    """
    per_row_tokens = column.apply(lambda value: str(value).split() if pd.notna(value) else [])
    total_tokens = per_row_tokens.apply(len).sum()
    unique_tokens = per_row_tokens.apply(lambda tokens: len(set(tokens))).sum()
    if total_tokens > 0:
        return unique_tokens / total_tokens
    return 0

# Function to calculate lexical diversity
def lexical_diversity(column):
    """Return distinct tokens / total tokens over the whole column.

    All non-missing rows are pooled before counting; the result is 0 when the
    column contains no tokens.
    """
    vocabulary = set()
    n_tokens = 0
    for entry in column.dropna():
        words = str(entry).split()
        vocabulary.update(words)
        n_tokens += len(words)
    return len(vocabulary) / n_tokens if n_tokens > 0 else 0

# Per-row helpers shared by the metric columns built below
def _row_ttr(text):
    """Unique-token share of one text; 0 when the value is missing or has no tokens."""
    if pd.isna(text):
        return 0
    tokens = str(text).split()
    return len(set(tokens)) / len(tokens) if tokens else 0


def _word_counts(text):
    """Word counts of one text; an empty Counter stands in for a missing value."""
    return Counter(str(text).split()) if pd.notna(text) else Counter()


explanations = df['corrected']

# Metrics of the 'corrected' column: length, per-row TTR, column-wide lexical
# diversity (one scalar broadcast to every row) and per-row word frequencies
df['corrected_length'] = get_token_count(explanations)
df['corrected_ttr'] = explanations.apply(_row_ttr)
df['corrected_lexical_diversity'] = lexical_diversity(explanations)
df['corrected_word_freq'] = explanations.apply(_word_counts)

# Triple length: token counts of T1, T2 and T3 added together row by row
triple_roles = {'subject': 'T1', 'predicate': 'T2', 'object': 'T3'}
df['triple_length'] = sum(get_token_count(df[source]) for source in triple_roles.values())

# Per-row TTR of subject, predicate and object
for role, source in triple_roles.items():
    df[f'{role}_ttr'] = df[source].apply(_row_ttr)

# Column-wide lexical diversity of subject, predicate and object
for role, source in triple_roles.items():
    df[f'{role}_lexical_diversity'] = lexical_diversity(df[source])

# Persist the frame with every new column
df.to_csv('updated_dataset.csv', index=False)

print("Updated CSV with new metrics saved as 'updated_dataset.csv'.")


Updated CSV with new metrics saved as 'updated_dataset.csv'.


In [None]:
import pandas as pd
import numpy as np
from collections import Counter

# Load the dataset
df = pd.read_csv('/content/variations.csv')

# Basic statistics: number of explanations and distinct (non-missing) values
# of subject (T1), predicate (T2) and object (T3)
total_explanations = df['corrected'].shape[0]
unique_subjects, unique_predicates, unique_objects = (
    df[triple_part].nunique() for triple_part in ('T1', 'T2', 'T3')
)

# Token-based analysis (split by spaces)
def get_token_count(column):
    """Count whitespace-separated tokens per entry; missing entries give 0."""
    return column.map(lambda cell: 0 if not pd.notna(cell) else len(str(cell).split()))

# Token counts for subjects, predicates, and objects
token_count_sources = {
    'subject_token_count': 'T1',
    'predicate_token_count': 'T2',
    'object_token_count': 'T3',
}
for count_column, source_column in token_count_sources.items():
    df[count_column] = get_token_count(df[source_column])

# Triple length = the three token counts added together
df['triple_length'] = df[list(token_count_sources)].sum(axis=1)

# Summary statistics of the triple length
triple_lengths = df['triple_length']
avg_triple_length = triple_lengths.mean()
median_triple_length = triple_lengths.median()
std_triple_length = triple_lengths.std()
min_triple_length = triple_lengths.min()
max_triple_length = triple_lengths.max()

# Word count of each explanation ('corrected' column)
df['explanation_length'] = get_token_count(df['corrected'])

# Summary statistics of the explanation length
explanation_lengths = df['explanation_length']
avg_explanation_length = explanation_lengths.mean()
median_explanation_length = explanation_lengths.median()
std_explanation_length = explanation_lengths.std()
min_explanation_length = explanation_lengths.min()
max_explanation_length = explanation_lengths.max()

# Token-Type Ratio (TTR)
def token_type_ratio(column):
    """Ratio of summed per-row distinct tokens to summed per-row tokens (0 if no tokens)."""
    def _n_tokens(cell):
        if not pd.notna(cell):
            return 0
        return len(str(cell).split())

    def _n_types(cell):
        if not pd.notna(cell):
            return 0
        return len(set(str(cell).split()))

    total_tokens = column.apply(_n_tokens).sum()
    unique_tokens = column.apply(_n_types).sum()
    return 0 if not total_tokens > 0 else unique_tokens / total_tokens

# TTR for subjects (T1), predicates (T2), objects (T3) and explanations
subject_ttr, predicate_ttr, object_ttr, explanation_ttr = (
    token_type_ratio(df[column_name])
    for column_name in ('T1', 'T2', 'T3', 'corrected')
)

# Lexical Diversity (overall)
def lexical_diversity(column):
    """Distinct tokens divided by total tokens, pooled over all non-missing rows."""
    # Joining rows with a space and splitting once yields the same token
    # sequence as splitting each row separately.
    tokens = ' '.join(str(row) for row in column.dropna()).split()
    if len(tokens) > 0:
        return len(set(tokens)) / len(tokens)
    return 0

# Lexical diversity for subjects, predicates, objects, and explanations
lexical_diversity_subject = lexical_diversity(df['T1'])
lexical_diversity_predicate = lexical_diversity(df['T2'])
lexical_diversity_object = lexical_diversity(df['T3'])
lexical_diversity_explanation = lexical_diversity(df['corrected'])

# Frequency distribution of whole subject/predicate/object strings.
# Missing values are dropped so NaN cannot show up as a "most common" entry
# (nunique(), used for the unique counts, ignores missing values as well).
subject_freq = Counter(df['T1'].dropna())
predicate_freq = Counter(df['T2'].dropna())
object_freq = Counter(df['T3'].dropna())

# Output statistics
print(f"Total Number of Explanations: {total_explanations}")
print(f"Number of Unique Subjects: {unique_subjects}")
print(f"Number of Unique Predicates: {unique_predicates}")
print(f"Number of Unique Objects: {unique_objects}\n")

print("=== Length-based Statistics for Explanations ===")
print(f"Average Explanation Length: {avg_explanation_length}")
print(f"Median Explanation Length: {median_explanation_length}")
print(f"Standard Deviation of Explanation Length: {std_explanation_length}")
print(f"Min Explanation Length: {min_explanation_length}")
print(f"Max Explanation Length: {max_explanation_length}\n")

print("=== Length-based Statistics for Triples ===")
print(f"Average Triple Length: {avg_triple_length}")
print(f"Median Triple Length: {median_triple_length}")
print(f"Standard Deviation of Triple Length: {std_triple_length}")
print(f"Min Triple Length: {min_triple_length}")
print(f"Max Triple Length: {max_triple_length}\n")

print("=== Token/Type Ratios ===")
print(f"Subject Token/Type Ratio: {subject_ttr}")
print(f"Predicate Token/Type Ratio: {predicate_ttr}")
print(f"Object Token/Type Ratio: {object_ttr}")
print(f"Explanation Token/Type Ratio: {explanation_ttr}\n")

print("=== Lexical Diversity ===")
print(f"Lexical Diversity of Subjects: {lexical_diversity_subject}")
print(f"Lexical Diversity of Predicates: {lexical_diversity_predicate}")
print(f"Lexical Diversity of Objects: {lexical_diversity_object}")
print(f"Lexical Diversity of Explanations: {lexical_diversity_explanation}\n")

print("=== Frequency Distributions ===")
print(f"Most Common Subjects: {subject_freq.most_common(5)}")
print(f"Most Common Predicates: {predicate_freq.most_common(5)}")
print(f"Most Common Objects: {object_freq.most_common(5)}")

# Expose the explanations under the name 'Explanations' WITHOUT dropping
# 'corrected': later cells read both names, so an in-place rename makes the
# cells that still use df['corrected'] fail with a KeyError.
df['Explanations'] = df['corrected']


Total Number of Explanations: 101
Number of Unique Subjects: 91
Number of Unique Predicates: 8
Number of Unique Objects: 94

=== Length-based Statistics for Explanations ===
Average Explanation Length: 36.73267326732673
Median Explanation Length: 37.0
Standard Deviation of Explanation Length: 15.285870004097845
Min Explanation Length: 10
Max Explanation Length: 114

=== Length-based Statistics for Triples ===
Average Triple Length: 8.623762376237623
Median Triple Length: 8.0
Standard Deviation of Triple Length: 4.298491561346876
Min Triple Length: 3
Max Triple Length: 22

=== Token/Type Ratios ===
Subject Token/Type Ratio: 1.0
Predicate Token/Type Ratio: 1.0
Object Token/Type Ratio: 0.9874213836477987
Explanation Token/Type Ratio: 0.8711590296495957

=== Lexical Diversity ===
Lexical Diversity of Subjects: 0.6785714285714286
Lexical Diversity of Predicates: 0.07894736842105263
Lexical Diversity of Objects: 0.6415094339622641
Lexical Diversity of Explanations: 0.42749326145552563

=== F

In [None]:

# List of variation columns
variation_columns = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']

# Token-based analysis (split by spaces)
def get_token_count(column):
    """Per-entry number of whitespace-delimited tokens (0 for missing entries)."""
    token_lists = column.apply(lambda cell: str(cell).split() if pd.notna(cell) else [])
    return token_lists.apply(len)

# Token/Type Ratio (TTR)
def token_type_ratio(column):
    """Sum of per-row distinct-token counts over sum of per-row token counts.

    Returns 0 when the column holds no tokens at all.
    """
    token_lists = column.map(lambda cell: str(cell).split() if pd.notna(cell) else [])
    total_tokens = token_lists.map(lambda tokens: len(tokens)).sum()
    unique_tokens = token_lists.map(lambda tokens: len(set(tokens))).sum()
    has_tokens = total_tokens > 0
    return unique_tokens / total_tokens if has_tokens else 0

# Lexical Diversity (overall)
def lexical_diversity(column):
    """Share of distinct tokens among all tokens of the non-missing rows (0 if none)."""
    tokens = []
    for row in column.dropna():
        tokens.extend(str(row).split())
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

# Per-row share of distinct tokens; used for both the *_ttr and the
# *_lexical_diversity columns, which are computed with the same formula here
def _unique_token_share(text):
    if pd.isna(text):
        return 0
    tokens = str(text).split()
    return len(set(tokens)) / len(tokens) if tokens else 0


# Token count of every variation
for var in variation_columns:
    df[f'{var}_length'] = get_token_count(df[var])

# Per-row TTR columns first, then per-row lexical-diversity columns
for metric in ('ttr', 'lexical_diversity'):
    for var in variation_columns:
        df[f'{var}_{metric}'] = df[var].apply(_unique_token_share)

# Compare each variation with the original explanation (token overlap percentage)
def token_overlap_ratio(orig, var):
    """Fraction of the distinct tokens of ``orig`` that also occur in ``var``.

    Returns 0 when either argument is missing or ``orig`` has no tokens.
    """
    if pd.isna(orig) or pd.isna(var):
        return 0
    reference = set(str(orig).split())
    if not reference:
        return 0
    shared = reference.intersection(str(var).split())
    return len(shared) / len(reference)

# One overlap column per variation: each variation is compared with the
# 'Explanations' text of the same row
for var in variation_columns:
    overlaps = [
        token_overlap_ratio(original, variant)
        for original, variant in zip(df['Explanations'], df[var])
    ]
    df[f'{var}_overlap_with_original'] = overlaps

# Calculate frequency of words in all variations for each explanation
def word_frequency_in_variations(row):
    """Count whitespace-separated words across all variations of one row.

    Reads the columns listed in the notebook-level ``variation_columns`` from
    ``row`` and skips missing values. Each remaining value is converted with
    ``str()`` before splitting, like the other token helpers in this notebook,
    so a non-string cell (e.g. a number) no longer raises a ``TypeError``.

    Returns a ``collections.Counter`` mapping word -> occurrences.
    """
    word_counts = Counter()
    for var in variation_columns:
        value = row[var]
        if pd.notna(value):
            word_counts.update(str(value).split())
    return word_counts

# Word frequencies over all variations of each row
df['variation_word_freq'] = df.apply(word_frequency_in_variations, axis=1)

# Length statistics per variation
print("=== Variation Length-based Statistics ===")
for var in variation_columns:
    lengths = df[f'{var}_length']
    print(f"Average Length of {var}: {lengths.mean()}")
    print(f"Median Length of {var}: {lengths.median()}")
    print(f"Standard Deviation of {var}: {lengths.std()}")
    print(f"Min Length of {var}: {lengths.min()}")
    print(f"Max Length of {var}: {lengths.max()}\n")

# Column means of the per-row TTR values
print("=== Token/Type Ratios for Variations ===")
for var in variation_columns:
    mean_ttr = df[f'{var}_ttr'].mean()
    print(f"{var} Token/Type Ratio: {mean_ttr}")

# Column means of the per-row lexical-diversity values
print("=== Lexical Diversity for Variations ===")
for var in variation_columns:
    mean_diversity = df[f'{var}_lexical_diversity'].mean()
    print(f"Lexical Diversity of {var}: {mean_diversity}")

# Column means of the overlap with the original explanation
print("=== Overlap with Original Explanation ===")
for var in variation_columns:
    mean_overlap = df[f'{var}_overlap_with_original'].mean()
    print(f"{var} Overlap with Original: {mean_overlap}")

# First rows of the frame including all new statistic columns
print(df.head())

=== Variation Length-based Statistics ===
Average Length of V1: 23.93069306930693
Median Length of V1: 18.0
Standard Deviation of V1: 15.898589513376697
Min Length of V1: 4
Max Length of V1: 75

Average Length of V2: 25.534653465346533
Median Length of V2: 21.0
Standard Deviation of V2: 16.15831944011235
Min Length of V2: 0
Max Length of V2: 102

Average Length of V3: 23.168316831683168
Median Length of V3: 21.0
Standard Deviation of V3: 14.890983383867365
Min Length of V3: 0
Max Length of V3: 88

Average Length of V4: 15.574257425742575
Median Length of V4: 15.0
Standard Deviation of V4: 14.588589057652879
Min Length of V4: 0
Max Length of V4: 52

Average Length of V5: 11.623762376237623
Median Length of V5: 0.0
Standard Deviation of V5: 16.66964395849445
Min Length of V5: 0
Max Length of V5: 60

Average Length of V6: 6.405940594059406
Median Length of V6: 0.0
Standard Deviation of V6: 12.960075785134737
Min Length of V6: 0
Max Length of V6: 48

Average Length of V7: 3.801980198019802

In [None]:
!pip install matplotlib seaborn wordcloud scikit-learn




In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Requires df and get_token_count from the earlier cells.

# The explanation text is stored under 'corrected' in the CSV, but an earlier
# cell may have renamed it to 'Explanations'. Resolve whichever name is present
# so this cell does not fail with a KeyError after that rename.
explanation_col = 'corrected' if 'corrected' in df.columns else 'Explanations'

# Token count of the original explanations (always stored as 'corrected_length')
df['corrected_length'] = get_token_count(df[explanation_col])

# Token count of every variation (V1, V2, ..., V9)
variation_columns = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
for var in variation_columns:
    df[f'{var}_length'] = get_token_count(df[var])

# KDE of the token lengths of the original explanation and of each variation
plt.figure(figsize=(12, 6))
for var in ['corrected'] + variation_columns:
    sns.kdeplot(df[f'{var}_length'], label=f'{var} length', fill=True)

plt.title('Distribution of Token Lengths for Explanations and Variations')
plt.xlabel('Number of Tokens')
plt.ylabel('Density')
plt.legend()
plt.show()


KeyError: 'Explanations_length'

<Figure size 1200x600 with 0 Axes>

In [None]:
# Box plot of the per-row TTR values, one box per variation column
ttr_columns = [f'{var}_ttr' for var in variation_columns]
tick_positions = range(len(variation_columns))

plt.figure(figsize=(12, 6))
sns.boxplot(data=df[ttr_columns], showmeans=True)
plt.xticks(ticks=tick_positions, labels=variation_columns)
plt.xlabel('Variation')
plt.ylabel('Token-Type Ratio (TTR)')
plt.title('Token-Type Ratio (TTR) Across Variations')
plt.show()


In [None]:
# Mean of the per-row lexical-diversity column of every variation
lexical_diversities = []
for var in variation_columns:
    lexical_diversities.append(df[f'{var}_lexical_diversity'].mean())

# One bar per variation
plt.figure(figsize=(12, 6))
sns.barplot(x=variation_columns, y=lexical_diversities, palette='viridis')
plt.xlabel('Variation')
plt.ylabel('Lexical Diversity')
plt.title('Average Lexical Diversity Across Variations')
plt.show()


In [None]:
from wordcloud import WordCloud

# Concatenate the non-missing text of every variation column into one string
per_column_text = []
for var in variation_columns:
    per_column_text.append(' '.join(df[var].dropna()))
all_variations_text = ' '.join(per_column_text)

# Build the word cloud from the combined text
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(all_variations_text)

# Render it without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud for All Variations')
plt.axis('off')
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate pairwise cosine similarity between variations
def calculate_cosine_similarity(row):
    """Mean pairwise TF-IDF cosine similarity between the variations of one row.

    The non-missing values of the notebook-level ``variation_columns`` are
    vectorised with a ``TfidfVectorizer`` fitted on this row only. The mean is
    taken over distinct pairs of variations. The diagonal of the similarity
    matrix (each text compared with itself) is excluded, because including it
    inflates the score and makes it depend on how many variations a row has.

    Returns ``np.nan`` when fewer than two variations are present.
    """
    variations = [row[var] for var in variation_columns if pd.notna(row[var])]
    if len(variations) < 2:
        return np.nan

    tfidf = TfidfVectorizer().fit_transform(variations)
    similarities = cosine_similarity(tfidf)

    # The matrix is symmetric, so the strictly-upper triangle holds every
    # unordered pair exactly once.
    pair_rows, pair_cols = np.triu_indices(len(variations), k=1)
    return similarities[pair_rows, pair_cols].mean()

# Row-wise similarity score between the variations
df['cosine_similarity_variations'] = df.apply(calculate_cosine_similarity, axis=1)

# Histogram (with KDE) of the scores; rows without a score are left out
similarity_scores = df['cosine_similarity_variations'].dropna()

plt.figure(figsize=(10, 6))
sns.histplot(similarity_scores, kde=True, color='green')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.title('Distribution of Cosine Similarity Between Variations')
plt.show()


In [None]:
!pip install spacy scikit-learn



In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load dataset
df = pd.read_csv('variations.csv')

# Function to extract required features from each explanation
def extract_features(text):
    """Extract surface and syntactic features from one explanation with spaCy.

    Parameters
    ----------
    text : str or missing value
        The explanation. Missing or blank values yield all-zero features
        without calling spaCy. Other values are converted with ``str()``.

    Returns
    -------
    dict
        ``total_words``: number of alphabetic tokens.
        ``avg_sentence_length``: alphabetic tokens per sentence. This uses the
        same notion of "word" as ``total_words``; punctuation and whitespace
        tokens are not counted.
        ``nouns_count`` / ``plural_nouns_count`` / ``proper_nouns_count`` /
        ``wh_determiners_count`` / ``coordinating_conjunctions_count``:
        tokens tagged ``NN`` / ``NNS`` / ``NNP`` / ``WDT`` / ``CC``.
        ``tree_height``: largest number of ancestors of any token.
        ``tree_length``: number of children summed over all tokens.
    """
    # One template keeps the key set and key order identical for empty and
    # non-empty inputs.
    features = {
        'total_words': 0,
        'avg_sentence_length': 0,
        'nouns_count': 0,
        'plural_nouns_count': 0,
        'proper_nouns_count': 0,
        'wh_determiners_count': 0,
        'coordinating_conjunctions_count': 0,
        'tree_height': 0,
        'tree_length': 0,
    }

    # Nothing to parse: skip the spaCy call entirely.
    if pd.isna(text) or not str(text).strip():
        return features

    # Parse the text with spaCy
    doc = nlp(str(text))

    # Total number of words (alphabetic tokens only)
    features['total_words'] = sum(1 for token in doc if token.is_alpha)

    # Average sentence length in words, counted like total_words
    sentences = list(doc.sents)
    if sentences:
        words_in_sentences = sum(1 for sent in sentences for token in sent if token.is_alpha)
        features['avg_sentence_length'] = words_in_sentences / len(sentences)

    # Fine-grained tag counts
    tags = [token.tag_ for token in doc]
    features['nouns_count'] = tags.count('NN')
    features['plural_nouns_count'] = tags.count('NNS')
    features['proper_nouns_count'] = tags.count('NNP')
    features['wh_determiners_count'] = tags.count('WDT')
    features['coordinating_conjunctions_count'] = tags.count('CC')

    # Tree height: deepest token, measured by its number of ancestors
    features['tree_height'] = max(
        (sum(1 for _ in token.ancestors) for token in doc), default=0
    )

    # Tree length: children summed over every token
    features['tree_length'] = sum(sum(1 for _ in token.children) for token in doc)

    return features

# One feature dict per explanation, expanded into one column per feature
feature_records = df['corrected'].apply(extract_features)
df_features = feature_records.apply(pd.Series)

# Fit TF-IDF on the explanations; missing texts are treated as empty strings
explanation_texts = df['corrected'].fillna('')
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(explanation_texts)

# Function to calculate average TF-IDF score for each explanation
def avg_tfidf_score(text, vectorizer, tfidf_matrix, index):
    """Average TF-IDF weight of the distinct terms of one explanation.

    Parameters
    ----------
    text : str or missing value
        The explanation stored at row ``index``; missing values return 0.
    vectorizer : fitted vectorizer
        Must provide ``vocabulary_`` and ``build_analyzer()``.
    tfidf_matrix : matrix whose rows support ``.toarray()``
        TF-IDF weights, one row per explanation.
    index : int
        Row of ``tfidf_matrix`` that belongs to ``text``.

    Returns
    -------
    Mean weight over the distinct terms of ``text`` found in the vocabulary,
    or 0 when there are none.
    """
    if pd.isna(text):
        return 0

    tfidf_vector = tfidf_matrix[index].toarray().flatten()
    vocabulary = vectorizer.vocabulary_

    # Tokenise with the vectorizer's own analyzer. A plain str.split() keeps
    # capital letters and attached punctuation ("The", "pain."), which never
    # match the vocabulary keys, so those words were silently left out.
    terms = set(vectorizer.build_analyzer()(str(text)))

    tfidf_scores = [tfidf_vector[vocabulary[term]] for term in terms if term in vocabulary]
    return sum(tfidf_scores) / len(tfidf_scores) if tfidf_scores else 0

# Average TF-IDF per explanation, in row order
avg_scores = []
for idx, text in enumerate(df['corrected']):
    avg_scores.append(avg_tfidf_score(text, vectorizer, tfidf_matrix, idx))
df_features['avg_tfidf'] = avg_scores

# Append the feature columns to the original frame
df = pd.concat([df, df_features], axis=1)

# Persist the result
df.to_csv('explanation_features.csv', index=False)

# Show the first rows
print(df.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (after feature extraction); this replaces the in-memory df
# with the CSV written by the feature-extraction cell
df = pd.read_csv('explanation_features.csv')

# Set plot style
sns.set(style="whitegrid")

### 1. Total Words and Average Sentence Length Across Explanations

# Bar plot for total words: one bar per row of df (x is the row index)
plt.figure(figsize=(12, 6))
sns.barplot(x=df.index, y=df['total_words'], palette="Blues_d")
plt.title('Total Words in Each Explanation')
plt.xlabel('Explanation Index')
plt.ylabel('Total Words')
plt.show()

# Bar plot for average sentence length: one bar per row of df
plt.figure(figsize=(12, 6))
sns.barplot(x=df.index, y=df['avg_sentence_length'], palette="Greens_d")
plt.title('Average Sentence Length in Each Explanation')
plt.xlabel('Explanation Index')
plt.ylabel('Average Sentence Length (words per sentence)')
plt.show()

### 2. Distribution of Tree Height and Tree Length (Syntactic Complexity)

# Box plot for tree height
# NOTE(review): the column is passed as `data=`; if seaborn draws this box
# vertically, the values are on the y-axis while the label is set on x - confirm
plt.figure(figsize=(12, 6))
sns.boxplot(data=df['tree_height'], color='lightblue')
plt.title('Distribution of Syntactic Tree Height (Depth)')
plt.xlabel('Tree Height')
plt.show()

# Box plot for tree length (branching complexity)
# NOTE(review): same axis-label question as for the tree-height plot - confirm
plt.figure(figsize=(12, 6))
sns.boxplot(data=df['tree_length'], color='lightgreen')
plt.title('Distribution of Syntactic Tree Length (Branching Complexity)')
plt.xlabel('Tree Length (number of children)')
plt.show()

### 3. Histogram of Average TF-IDF Scores

# Histogram of average TF-IDF scores (20 bins, with KDE overlay)
plt.figure(figsize=(12, 6))
sns.histplot(df['avg_tfidf'], bins=20, kde=True, color='purple')
plt.title('Distribution of Average TF-IDF Scores')
plt.xlabel('Average TF-IDF')
plt.ylabel('Frequency')
plt.show()

### 4. Scatter Plot of Average Sentence Length vs. Average TF-IDF

# Scatter plot showing relationship between sentence length and average TF-IDF score
plt.figure(figsize=(17, 6))
sns.scatterplot(x=df['avg_sentence_length'], y=df['avg_tfidf'], s=100, color='darkred')
plt.title('Average Sentence Length vs. Average TF-IDF Score')
plt.xlabel('Average Sentence Length')
plt.ylabel('Average TF-IDF Score')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset written by the feature-extraction cell (replaces the in-memory df)
df = pd.read_csv('explanation_features.csv')

# Set plot style
sns.set(style="whitegrid", palette="muted")

### 1. Total Words and Average Sentence Length (Improved with Swarmplot)

# Swarm plot for total words with mean and standard deviation.
# x is the row index, so every explanation gets its own position on the x-axis.
plt.figure(figsize=(12, 6))
sns.swarmplot(x=df.index, y=df['total_words'], color='blue', size=5, alpha=0.7)
mean_total_words = df['total_words'].mean()
std_total_words = df['total_words'].std()
# Dashed line at the mean, shaded band from mean - std to mean + std
plt.axhline(mean_total_words, color='red', linestyle='--', label=f'Mean: {mean_total_words:.2f}')
plt.fill_between(df.index, mean_total_words - std_total_words, mean_total_words + std_total_words, color='red', alpha=0.1, label=f'Standard Deviation: {std_total_words:.2f}')
plt.title('Total Words in Each Explanation (with Mean and Std Dev)')
plt.xlabel('Explanation Index')
plt.ylabel('Total Words')
plt.legend()
plt.show()

# Swarm plot for average sentence length with summary statistics
plt.figure(figsize=(12, 6))
sns.swarmplot(x=df.index, y=df['avg_sentence_length'], color='green', size=5, alpha=0.7)
mean_avg_sentence_length = df['avg_sentence_length'].mean()
std_avg_sentence_length = df['avg_sentence_length'].std()
# Dashed line at the mean, shaded band from mean - std to mean + std
plt.axhline(mean_avg_sentence_length, color='red', linestyle='--', label=f'Mean: {mean_avg_sentence_length:.2f}')
plt.fill_between(df.index, mean_avg_sentence_length - std_avg_sentence_length, mean_avg_sentence_length + std_avg_sentence_length, color='red', alpha=0.1, label=f'Standard Deviation: {std_avg_sentence_length:.2f}')
plt.title('Average Sentence Length in Each Explanation (with Mean and Std Dev)')
plt.xlabel('Explanation Index')
plt.ylabel('Average Sentence Length (words per sentence)')
plt.legend()
plt.show()

### 2. Improved Tree Height and Tree Length Visualization (Violin Plot)

# Violin plot for tree height (with KDE to show distribution shape).
# The values run along the x-axis, so mean and median are vertical lines.
plt.figure(figsize=(12, 6))
sns.violinplot(x='tree_height', data=df, inner='quartile', color='skyblue')
mean_tree_height = df['tree_height'].mean()
median_tree_height = df['tree_height'].median()
plt.axvline(mean_tree_height, color='red', linestyle='--', label=f'Mean: {mean_tree_height:.2f}')
plt.axvline(median_tree_height, color='green', linestyle='-', label=f'Median: {median_tree_height:.2f}')
plt.title('Distribution of Syntactic Tree Height (with Mean and Median)')
plt.xlabel('Tree Height (Depth)')
plt.ylabel('Density')
plt.legend()
plt.show()

# Violin plot for tree length (the 'tree_length' column of the features CSV)
plt.figure(figsize=(12, 6))
sns.violinplot(x='tree_length', data=df, inner='quartile', color='lightgreen')
mean_tree_length = df['tree_length'].mean()
median_tree_length = df['tree_length'].median()
plt.axvline(mean_tree_length, color='red', linestyle='--', label=f'Mean: {mean_tree_length:.2f}')
plt.axvline(median_tree_length, color='green', linestyle='-', label=f'Median: {median_tree_length:.2f}')
plt.title('Distribution of Syntactic Tree Length (with Mean and Median)')
plt.xlabel('Tree Length (Number of Syntactic Children)')
plt.ylabel('Density')
plt.legend()
plt.show()

### 3. Improved Histogram for Average TF-IDF Scores

# Histogram with KDE for average TF-IDF scores (with summary statistics)
plt.figure(figsize=(12, 6))
sns.histplot(df['avg_tfidf'], bins=20, kde=True, color='purple', alpha=0.7)
mean_tfidf = df['avg_tfidf'].mean()
median_tfidf = df['avg_tfidf'].median()
plt.axvline(mean_tfidf, color='red', linestyle='--', label=f'Mean: {mean_tfidf:.2f}')
plt.axvline(median_tfidf, color='green', linestyle='-', label=f'Median: {median_tfidf:.2f}')
plt.title('Distribution of Average TF-IDF Scores (with Mean and Median)')
plt.xlabel('Average TF-IDF')
plt.ylabel('Frequency')
plt.legend()
plt.show()

### 4. Scatter Plot of Sentence Length vs TF-IDF (with Regression Line)

# Scatter plot with regression line (seaborn regplot)
plt.figure(figsize=(12, 6))
sns.regplot(x=df['avg_sentence_length'], y=df['avg_tfidf'], scatter_kws={'color': 'darkblue', 's': 50}, line_kws={'color': 'red'})
plt.title('Average Sentence Length vs. Average TF-IDF Score (with Regression Line)')
plt.xlabel('Average Sentence Length (words per sentence)')
plt.ylabel('Average TF-IDF Score')
plt.show()


In [None]:
!pip install krippendorff

In [None]:
import pandas as pd

import krippendorff


# Data1 = pd.read_csv('informativeness.csv')
# Data2 = pd.read_csv('clarity.csv')
# Data3 = pd.read_csv('effectiveness.csv')

# Rating tables, in the order Data1 (informativeness), Data2 (clarity),
# Data3 (effectiveness)
rating_files = (
    'informativeness_clean.csv',
    'clarity_clean.csv',
    'effectiveness_clean.csv',
)

# Data1 = Data1.dropna(axis = 0, how = 'all')
# Data2 = Data2.dropna(axis = 0, how = 'all')
# Data3 = Data3.dropna(axis = 0, how = 'all')

# Quantile bounds for the per-column trimming below
low = .05
high = .95

prepared = []
for path in rating_files:
    ratings = pd.read_csv(path)

    # Drop rows whose values have zero standard deviation across the columns
    ratings = ratings[ratings.std(axis=1) != 0]

    # Per column, keep only values strictly between its 5% and 95% quantiles
    quant_df = ratings.quantile([low, high])
    ratings = ratings.apply(
        lambda x: x[(x > quant_df.loc[low, x.name]) & (x < quant_df.loc[high, x.name])],
        axis=0,
    )
    prepared.append(ratings)

Data1, Data2, Data3 = prepared

# Krippendorff's alpha of each filtered table
for ratings in prepared:
    print(krippendorff.alpha(reliability_data=ratings))



In [None]:
import pandas as pd
import krippendorff
import matplotlib.pyplot as plt
import seaborn as sns

# Load the three rating tables
Data1, Data2, Data3 = (
    pd.read_csv(path)
    for path in ('informativeness_clean.csv', 'clarity_clean.csv', 'effectiveness_clean.csv')
)

# Keep only rows whose ratings are not all identical (row-wise std != 0)
Data1 = Data1.loc[Data1.std(axis=1) != 0]
Data2 = Data2.loc[Data2.std(axis=1) != 0]
Data3 = Data3.loc[Data3.std(axis=1) != 0]

# Quantile bounds used by filter_quantiles (bottom 5% / top 5% are cut off)
low, high = 0.05, 0.95

# Function to filter data based on quantiles
def filter_quantiles(df, low=0.05, high=0.95):
    """Keep, column by column, only the values strictly inside a quantile band.

    The quantile levels are explicit parameters instead of being read from
    cell-level globals, so the function works on its own and has the same
    signature as the other ``filter_quantiles`` definitions in this notebook.
    The defaults equal the ``low``/``high`` constants set in this cell; pass
    other levels explicitly to override them.

    Args:
        df: DataFrame whose columns are filtered independently of each other.
        low: Lower quantile level handed to ``DataFrame.quantile``.
        high: Upper quantile level handed to ``DataFrame.quantile``.

    Returns:
        The result of ``df.apply`` over the columns, each column reduced to the
        entries ``v`` with ``q_low < v < q_high`` for that column's quantiles.

    Note:
        Both comparisons are strict, so entries equal to a bound are removed as
        well. NOTE(review): on a discrete rating scale this can discard more
        than the nominal tail share -- confirm that this is intended.
    """
    bounds = df.quantile([low, high])

    def _inside_band(column):
        # Look up this column's own lower/upper bound by its name.
        floor = bounds.loc[low, column.name]
        ceiling = bounds.loc[high, column.name]
        return column[(column > floor) & (column < ceiling)]

    return df.apply(_inside_band, axis=0)

# Apply the quantile filter to every ratings table.
Data1, Data2, Data3 = (
    filter_quantiles(table) for table in (Data1, Data2, Data3)
)

# Krippendorff's alpha per criterion, computed on the raw value matrices.
# NOTE(review): krippendorff.alpha reads rows as raters and columns as units --
# confirm the CSV layout matches.
alpha_informativeness, alpha_clarity, alpha_effectiveness = (
    krippendorff.alpha(reliability_data=table.to_numpy())
    for table in (Data1, Data2, Data3)
)

# Report the three coefficients.
print(f"Krippendorff's alpha (Informativeness): {alpha_informativeness}")
print(f"Krippendorff's alpha (Clarity): {alpha_clarity}")
print(f"Krippendorff's alpha (Effectiveness): {alpha_effectiveness}")

# One histogram per criterion over all remaining (stacked) ratings.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
histogram_panels = (
    (Data1, "blue", 'Informativeness Ratings Distribution'),
    (Data2, "green", 'Clarity Ratings Distribution'),
    (Data3, "red", 'Effectiveness Ratings Distribution'),
)
for ax, (table, colour, heading) in zip(axes, histogram_panels):
    sns.histplot(table.stack(), ax=ax, color=colour, bins=20)
    ax.set_title(heading)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Function to filter data based on quantiles
def filter_quantiles(df, low=0.05, high=0.95):
    """Keep, column by column, only the values strictly inside a quantile band.

    Args:
        df: DataFrame whose columns are filtered independently of each other.
        low: Lower quantile level handed to ``DataFrame.quantile``.
        high: Upper quantile level handed to ``DataFrame.quantile``.

    Returns:
        The result of ``df.apply`` over the columns, each column reduced to the
        entries ``v`` with ``q_low < v < q_high`` for that column's quantiles.

    Note:
        Both comparisons are strict, so entries equal to a bound are removed as
        well.
    """
    bounds = df.quantile([low, high])

    def _inside_band(column):
        # Look up this column's own lower/upper bound by its name.
        floor = bounds.loc[low, column.name]
        ceiling = bounds.loc[high, column.name]
        return column[(column > floor) & (column < ceiling)]

    return df.apply(_inside_band, axis=0)

# Function to create plots for the data before and after outlier removal
def plot_distribution(data, filtered_data, title):
    """Show side-by-side histograms of the ratings before and after filtering.

    Args:
        data: Unfiltered ratings DataFrame; its stacked values fill the left panel.
        filtered_data: Filtered ratings DataFrame; its stacked values fill the
            right panel.
        title: Label inserted at the start of both panel titles.
    """
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: every rating of the original table.
    sns.histplot(data.stack(), ax=ax_before, bins=20, color='blue')
    ax_before.set_title(f'{title} Ratings (Before Outlier Removal)')

    # Right panel: the ratings that remain after filtering.
    sns.histplot(filtered_data.stack(), ax=ax_after, bins=20, color='green')
    ax_after.set_title(f'{title} Ratings (After Outlier Removal)')

    plt.tight_layout()
    plt.show()

# Function to display statistical summary before and after outlier removal
def display_stats(data, filtered_data, name):
    """Print ``describe()`` summaries of the ratings before and after filtering.

    Args:
        data: Unfiltered ratings DataFrame.
        filtered_data: Filtered ratings DataFrame.
        name: Label placed at the start of both section headings.
    """
    sections = (
        (f"\n{name} - Before Outlier Removal", data),
        (f"\n{name} - After Outlier Removal", filtered_data),
    )
    for heading, table in sections:
        print(heading)
        print(table.describe())

# Real inputs, currently disabled -- enable these to analyse the actual CSVs:
# Data1 = pd.read_csv('informativeness_clean.csv')
# Data2 = pd.read_csv('clarity_clean.csv')
# Data3 = pd.read_csv('effectiveness_clean.csv')

# Demonstration only: seeded random integers in [1, 6] stand in for real
# ratings (100 rows, one column per rater).
import numpy as np
np.random.seed(42)
rater_columns = ['Rater1', 'Rater2', 'Rater3', 'Rater4', 'Rater5']
Data1 = pd.DataFrame(np.random.randint(1, 7, size=(100, 5)), columns=rater_columns)
Data2 = pd.DataFrame(np.random.randint(1, 7, size=(100, 5)), columns=rater_columns)
Data3 = pd.DataFrame(np.random.randint(1, 7, size=(100, 5)), columns=rater_columns)

# Quantile-filter each table with the default 5% / 95% levels.
Data1_filtered = filter_quantiles(Data1)
Data2_filtered = filter_quantiles(Data2)
Data3_filtered = filter_quantiles(Data3)

before_after = (
    (Data1, Data1_filtered, "Informativeness"),
    (Data2, Data2_filtered, "Clarity"),
    (Data3, Data3_filtered, "Effectiveness"),
)

# First all before/after histograms ...
for raw_table, filtered_table, label in before_after:
    plot_distribution(raw_table, filtered_table, label)

# ... then all before/after summary statistics.
for raw_table, filtered_table, label in before_after:
    display_stats(raw_table, filtered_table, label)


In [None]:
import pandas as pd
import krippendorff

# Function to filter data based on quantiles
def filter_quantiles(df, low=0.05, high=0.95):
    """Keep, column by column, only the values strictly inside a quantile band.

    Args:
        df: DataFrame whose columns are filtered independently of each other.
        low: Lower quantile level handed to ``DataFrame.quantile``.
        high: Upper quantile level handed to ``DataFrame.quantile``.

    Returns:
        The result of ``df.apply`` over the columns, each column reduced to the
        entries ``v`` with ``q_low < v < q_high`` for that column's quantiles.

    Note:
        Both comparisons are strict, so entries equal to a bound are removed as
        well.
    """
    bounds = df.quantile([low, high])

    def _inside_band(column):
        # Look up this column's own lower/upper bound by its name.
        floor = bounds.loc[low, column.name]
        ceiling = bounds.loc[high, column.name]
        return column[(column > floor) & (column < ceiling)]

    return df.apply(_inside_band, axis=0)

# Function to calculate Krippendorff's alpha
def calculate_krippendorff_alpha(data, name):
    """Compute Krippendorff's alpha for a ratings table and print it.

    Args:
        data: Ratings DataFrame; its ``.values`` matrix is handed to
            ``krippendorff.alpha`` as ``reliability_data``.
        name: Label shown in the printed line.

    NOTE(review): krippendorff.alpha reads rows as raters and columns as
    units -- confirm that the tables passed in are laid out that way.
    """
    ratings_matrix = data.values
    score = krippendorff.alpha(reliability_data=ratings_matrix)
    print(f"Krippendorff's alpha for {name}: {score}")

# Read the three cleaned rating tables, one CSV per criterion.
Data1, Data2, Data3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'informativeness_clean.csv',
        'clarity_clean.csv',
        'effectiveness_clean.csv',
    )
)

# Quantile-filter each table with the default 5% / 95% levels.
Data1_filtered = filter_quantiles(Data1)
Data2_filtered = filter_quantiles(Data2)
Data3_filtered = filter_quantiles(Data3)

# Agreement on the tables as loaded ...
print("Krippendorff's Alpha (Before Outlier Removal):")
for table, label in (
    (Data1, "Informativeness (Before)"),
    (Data2, "Clarity (Before)"),
    (Data3, "Effectiveness (Before)"),
):
    calculate_krippendorff_alpha(table, label)

# ... and on the quantile-filtered tables.
print("\nKrippendorff's Alpha (After Outlier Removal):")
for table, label in (
    (Data1_filtered, "Informativeness (After)"),
    (Data2_filtered, "Clarity (After)"),
    (Data3_filtered, "Effectiveness (After)"),
):
    calculate_krippendorff_alpha(table, label)


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Real input, currently disabled. The table is expected to provide the columns
# 'informativeness', 'clarity' and 'effectiveness'.
# df = pd.read_csv('evaluation_data.csv')

# Demonstration only: seeded random integers in [1, 6] stand in for real scores.
np.random.seed(42)
data = {
    criterion: np.random.randint(1, 7, 100)
    for criterion in ('informativeness', 'clarity', 'effectiveness')
}
df = pd.DataFrame(data)

# Pairwise Pearson correlation between the three criteria.
pearson_corr = df.corr(method='pearson')
print("Pearson Correlation Matrix:")
print(pearson_corr)

# Pairwise Spearman (rank) correlation between the three criteria.
spearman_corr = df.corr(method='spearman')
print("\nSpearman Correlation Matrix:")
print(spearman_corr)

# Two annotated heatmaps side by side, both on a fixed [-1, 1] colour scale.
plt.figure(figsize=(12, 5))
heatmap_panels = (
    (1, pearson_corr, 'Pearson Correlation'),
    (2, spearman_corr, 'Spearman Correlation'),
)
for position, matrix, heading in heatmap_panels:
    plt.subplot(1, 2, position)
    sns.heatmap(matrix, annot=True, cmap="coolwarm", vmin=-1, vmax=1)
    plt.title(heading)

plt.tight_layout()
plt.show()
