# Comparing and combining the data sets

In [1]:
import pandas as pd

from asag2024.configuration import * 
from asag2024.benchmark_pipeline.combine_pipeline import load_combined_asag2024
import altair as alt
# from data_analysis.altair_plots import altair_boxplot
# import nltk
from asag2024.configuration import data_source_colors, boxplot_height, boxplot_size, boxplot_width, FIGURES_PATH, altair_theme

# nltk.download('punkt')
vega_fusion = alt.data_transformers.enable("vegafusion")

alt.themes.register("main_theme", altair_theme)
alt.themes.enable("main_theme")

  from .autonotebook import tqdm as notebook_tqdm


ThemeRegistry.enable('main_theme')

In [2]:
%%html
<style>
@import url("https://fonts.googleapis.com/css2?family=JetBrains+Mono:ital,wght@0,100..800;1,100..800&display=swap")
</style>

## Combining the datasets

In [3]:
df_combined = load_combined_asag2024().set_index("index")

In [4]:
df_combined

Unnamed: 0_level_0,question,provided_answer,reference_answer,grade,data_source,normalized_grade,split,question_id,weight,grade_range
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Explain why you got a voltage reading of 0 for...,terminal 1 is connected to terminal 2,Terminals 1 and 2 are connected,3.0,Beetle,1.000000,,,0.000150,0.9-1.0
1,Explain why you got a voltage reading of 0 for...,they are connected,Terminals 1 and 2 are connected,3.0,Beetle,1.000000,,,0.000150,0.9-1.0
2,Explain why you got a voltage reading of 0 for...,Because terminal 1 and 2 are connected to the ...,Terminals 1 and 2 are connected,2.0,Beetle,0.666667,,,0.000272,0.6-0.7
3,Explain why you got a voltage reading of 0 for...,terminal 1 and 2 have the same difference in e...,Terminals 1 and 2 are connected,1.0,Beetle,0.333333,,,0.000238,0.3-0.4
4,Explain why you got a voltage reading of 0 for...,the voltage was 0 because they were on the sam...,Terminals 1 and 2 are connected,2.0,Beetle,0.666667,,,0.000272,0.6-0.7
...,...,...,...,...,...,...,...,...,...,...
641,,,Bias-variance dilemma is a principle supervise...,0.0,DigiKlausur,0.000000,,17.0,0.003546,0.0-0.1
642,,Bias is an proides an affine transformation. a...,Bias-variance dilemma is a principle supervise...,0.0,DigiKlausur,0.000000,,17.0,0.003546,0.0-0.1
643,,High bias and variance is desirable in input. ...,Bias-variance dilemma is a principle supervise...,1.0,DigiKlausur,0.500000,,17.0,0.001522,0.5-0.6
644,,Bias: Bias means how much the prediction diffe...,Bias-variance dilemma is a principle supervise...,1.0,DigiKlausur,0.500000,,17.0,0.001522,0.5-0.6


In [5]:
df_combined.drop(columns=["split", "grade_range", "question_id"]).to_parquet(HUGGINGFACE_OUTPUT_PATH)

# Visualisations

Now let's have a look at the data. For convenience, let's always use the same colors for the data sources:

In [14]:
df_combined['data_source'] = pd.Categorical(df_combined.data_source)

number_of_entries = alt.Chart(df_combined).mark_bar().encode(
    alt.X('count()').title("Number of entries"),
    alt.Y("data_source").title("").sort("x"),
    color=alt.Color("data_source", legend=None).title("Data Source")
)

count_text = number_of_entries.mark_text(
    align="center",
    baseline="middle",
    fontWeight="bold",
    dx=16,
    fill="black",
).encode(text="count()")

number_of_entries_chart = (number_of_entries + count_text).configure_range(
    category=alt.RangeScheme(data_source_colors)
).properties(
    width=boxplot_width,
    height=boxplot_height
)

number_of_entries_chart.save(FIGURES_PATH.joinpath("dataset_number_of_entries.png"), ppi=400)
number_of_entries_chart

To verify that the normalizing is actually working as intended, we can compare the original distribution of a single data set with the normalized distribution. 

In [5]:
filtered_df = df_combined.loc[df_combined['data_source'] == "CU-NLP"]

In [6]:
alt.Chart(filtered_df).mark_bar().encode(
    alt.X("grade").bin(),
    y='count()',
)

In [7]:
alt.Chart(filtered_df).mark_bar().encode(
    alt.X("normalized_grade").bin(),
    y='count()',
)

That seems to be working well!  

## Grade Distributions


In [8]:
alt.Chart(df_combined).mark_bar().encode(
    alt.X("normalized_grade").title("Grade on a scale from 0 to 1").bin(maxbins=5),
    alt.Y('count()').title("Number of Records"),
    color=alt.Color('data_source:N').title("Data Source"),
).properties(
    width=500,
    height=300
).configure_range(
    category=alt.RangeScheme(data_source_colors)
)

In [9]:
grade_distribution_chart = altair_boxplot(
    df_combined,
    y_variable="normalized_grade:Q",
    x_variable="data_source:N",
    y_name="Normalized Grade",
    width=boxplot_width,
    height=boxplot_height,
    boxplot_size=30,
    title="",
).configure_range(
    category=alt.RangeScheme(data_source_colors),
)
grade_distribution_chart.save(FIGURES_PATH.joinpath("grade_distribution.png"), ppi=400)
grade_distribution_chart

## Statistics about the combined dataset

First we can have a look at the missing values. 
We'll see that not all data points have a question present.

In [10]:
df_combined.info()

df_combined.nunique()

check_missing = df_combined.loc[pd.isnull(df_combined['question'])]
check_missing.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18988 entries, 0 to 18987
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   index             18988 non-null  int64   
 1   question          17838 non-null  object  
 2   provided_answer   18949 non-null  object  
 3   reference_answer  18988 non-null  object  
 4   grade             18988 non-null  float64 
 5   data_source       18988 non-null  category
 6   normalized_grade  18988 non-null  float64 
 7   split             13267 non-null  object  
 8   question_id       979 non-null    float64 
dtypes: category(1), float64(3), int64(1), object(4)
memory usage: 1.2+ MB


Unnamed: 0,index,question,provided_answer,reference_answer,grade,data_source,normalized_grade,split,question_id
7034,0,,"If the question is IR-Based, I will do the fol...",Using semantic analysis methods. We can use ve...,25.0,CU-NLP,0.25,,
7035,1,,"If the question is IR-Based, I will do the fol...","we should use vector space model. firstly, bin...",25.0,CU-NLP,0.25,,
7036,2,,"If the question is IR-Based, I will do the fol...","Assume we have a question like "" Why the sky i...",50.0,CU-NLP,0.5,,
7037,3,,"If the question is IR-Based, I will do the fol...",We can use Vector Space Model that is represen...,25.0,CU-NLP,0.25,,
7038,4,,"If the question is IR-Based, I will do the fol...",In this corpus we will use sentence segmenting...,50.0,CU-NLP,0.5,,


### Character, Word and Sentence Count


In [11]:
calculate_word_count = lambda answer: len(nltk.word_tokenize(answer) if not pd.isna(answer) else [])
calculate_sentence_count = lambda answer: len(nltk.sent_tokenize(answer) if not pd.isna(answer) else [])

df_combined['sentence_count_provided_answer'] = df_combined['provided_answer'].apply(calculate_sentence_count)
df_combined['word_count_provided_answer'] = df_combined['provided_answer'].apply(calculate_word_count)

df_combined['sentence_count_reference_answer'] = df_combined['reference_answer'].apply(calculate_sentence_count)
df_combined['word_count_reference_answer'] = df_combined['reference_answer'].apply(calculate_word_count)

In [12]:
word_count_provided_answer_chart = altair_boxplot(
    df_combined,
    "word_count_provided_answer",
    "data_source:N",
    "Provided Answer",
    y_name="Number of Words",
    width=boxplot_width,
    height=boxplot_height,
    boxplot_size=boxplot_size
) 

word_count_reference_answer_chart = altair_boxplot(
    df_combined,
    "word_count_reference_answer",
    "data_source:N",
    "Reference Answer",
    domainMax=700,
    width=boxplot_width,
    height=boxplot_height,
    boxplot_size=boxplot_size
)

word_count_charts = (word_count_provided_answer_chart | word_count_reference_answer_chart)

In [13]:
sentence_count_provided_answer_chart = altair_boxplot(
    df_combined,
    "sentence_count_provided_answer",
    "data_source:N",
    y_name="Number of Sentences",
    x_name="Data Source",
    title="Provided Answer",
    width=boxplot_width,
    height=boxplot_height,
    boxplot_size=boxplot_size,
)

sentence_count_reference_answer_chart = altair_boxplot(
    df_combined,
    "sentence_count_reference_answer",
    "data_source:N",
    domainMax=35,
    x_name="Data Source",
    title="Reference Answer",
    width=boxplot_width,
    height=boxplot_height,
    boxplot_size=boxplot_size,
)

sentence_count_charts = (sentence_count_provided_answer_chart | sentence_count_reference_answer_chart)

TypeError: altair_boxplot() got an unexpected keyword argument 'x_name'

In [None]:
chart = alt.vconcat(word_count_charts, sentence_count_charts).configure_range(
    category=alt.RangeScheme(data_source_colors),
)

# chart.save(FIGURES_PATH.joinpatch("")

chart

## Manually checking the questions

Since there are only 88 unique questions, we can manually check their quality. 

In [None]:
# unique_questions = df_combined.groupby('question')

# unique_questions = df_combined[["question", "data_source"]].groupby(['question', 'data_source'], as_index=False, observed=True).first()
# unique_questions.to_csv("unique_questions.csv")