# Comparing and combining the datasets

In [1]:
import pandas as pd

from combine import combine_datasets, normalize_grade, load_datasets
import altair as alt
# Switch Altair's active data transformer to "vegafusion" for every chart below.
# NOTE(review): presumably needed because the combined frame is larger than
# Altair's default row limit for embedded data -- confirm.
vega_fusion = alt.data_transformers.enable("vegafusion")

## Combining the datasets

In [3]:
# Columns removed from the combined frame before it is used below.
DROPPED_COLUMNS = [
    "reference_answer1",
    "reference_answer2",
    "file_name",
    "accuracy",
    "assignment",
    "student_id",
]

# Apply `normalize_grade` to every loaded dataset.
# NOTE(review): presumably this is what adds the `normalized_grade` column
# shown in the output -- confirm in combine.py.
datasets = [normalize_grade(dataset) for dataset in load_datasets()]

# Use the non-mutating form of `drop` rather than `inplace=True`: the cell
# builds `df_combined` in a single expression and never modifies the object
# returned by `combine_datasets` behind its back.
df_combined = combine_datasets(datasets).drop(columns=DROPPED_COLUMNS)
df_combined.head()

Unnamed: 0,question,provided_answer,reference_answer,grade,data_source,normalized_grade,id,answer_feedback,verification_feedback,split
0,Explain why circuit 4 is not a short circuit.,the battery is not contained in a closed path,Circuit 4 has no closed paths,3.0,Beetle,1.0,,,,
1,Explain why circuit 4 is not a short circuit.,the battery is not contained in a closed path,Circuit 4 has no closed paths,3.0,Beetle,1.0,,,,
2,Explain why circuit 4 is not a short circuit.,Because it is in an incomplete path,Circuit 4 has no closed paths,1.0,Beetle,0.333333,,,,
3,Explain why circuit 4 is not a short circuit.,The path does not close,Circuit 4 has no closed paths,3.0,Beetle,1.0,,,,
4,Explain why circuit 4 is not a short circuit.,The battery is not in a closed path.,Circuit 4 has no closed paths,3.0,Beetle,1.0,,,,


In [4]:
# Store the data source as a categorical column.
df_combined["data_source"] = pd.Categorical(df_combined["data_source"])

# Bar chart with one bar per data source; bar height is the record count.
data_source_chart = alt.Chart(df_combined).mark_bar().encode(
    x=alt.X("data_source"),
    y=alt.Y("count()"),
)
data_source_chart

To verify that the normalization works as intended, we can compare the original grade distribution of a single dataset with its normalized distribution.

In [5]:
# Keep only the rows whose data source is "CU-NLP".
is_cu_nlp = df_combined["data_source"] == "CU-NLP"
filtered_df = df_combined.loc[is_cu_nlp]

In [6]:
# Histogram of the original (un-normalized) grades in the filtered subset.
grade_bins = alt.X("grade").bin()
record_count = alt.Y("count()")

alt.Chart(filtered_df).mark_bar().encode(x=grade_bins, y=record_count)

In [7]:
# Histogram of the normalized grades in the same filtered subset, for
# comparison with the histogram of the original grades.
normalized_grade_bins = alt.X("normalized_grade").bin()
record_count = alt.Y("count()")

alt.Chart(filtered_df).mark_bar().encode(x=normalized_grade_bins, y=record_count)

That seems to be working well!  
Next, we can look at the distribution of normalized grades across all the data:

In [8]:
# Binned normalized grades over the whole combined frame, with bars coloured
# by data source.
grade_axis = alt.X("normalized_grade").title("Grade on a scale from 0 to 1").bin()
count_axis = alt.Y("count()").title("Number of Records")
source_color = alt.Color("data_source:N").title("Data Source")

alt.Chart(df_combined).mark_bar().encode(
    grade_axis,
    count_axis,
    color=source_color,
).properties(width=500, height=300)

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the combined
# frame; the output covers the `grade` and `normalized_grade` columns.
df_combined.describe()

Unnamed: 0,grade,normalized_grade
count,7723.0,7723.0
mean,2.370238,0.526783
std,5.43211,0.342645
min,0.0,0.0
25%,1.0,0.285714
50%,1.0,0.333333
75%,3.0,1.0
max,100.0,1.0


In [10]:
# Per-column dtype and non-null count, to spot columns with missing values.
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7723 entries, 0 to 170
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   question               7552 non-null   object  
 1   provided_answer        7723 non-null   object  
 2   reference_answer       7723 non-null   object  
 3   grade                  7723 non-null   float64 
 4   data_source            7723 non-null   category
 5   normalized_grade       7723 non-null   float64 
 6   id                     2981 non-null   object  
 7   answer_feedback        2981 non-null   object  
 8   verification_feedback  2981 non-null   object  
 9   split                  2981 non-null   object  
dtypes: category(1), float64(2), object(7)
memory usage: 611.1+ KB


In [13]:
# Inspect the rows that have no question text.
question_is_missing = df_combined["question"].isna()
check_missing = df_combined.loc[question_is_missing]
check_missing.head()

Unnamed: 0,question,provided_answer,reference_answer,grade,data_source,normalized_grade,id,answer_feedback,verification_feedback,split
0,,"If the question is IR-Based, I will do the fol...",Using semantic analysis methods. We can use ve...,25.0,CU-NLP,0.25,,,,
1,,"If the question is IR-Based, I will do the fol...","we should use vector space model. firstly, bin...",25.0,CU-NLP,0.25,,,,
2,,"If the question is IR-Based, I will do the fol...","Assume we have a question like "" Why the sky i...",50.0,CU-NLP,0.5,,,,
3,,"If the question is IR-Based, I will do the fol...",We can use Vector Space Model that is represen...,25.0,CU-NLP,0.25,,,,
4,,"If the question is IR-Based, I will do the fol...",In this corpus we will use sentence segmenting...,50.0,CU-NLP,0.5,,,,
