# JSON Question Data NaN Values Analysis

This notebook loads the generated-question JSON file, flattens each record into one table row, and reports how many values are missing (NaN) in each column.

In [19]:
# Import required libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
# Load the JSON file
# Point QUESTIONS_PATH at the dataset to analyze. The unfiltered variant of
# this dataset is 'test_generated_questions.json'.
QUESTIONS_PATH = 'test_generated_questions-filtered.json'
with open(QUESTIONS_PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Detailed data inspection
print("Total number of items:", len(data))
if data:
    print("\nStructure of first item:")
    print(json.dumps(data[0], indent=2))
else:
    # Nothing to preview: report it instead of failing with IndexError on data[0]
    print("\nNo items to preview: the dataset is empty.")

Total number of items: 1576

Structure of first item:
{
  "folder": "IC-353",
  "content": "{{knowledge objective\n|Identifiant=OIC-353-08-A\n|Item_parent=Suicidal risk and behaviour in children, adolescents and adults: identification and management\n|Item_parent_short=Suicidal risk and behaviour in children, adolescents and adults: identification and management\n|Rank=A\n|Title=Knowing what to do in an emergency\n|Description=None\n|Rubric=Management\n|Contributors=Pierre Vandel\n|Order=8}}\nFor people in a suicidal crisis or in the immediate aftermath of a suicide attack.\n\nA quiet welcome.\n\nKeeping the patient safe.\n\nA doctor must decide whether to admit the patient to hospital.\n\n'''Systematic psychiatric opinion:'''\n\nOnce her non-psychiatric clinical condition has stabilised\n\nPurpose of the psychiatric interview :\n\n* Assess the potential for suicidal crisis (risk/urgency/dangerousness), evaluate associated psychiatric disorders and guide treatment.\n* Decide on the ind

In [26]:
# Flatten the nested JSON structure
def flatten_json_data(data):
    """
    Flatten nested question records into a one-row-per-item DataFrame.

    Each item is read with ``.get``: 'folder' and 'content' come from the item
    itself, while 'question', 'option_a' .. 'option_d' and 'correct_option'
    come from the nested 'question' dict. Any key that is absent becomes NaN.
    An item whose 'question' entry is missing, null, or not a dict yields NaN
    for every question field instead of raising AttributeError.

    Parameters:
    data (list): List of nested JSON objects (dicts)

    Returns:
    pandas.DataFrame: Flattened DataFrame; the eight columns are always
    present and in a fixed order, even when ``data`` is empty
    """
    top_level_fields = ('folder', 'content')
    question_fields = ('question', 'option_a', 'option_b', 'option_c',
                       'option_d', 'correct_option')

    flattened_data = []
    for item in data:
        question_data = item.get('question')
        if not isinstance(question_data, dict):
            # JSON null arrives as None (and a malformed payload could be any
            # other type); treat both as "no question data available".
            question_data = {}

        flattened_item = {
            field: item.get(field, np.nan) for field in top_level_fields
        }
        flattened_item.update(
            (field, question_data.get(field, np.nan)) for field in question_fields
        )
        flattened_data.append(flattened_item)

    # Pin the columns so an empty input still produces the expected schema.
    return pd.DataFrame(
        flattened_data, columns=[*top_level_fields, *question_fields]
    )

# Build the flat table from the loaded records
df_flat = flatten_json_data(data)

# Show which columns the flattened table exposes
print("Flattened DataFrame Columns:", df_flat.columns, sep="\n")

Flattened DataFrame Columns:
Index(['folder', 'content', 'question', 'option_a', 'option_b', 'option_c',
       'option_d', 'correct_option'],
      dtype='object')


In [27]:
# NaN Analysis Function
def analyze_nan_values(dataframe):
    """
    Summarize missing values per column of a DataFrame.

    Parameters:
    dataframe (pandas.DataFrame): Input dataframe

    Returns:
    pandas.DataFrame: One row per input column with the columns
    'Total_Rows', 'NaN_Count', 'NaN_Percentage' (rounded to 2 decimals)
    and 'Data_Type', sorted by 'NaN_Percentage' in descending order
    """
    n_rows = len(dataframe)

    # Per-column count of missing cells; reused for both count and percentage
    missing_per_column = dataframe.isna().sum()
    missing_share = (missing_per_column / n_rows * 100).round(2)

    summary = pd.DataFrame({
        'Total_Rows': n_rows,
        'NaN_Count': missing_per_column,
        'NaN_Percentage': missing_share,
        'Data_Type': dataframe.dtypes
    })

    # Worst-affected columns first
    return summary.sort_values('NaN_Percentage', ascending=False)

# Summarize missing values for the flattened table and show the result
nan_analysis = analyze_nan_values(df_flat)
print("NaN Values Analysis:", nan_analysis, sep="\n")

NaN Values Analysis:
                Total_Rows  NaN_Count  NaN_Percentage Data_Type
folder                1576          0             0.0    object
content               1576          0             0.0    object
question              1576          0             0.0    object
option_a              1576          0             0.0    object
option_b              1576          0             0.0    object
option_c              1576          0             0.0    object
option_d              1576          0             0.0    object
correct_option        1576          0             0.0    object


In [None]:
# Visualize NaN distribution
# One heatmap cell per DataFrame cell, colored by whether the value is missing
fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(df_flat.isna(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)
ax.set_title('NaN Value Distribution Across Columns')
ax.set_xlabel('Columns')
ax.set_ylabel('Rows')
plt.setp(ax.get_xticklabels(), rotation=90, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Bar plot of NaN percentages
# One bar per column, in the order of the (already sorted) summary table
fig, ax = plt.subplots(figsize=(15, 8))
nan_analysis['NaN_Percentage'].plot.bar(ax=ax)
ax.set_title('Percentage of NaN Values by Column')
ax.set_xlabel('Columns')
ax.set_ylabel('NaN Percentage')
plt.setp(ax.get_xticklabels(), rotation=90, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Detailed NaN analysis
# Keep only the summary rows for columns that actually contain missing values
columns_with_nan = nan_analysis.loc[nan_analysis['NaN_Count'].gt(0)]
print("\nColumns with NaN Values:", columns_with_nan, sep="\n")

# Share of all cells (rows x columns) in the flattened table that are missing
total_nan = nan_analysis['NaN_Count'].sum()
total_cells = df_flat.size
overall_nan_percentage = np.round(total_nan / total_cells * 100, 2)
print(f"\nOverall NaN coverage: {overall_nan_percentage}%")

In [None]:
# Persist the analysis artifacts for further analysis
# 1) Per-column NaN summary; its index holds the column names, so it is kept
nan_analysis.to_csv('nan_analysis_summary.csv')

# 2) Every row of the flattened table that has at least one missing field
df_with_nan = df_flat.loc[df_flat.isna().any(axis='columns')]
df_with_nan.to_csv('rows_with_nan.csv', index=False)

print("Analysis results exported to CSV files.")

## Insights and Recommendations

1. Review columns with high NaN percentages
2. Understand the context of missing data
3. Consider data collection or preprocessing improvements
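4. Check the exported CSV files (`nan_analysis_summary.csv` for the per-column summary, `rows_with_nan.csv` for the affected rows) for detailed NaN information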