In [1]:
import pandas as pd

def read_data(path):
    """
    Load a CSV file into a pandas DataFrame without letting read errors propagate.

    Any exception raised while reading is reported on stdout and turned into a
    None result, so callers must check the return value before using it.

    Parameters:
        path (str): Location of the CSV file to load.

    Returns:
        pd.DataFrame | None: The parsed file, or None when reading failed.
    """
    frame = None
    try:
        frame = pd.read_csv(path)
    except Exception as err:
        # Best-effort loader: report the problem and fall through to return None.
        print(f"Error reading the file: {err}")
    return frame

In [2]:
# Load the three CSV inputs in one pass; going by the file names these are the
# subject-M metadata and two k-means clustering results (confirm against the data).
# read_data returns None for any file it could not read.
meta_grouping_df, k_4_cluster_df, k_optimal_cluster_df = (
    read_data(csv_name)
    for csv_name in (
        "subject_M_Metadata.csv",
        "subject_M_Kmeans_4.csv",
        "subject_M_kmeans_8_optimal_k.csv",
    )
)

In [6]:
# Discard the 'vector', 'x' and 'y' columns (presumably the embedding and its 2-D
# projection — confirm); none of them is referenced by the later cells.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed from the frame.
meta_grouping_df = meta_grouping_df.drop(columns=['vector', 'x', 'y'], errors='ignore')

In [7]:
# Remove the same 'vector', 'x' and 'y' columns from both clustering frames.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed.
k_4_cluster_df = k_4_cluster_df.drop(columns=['vector', 'x', 'y'], errors='ignore')
k_optimal_cluster_df = k_optimal_cluster_df.drop(columns=['vector', 'x', 'y'], errors='ignore')

In [8]:
# Summary statistics for the metadata frame; describe() with default arguments
# only reports on numeric columns.
meta_grouping_df.describe()

Unnamed: 0,S.No
count,388.0
mean,194.5
std,112.150197
min,1.0
25%,97.75
50%,194.5
75%,291.25
max,388.0


In [9]:
# Preview the first rows of the metadata frame to check the remaining columns.
meta_grouping_df.head()

Unnamed: 0,S.No,question_id,prompt_name,subject,domain,skill,subskill,difficulty
0,1,me_ad07lpni,PID-M-ADM-NLF-EMT-MED,M,ADM,NLF,EMT,MED
1,2,me_784m01t5,PID-M-PSD-PER-EMT-EAS,M,PSD,PER,EMT,EAS
2,3,m_mm69f16h,PID-M-PSD-PCP-EMT-MED,M,PSD,PCP,EMT,MED
3,4,me_nj2cdu6j,PID-M-PSD-TVD-EMT-HAR,M,PSD,TVD,EMT,HAR
4,5,me_0xe0f6v5,PID-M-ALG-LEO-EMT-EAS,M,ALG,LEO,EMT,EAS


In [12]:
# Order the rows by the label columns, subject first and difficulty last.
meta_grouping_df = meta_grouping_df.sort_values(
    by=[
        'subject',
        'domain',
        'skill',
        'subskill',
        'difficulty',
    ]
)
# Row count of the sorted frame (sorting does not add or remove rows).
meta_grouping_df.shape[0]

388

In [22]:
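# NOTE: Everything in this cell is commented out and does not run. It appears to be
# an earlier draft of the merge / majority-domain analysis performed by the next
# cell, plus a per-cluster display that colors whole rows.
# # Merge the two dataframes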
# merged_df = meta_grouping_df.merge(
#     k_4_cluster_df,
#     on='question_id',
#     suffixes=('_meta', '_cluster')
# )

# # Find majority domain per cluster
# cluster_domain_map = (
#     merged_df.groupby('cluster')['domain_meta']
#     .agg(lambda x: x.value_counts().index[0])
#     .to_dict()
# )

# # Assign new column: assigned domain
# merged_df['assigned_domain'] = merged_df['cluster'].map(cluster_domain_map)

# # Create a match flag
# merged_df['is_correct'] = merged_df['domain_meta'] == merged_df['assigned_domain']

# # Keep only needed columns
# final_df = merged_df[
#     ['question_id', 'domain_meta', 'domain_cluster', 'cluster', 'assigned_domain', 'is_correct']
# ]

# # Function to color the entire row based on is_correct
# def color_entire_row(row):
#     color = 'background-color: lightgreen' if row['is_correct'] else 'background-color: lightcoral'
#     return [color] * len(row)

# # Split into 4 DataFrames based on cluster
# cluster_dfs = {cluster_id: final_df[final_df['cluster'] == cluster_id] for cluster_id in sorted(final_df['cluster'].unique())}

# # Now style each cluster's dataframe
# styled_cluster_dfs = {
#     cluster_id: df.style.apply(color_entire_row, axis=1)
#     for cluster_id, df in cluster_dfs.items()
# }

# # To display them all in Jupyter
# for cluster_id, styled_df in styled_cluster_dfs.items():
#     display(f"Cluster {cluster_id}")
#     display(styled_df)


In [25]:
# 1. Join the metadata frame with the k=4 clustering frame on question_id
#    (pandas' default inner join); column names present in both frames receive
#    the _meta / _cluster suffix.
merged_df = pd.merge(
    meta_grouping_df,
    k_4_cluster_df,
    on='question_id',
    suffixes=('_meta', '_cluster'),
)

# 2. For every cluster, take its most frequent metadata domain. value_counts()
#    sorts by descending count, so the first index entry is the majority label.
cluster_domain_map = (
    merged_df
    .groupby('cluster')['domain_meta']
    .agg(lambda domains: domains.value_counts().index[0])
    .to_dict()
)

# 3 + 4. Label each row with its cluster's majority domain, then flag whether
#        that label equals the row's own metadata domain.
merged_df = merged_df.assign(
    assigned_domain=lambda frame: frame['cluster'].map(cluster_domain_map),
    is_correct=lambda frame: frame['domain_meta'].eq(frame['assigned_domain']),
)

# 5. Visualize - Green if correct, Red if not
def color_row(val):
    """
    Return the CSS background rule for a single cell value.

    Parameters:
        val: Cell value; only its truthiness is inspected.

    Returns:
        str: A light-green background rule for truthy values, light-coral otherwise.
    """
    if val:
        return 'background-color: lightgreen'
    return 'background-color: lightcoral'

# Color only the is_correct column, cell by cell. Styler.applymap was renamed to
# Styler.map in pandas 2.1 and now emits a FutureWarning, so prefer map and fall
# back to applymap on older pandas versions that do not have it yet.
styled_df = merged_df.style
_cellwise_style = styled_df.map if hasattr(styled_df, 'map') else styled_df.applymap
styled_df = _cellwise_style(color_row, subset=['is_correct'])




  styled_df = merged_df.style.applymap(color_row, subset=['is_correct'])


In [26]:
# List the columns of the DataFrame held by the Styler (its .data attribute) to
# see which names received the _meta / _cluster merge suffixes.
styled_df.data.columns


Index(['S.No_meta', 'question_id', 'prompt_name_meta', 'subject_meta',
       'domain_meta', 'skill_meta', 'subskill_meta', 'difficulty_meta',
       'S.No_cluster', 'prompt_name_cluster', 'subject_cluster',
       'domain_cluster', 'skill_cluster', 'subskill_cluster',
       'difficulty_cluster', 'cluster', 'assigned_domain', 'is_correct'],
      dtype='object')

In [27]:
# Keep only the columns needed to compare each question's metadata domain with
# the domain assigned through its cluster.
selected_columns = [
    'question_id',
    'domain_meta',
    'domain_cluster',
    'cluster',
    'assigned_domain',
    'is_correct',
]
selected_df = merged_df.loc[:, selected_columns]

# Apply coloring for the 'is_correct' column
def color_row(val):
    """
    Map a cell value to a CSS background rule based on its truthiness.

    This repeats the color_row defined in the earlier styling cell with the
    same behavior; re-running this cell simply rebinds the name.

    Parameters:
        val: Cell value; converted to a bool to pick the color.

    Returns:
        str: Light-green background rule for truthy values, light-coral otherwise.
    """
    css_by_truthiness = {
        True: 'background-color: lightgreen',
        False: 'background-color: lightcoral',
    }
    return css_by_truthiness[bool(val)]

# Color only the is_correct column, cell by cell. Styler.applymap was renamed to
# Styler.map in pandas 2.1 and now emits a FutureWarning, so prefer map and fall
# back to applymap on older pandas versions that do not have it yet.
styled_selected_df = selected_df.style
_cellwise_style = (
    styled_selected_df.map
    if hasattr(styled_selected_df, 'map')
    else styled_selected_df.applymap
)
styled_selected_df = _cellwise_style(color_row, subset=['is_correct'])

# Display the styled DataFrame
display(styled_selected_df)

# Optionally, write to an HTML file.
# Styler.to_html defines no `escape` option (unknown keywords are only forwarded
# to the Jinja template, which does not use that name), so the escape=False that
# used to be passed here had no effect and has been dropped.
styled_selected_df.to_html("selected_columns_colored.html")

  styled_selected_df = selected_df.style.applymap(color_row, subset=['is_correct'])


Unnamed: 0,question_id,domain_meta,domain_cluster,cluster,assigned_domain,is_correct
0,m_4mohcovg,ADM,ADM,2,ADM,True
1,me_0pum04uz,ADM,ADM,2,ADM,True
2,me_7k1m1k4c,ADM,ADM,2,ADM,True
3,me_cpyzj9ob,ADM,ADM,2,ADM,True
4,fb878b65b0,ADM,ADM,2,ADM,True
5,me_8l02jlzg,ADM,ADM,2,ADM,True
6,m_n68e2hq7,ADM,ADM,2,ADM,True
7,m_8id2cgmh,ADM,ADM,2,ADM,True
8,m_o1t6cieb,ADM,ADM,2,ADM,True
9,m_sk65cxu6,ADM,ADM,0,ALG,False


In [18]:
!pip install itables


Collecting itables
  Using cached itables-2.3.0-py3-none-any.whl.metadata (8.6 kB)
Using cached itables-2.3.0-py3-none-any.whl (2.3 MB)
Installing collected packages: itables
Successfully installed itables-2.3.0


DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [19]:
import itables
from itables import show

In [20]:


# Global itables settings applied to every table rendered after this cell.
itables_settings = {
    # Page-length menu as [values, labels]; the -1 entry is labeled "All".
    "lengthMenu": [[10, 25, 50, -1], [10, 25, 50, "All"]],
    "maxBytes": 0,  # no size limit
    # Attach the "dt-center" class to every column.
    "columnDefs": [{"targets": "_all", "className": "dt-center"}],
}
for option_name, option_value in itables_settings.items():
    setattr(itables.options, option_name, option_value)

# Requires merged_df (including assigned_domain / is_correct) from the merge cell.
# No per-cell coloring is applied here; the frame is shown as a plain interactive table.
show(merged_df)


S.No_meta,question_id,prompt_name_meta,subject_meta,domain_meta,skill_meta,subskill_meta,difficulty_meta,S.No_cluster,prompt_name_cluster,subject_cluster,domain_cluster,skill_cluster,subskill_cluster,difficulty_cluster,cluster,assigned_domain,is_correct
Loading ITables v2.3.0 from the internet... (need help?),,,,,,,,,,,,,,,,,
