In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Analyze OCR and Other Read Results

In [2]:
import os
import json

# Top-level JSON sections that are kept from every application file.
important_keys = [
    "Education Background",
    "Awards and Professional Qualifications",
    "Publications",
    "English Language Proficiency",
    "Proposed Research Plan / Vision Statement",
    "CV",
    "Extracurricular Activities / Volunteer Work",
    "Taken Courses",
    "Reference Report",
]

def read_json_to_dict(filepath):
    """Load one JSON file and return only the entries listed in ``important_keys``.

    The result always has exactly the keys of ``important_keys`` (in that
    order); a key that is absent from the file maps to ``None``.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)

    selected = {}
    for section in important_keys:
        selected[section] = parsed.get(section)
    return selected

def process_json_files(directory):
    """Build a DataFrame with one row per ``*.json`` file found in ``directory``.

    Each row holds the values returned by ``read_json_to_dict`` for that file
    plus an ``id`` column derived from the file name.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        One row per JSON file; an empty frame when no ``.json`` file is found.
    """
    data_list = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order identical on every run and platform.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        row_data = read_json_to_dict(os.path.join(directory, filename))
        # Strip only the final extension so that ids containing dots
        # (e.g. "2024.001.json") are kept whole instead of being cut at the
        # first dot.
        row_data['id'] = os.path.splitext(filename)[0]
        data_list.append(row_data)

    # Convert list of dicts to DataFrame
    return pd.DataFrame(data_list)

In [None]:
# Load the per-round summary CSVs and stack them into one frame called `data`.
profile_num=[7,8,10]
# NOTE(review): version_num is not used anywhere in this cell.
version_num=12

dataList=[]
for num in profile_num:
    print("Round "+str(num))
    # NOTE(review): the path below does not contain `num`, so as written every
    # iteration reads the same CSV and the rows end up repeated in `data`.
    # Presumably the real path embeds the round number — confirm when the
    # placeholder is replaced. The backslash separators are Windows-style.
    directory_path = f"\\csv_summary"#Replace with real directory path
    tempData=pd.read_csv(f"{directory_path}\\summary_with_prof_eval.csv")
    print(len(tempData))
    dataList.append(tempData)

# Stack all rounds and renumber the rows 0..n-1.
data=pd.concat(dataList)
data=data.reset_index(drop=True)
# Coerce the column to strict booleans: anything that is not equal to True
# (including missing values) becomes False.
data['llm pred pass'] = data['llm pred pass'] == True

In [7]:
# Remove the 'mean_id' column. Rebinding `data` (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# is a no-op rather than a KeyError.
data = data.drop(columns='mean_id', errors='ignore')

In [None]:
def count_zeros_neg_ones(series):
    """Return how many entries of ``series`` are equal to 0 or to -1."""
    sentinel_mask = (series == -1) | (series == 0)
    return sentinel_mask.sum()

# Columns compared against their 'mean_<column>' counterpart.
numerical_cols = ['overall score', 'gpa', 'math ability', 'mastery of language', 'specialty core courses', 'lab courses', 'capstone/design project', 'internship', 'community service', 'student clubs', "major int'l/national competition", 'college of bachelor degree']

# Seed the frame with the full row index. Starting from an empty DataFrame
# would let the first assigned column define the index, and every later
# column would silently lose the rows that were invalid for that first
# column only, which skews the per-column MAE.
abs_errors = pd.DataFrame(index=data.index)

# Per-column absolute error; rows whose value is 0 or -1 are skipped for that
# column (they stay NaN and are ignored by .mean()).
for col in numerical_cols[1:]:
    mean_col = f'mean_{col}'
    valid_rows = (data[col] != -1) & (data[col] != 0)
    abs_errors[col] = (data.loc[valid_rows, col] - data.loc[valid_rows, mean_col]).abs()

# The overall score is compared through the 'weight score' column.
valid_rows = (data['weight score'] != -1) & (data['weight score'] != 0)
abs_errors['overall score'] = (data.loc[valid_rows, 'weight score'] - data.loc[valid_rows, 'mean_overall score']).abs()

# Calculate the MAE for each column
mae = abs_errors.mean()
# Count of 0 / -1 entries for numerical_cols[2:-1] only.
zero_neg_one_counts = data[numerical_cols[2:-1]].apply(count_zeros_neg_ones)
# Display the MAE
print("Mean Absolute Error (MAE):")
print(mae)
print("\nCount of 0 or -1 in each column of df_summary:")
print(zero_neg_one_counts)

In [None]:
# Standard deviation of every 'mean_<column>' column.
mean_column_names = ['mean_' + c for c in numerical_cols]
data[mean_column_names].std()

# Read Data

In [439]:
# View of `data` without the columns whose name contains 'description'.
non_description_cols = data.columns[~data.columns.str.contains('description')]
temp_data = data[non_description_cols]

# EDA

In [None]:
# Summary statistics of `data` (count, mean, std, quartiles per numeric column).
data.describe()

In [443]:
# Percentage of missing values in each column.
missing_fraction = data.isna().mean()
nan_rate = missing_fraction * 100
nan_rate

id                                               0.000000
name                                             0.000000
Major                                            0.000000
Summary                                          0.000000
gpa                                              0.000000
gpa_description                                  0.000000
mastery of language                              0.000000
mastery of language_description                  0.000000
math ability                                     0.000000
math ability_description                         0.000000
specialty core courses                           0.000000
specialty core courses_description               0.000000
lab courses                                      0.000000
lab courses_description                          0.000000
capstone/design project                          0.000000
capstone/design project_description              0.000000
internship                                       0.000000
internship_des

In [444]:
#merged_data = pd.merge(data, data_excel[["Pass pre-shortlist or not", "Application No."]], left_on='id', right_on='Application No.')
#merged_data['Passed'] = merged_data['Pass pre-shortlist or not']


In [445]:
# new_merged_data = merged_data.dropna()
# Label each applicant by the sign of 'score': strictly positive means passed,
# anything else (including a missing score) is labelled unpassed.
score_is_positive = data['score'].gt(0)
data['Passed'] = score_is_positive.map({True: 'passed', False: 'unpassed'})
# Row subsets for the two groups; a row with a missing score is in neither.
passed_students = data[score_is_positive]
unpassed_students = data[data['score'].le(0)]


In [None]:
# Bar chart with the number of applicants in each 'Passed' category.
plt.figure(figsize=(8, 6), facecolor="white")  # Set the figure size for better visibility
# NOTE(review): this dict is never passed to the plot; the call below uses the
# named 'pastel' palette instead.
palette = {'passed': 'blue', 'unpassed': 'orange'}
barplot=sns.countplot(data=data, x='Passed', palette='pastel')
plt.title('Count of Passed vs. Unpassed Students',  fontsize=18)
plt.xlabel('Status',  fontsize=14)
plt.ylabel('Number of Students', fontsize=14)
plt.xticks(fontsize=14)  # Larger x-axis tick labels
# Thin black frame around the axes.
for spine in barplot.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(0.5)
# plt.savefig('./figure/Total_passed_student.jpg')
plt.show()

In [None]:
def _plot_gpa_by_flag(frame, flag_col, title, xlabel, ylim):
    """Draw a GPA box plot of ``frame`` grouped by the boolean column ``flag_col``."""
    plt.figure(figsize=(8, 6))
    sns.boxplot(data=frame, x=flag_col, y='gpa', palette='coolwarm')
    plt.title(title, fontsize=18)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel('Average GPA', fontsize=14)
    plt.xticks(fontsize=14)
    plt.ylim(*ylim)
    plt.show()


# --- University score flag ---------------------------------------------------
# The comparison runs on the NaN-free series, so rows with a missing
# 'college of bachelor degree' get NaN (not False) when aligned back.
# The column name keeps its original spelling because other cells select it
# by that exact name.
data['Good_univeristy'] = data['college of bachelor degree'].dropna()>=90
print(data['Good_univeristy'].mean()*100)  # share of flagged rows, in percent
# Plot from the frame without missing university scores. It was previously
# computed but never used, and the plot was drawn from the unfiltered frame.
cleaned_data = data.dropna(subset=['college of bachelor degree'])
_plot_gpa_by_flag(cleaned_data, 'Good_univeristy',
                  'Comparison of GPA by University Status',
                  'Is Good University?', (40, 100))

# --- Language score flag -----------------------------------------------------
data['Good_english'] = data['mastery of language'].dropna()>=90
print(data['Good_english'].mean()*100)  # share of flagged rows, in percent
cleaned_data = data.dropna(subset=['mastery of language'])
_plot_gpa_by_flag(cleaned_data, 'Good_english',
                  'Comparison of GPA by University English Capability',
                  'Is Good English?', (30, 100))

In [None]:
# Description-free view of the frame, and a wider column display limit.
non_description_cols = data.columns[~data.columns.str.contains('description')]
temp_data = data[non_description_cols]
pd.set_option('display.max_columns', 30)

# Flag rows whose language description mentions "institution".
# NOTE(review): the keyword is used as a proxy for studying abroad — confirm
# that this matches how the descriptions are written.
contains_keyword = data['mastery of language_description'].str.contains("institution", na=False)
data['study abroad'] = pd.Categorical(contains_keyword, categories=[True, False], ordered=True)

# Pass rate inside each group.
is_passed = data['Passed'].eq('passed')
study_abroad_pass_rate = is_passed[data['study abroad'] == True].mean()
non_study_abroad_pass_rate = is_passed[data['study abroad'] == False].mean()

# Side-by-side bars for the two pass rates.
group_labels = ['Study Abroad', 'Non Study Abroad']
group_rates = [study_abroad_pass_rate, non_study_abroad_pass_rate]
plt.figure(figsize=(8, 6))
sns.barplot(x=group_labels, y=group_rates, palette='coolwarm')
plt.title('Pass Rate Comparison: Study Abroad vs. Non Study Abroad Students')
plt.ylabel('Pass Rate')
plt.ylim(0, 1)  # rates are fractions, so pin the axis to [0, 1]
plt.show()

# study_abroad_pass_rate = (merged_data.loc[contains_keyword, 'Passed'] == 'passed').mean()
# non_study_abroad_pass_rate = (merged_data.loc[~contains_keyword, 'Passed'] == 'passed').mean()

# # Create a bar plot
# plt.figure(figsize=(8, 6))
# sns.barplot(x=['Study Abroad', 'Non Study Abroad'], y=[study_abroad_pass_rate, non_study_abroad_pass_rate])
# plt.title('Pass Rate Comparison: Study Abroad vs. Non Study Abroad Students')
# plt.ylabel('Pass Rate')
# plt.ylim(0, 1)  # Set y-axis limit to ensure values are between 0 and 1
# plt.show()

In [None]:
def mean_ignore_zeros_neg_ones(series):
    """Mean of ``series`` after discarding entries equal to 0 or -1.

    Returns ``np.nan`` when nothing is left after the filter.
    """
    is_sentinel = (series == 0) | (series == -1)
    kept = series[~is_sentinel]
    return kept.mean() if len(kept) > 0 else np.nan

# Column order for the frame built from the non-'mean' columns.
pred_columnIndex=['id', 'name', 'Major', 'gpa', 'math ability','mastery of language', 'specialty core courses', 'lab courses',
           'capstone/design project', 'internship', 'community service',
           'student clubs', 'major int\'l/national competition', 'college of bachelor degree',
             'college of higher degree', 'llm pred score', 'llm pred pass',
             'overall score', 'weight score','score', 'Passed', 'Summary', 'Good_univeristy', 'Good_english', 'study abroad']
# Column order for the frame built from the 'mean_*' columns (prefix removed).
not_pred_columnIndex=['id', 'gpa', 'math ability', 'mastery of language',
                      'specialty core courses', 'lab courses', 'capstone/design project',
                      'internship', 'community service', 'student clubs',
                      'major int\'l/national competition', 'college of bachelor degree',
                      'overall score', 'Passed']

merged_data = data
# Everything except the free-text '*description*' columns.
merged_data_all = merged_data[merged_data.columns[~merged_data.columns.str.contains('description')]]

# Frame of the columns whose name does not contain 'mean'. The explicit copy
# makes it independent of `data`, so adding columns to it later is unambiguous.
merged_data_pred = merged_data_all[merged_data_all.columns[~merged_data_all.columns.str.contains('mean')]]
merged_data_pred = merged_data_pred.loc[:, pred_columnIndex].copy()

# Frame of the 'mean_*' columns, renamed so they line up with the names above.
mean_cols = merged_data_all.columns[merged_data_all.columns.str.contains('mean')].to_list()
merged_data_not_pred = merged_data_all[['id'] + mean_cols + ['Passed']].copy()
print(merged_data_not_pred.columns)
merged_data_not_pred.columns = [i.replace('mean_','') for i in merged_data_not_pred.columns]
merged_data_not_pred = merged_data_not_pred.loc[:, not_pred_columnIndex]

# Score-only variant of the first frame (identifiers and helper flags removed).
merged_data_pred_score = merged_data_pred.loc[:, ~merged_data_pred.columns.str.contains('description', case=False)]
merged_data_pred_score = merged_data_pred_score.drop(["Major","name","Summary","Good_univeristy","Good_english",'study abroad','score'], axis=1)

# Descriptive statistics for numeric data, split by pass status. Printing the
# bare GroupBy object only shows its repr, so describe() is called explicitly;
# 'id' is an identifier and is left out.
print("Descriptive Statistics:")
print(merged_data_not_pred.drop(columns='id').groupby('Passed').describe().T)

# Mean of each metric per pass status, ignoring the 0 / -1 placeholder values.
# 'id' is dropped first: it is not a metric and must not be averaged.
mean_values = (
    merged_data_not_pred.drop(columns='id')
    .groupby('Passed')
    .agg(mean_ignore_zeros_neg_ones)
    .reset_index()
)
metric_columns = merged_data_not_pred.drop(['Passed', 'id'], axis=1).columns.to_list()
melted_df = pd.melt(mean_values, id_vars=['Passed'], value_vars=metric_columns,
                    var_name='Metric', value_name='Mean Value')

plt.figure(figsize=(14, 6))
barplot = sns.barplot(data=melted_df, x='Metric', y='Mean Value', hue='Passed')
plt.ylim(65, 95)
barplot.set_xticklabels(barplot.get_xticklabels(), rotation=20, horizontalalignment='right')

# Title, (empty) axis labels, tick sizes and legend are each set once here.
plt.title('Comparison of Mean Values for Passed vs Unpassed Students', fontsize=24)
plt.xlabel('', fontsize=20)
plt.ylabel('', fontsize=20)
plt.xticks(fontsize=20)  # Larger x-axis tick labels
plt.yticks(fontsize=20)  # Larger y-axis tick labels
plt.legend(title='Passed Status', title_fontsize='13', fontsize='12',loc='upper left')

barplot.set_facecolor('white')
for spine in barplot.spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(0.5)

plt.tight_layout()
# plt.savefig('./figure/Total_real_meanvalue_pvsunp.jpg')
plt.show()

# Histograms for scores



# # Box plots for attendance
# plt.figure(figsize=(12, 6))
# sns.boxplot(x='Passed', y='Attendance', data=df)
# plt.title('Attendance by Pass Status')
# plt.show()

# # Categorical data analysis
# plt.figure(figsize=(12, 6))
# sns.countplot(x='Department', hue='Passed', data=df)
# plt.title('Department Wise Pass Status')
# plt.show()

In [None]:

from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

### How to select who pass
# 1. Mean of Rating - "overall score"
# 2. GPT Overall Score - "llm pred score"
# 3. GPT pass or not - "llm pred pass"
# 4. Weight score - "weight score"
selected_column = "weight score"
# Fraction of applicants, ranked best-first on `selected_column`, that is
# predicted to pass.
pass_fraction = 0.75

threshold_index = int(len(merged_data_pred_score) * pass_fraction)
# Rank by the selected score, best first, and read the score at the cut-off row.
sorted_scores = merged_data_pred_score.sort_values(by=selected_column, ascending=False)
threshold_score = sorted_scores.iloc[threshold_index][selected_column]

# Predicted pass: score at or above the cut-off (ties with the cut-off pass).
# Vectorised comparison; a missing score compares False, i.e. not passed.
merged_data_pred['pred_status'] = merged_data_pred[selected_column] >= threshold_score
# Ground truth: 1 when 'score' is strictly positive, otherwise 0.
merged_data_pred['status_binary'] = (merged_data_pred['score'] > 0).astype('int64')

# Give the metrics two integer 0/1 vectors instead of mixing int and bool.
y_true = merged_data_pred['status_binary']
y_pred = merged_data_pred['pred_status'].astype('int64')

# Calculate accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy}')

# Calculate recall
recall = recall_score(y_true, y_pred)
print(f'Recall: {recall}')

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['unpass', 'pass'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
# plt.savefig('./figure/Total_v12_Confusion_Matrix.jpg')
plt.show()

In [None]:
# Detail of students whose predicted status differs from the true status.
# Iterating over the mismatching index labels (instead of a manual 0..n-1
# counter) keeps the .loc lookups valid whatever the frame's index is.
is_mismatch = merged_data_pred['status_binary'] != merged_data_pred['pred_status']
for ind in merged_data_pred.index[is_mismatch]:
    score_gap = merged_data_not_pred.loc[ind, 'overall score'] - merged_data_pred.loc[ind, 'overall score']
    print(f"Student ID: {merged_data_not_pred.loc[ind, 'id']}. True-Pred: {score_gap}")

In [None]:
# Rows where the true status and the thresholded prediction disagree.
# .copy() makes `mismatch` an independent frame, so adding a column below does
# not trigger SettingWithCopyWarning or depend on view/copy semantics.
mismatch = merged_data_all[merged_data_pred['status_binary'] != merged_data_pred['pred_status']].copy()
# Signed distance of each row's selected score from the pass cut-off.
mismatch['Dif To Threshold'] = mismatch[selected_column] - threshold_score
mismatch

In [None]:
# One histogram per metric, split by pass status, laid out on a 2 x 6 grid.
fig, axes = plt.subplots(2, 6, figsize=(45, 12))
axes = axes.flatten()
used_data = merged_data_not_pred
# merged_data_pred_score, merged_data_not_pred

hist_columns = ['gpa', 'math ability','mastery of language', 'specialty core courses', 'lab courses',
                'capstone/design project', 'internship', 'community service',
                'student clubs', 'major int\'l/national competition', 'college of bachelor degree', 'overall score']

for i, col in enumerate(hist_columns):
    ax = axes[i]
    # Leave out the 0 / -1 placeholder values of this metric.
    keep_rows = (used_data[col] != 0) & (used_data[col] != -1)
    filtered_data = used_data[keep_rows]
    sns.histplot(data=filtered_data, x=col, hue='Passed', element='step', bins=10, ax=ax, hue_order=['passed','unpassed'])

    # Shorter axis label for the long competition column name.
    axis_label = 'competition' if col == "major int'l/national competition" else col
    ax.set_xlabel(axis_label, fontsize=40)
    ax.set_ylabel('', fontsize=20)
    ax.tick_params(labelsize=20)
    ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
    ax.set_facecolor('white')

    # Restyle the legend when the plot produced one.
    leg = ax.get_legend()
    if leg:
        leg.set_title('Pass Status', prop={'size': 24})
        leg.set_frame_on(True)
        for legend_text in leg.get_texts():
            legend_text.set_fontsize('20')

    # Thin black frame around each panel.
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(0.5)

plt.tight_layout()
# plt.savefig('./figure/Total_pred_summary.jpg')
plt.show()

In [466]:
# Show the column labels of the frame built from the 'mean_*' columns.
merged_data_not_pred.columns

Index(['id', 'gpa', 'math ability', 'mastery of language',
       'specialty core courses', 'lab courses', 'capstone/design project',
       'internship', 'community service', 'student clubs',
       'major int'l/national competition', 'college of bachelor degree',
       'overall score', 'Passed'],
      dtype='object')

In [None]:
# data_for_heatmap = merged_data_not_pred.copy(deep=True)
# data_for_heatmap['Passed'] = data_for_heatmap['Passed'] == "passed"
# data_for_heatmap=data_for_heatmap.drop(['id'], axis=1)


# Work on a deep copy so the boolean recoding below does not touch merged_data_pred.
data_for_heatmap = merged_data_pred.copy(deep=True)
# Boolean outcome so that it can take part in the correlation.
data_for_heatmap['Passed'] = data_for_heatmap['Passed'] == "passed"

# Keep only the per-criterion scores, 'overall score' and the outcome.
data_for_heatmap = data_for_heatmap.drop([
    'id', 'name', 'Major', 'Summary', 'college of higher degree',
    'Good_univeristy', 'Good_english', 'study abroad', 'score',
    'llm pred pass', 'status_binary', 'weight score', 'llm pred score',
    'pred_status'
], axis=1)

# Calculate the correlation matrix
correlation_matrix = data_for_heatmap.corr()

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(24, 18))

# Draw the full (unmasked) correlation matrix with the value printed in each cell.
heatmap = sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='YlGnBu', ax=ax, annot_kws={"fontsize":20})
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=30, horizontalalignment='right')
# Customize tick labels for better readability
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.tight_layout()
# savefig does not create missing folders; make sure the target exists first.
os.makedirs('./figure', exist_ok=True)
plt.savefig('./figure/Total_pred_heatmap.jpg')
# Show plot
plt.show()

In [None]:
# Box plot of every score column, split by pass status.
# used_data = merged_data_not_pred
used_data = merged_data_pred_score
plot_columns = [c for c in used_data.columns if c not in ['Passed', 'id','score','referees feedbacks', 'college of higher degree']]

# Size the grid from the number of columns actually plotted. A fixed 2 x 6
# grid only has 12 axes and raises IndexError as soon as more columns pass
# the filter above.
plots_per_row = 6
n_rows = max(1, int(np.ceil(len(plot_columns) / plots_per_row)))
fig, axes = plt.subplots(n_rows, plots_per_row, figsize=(45, 6 * n_rows))
axes = axes.flatten()  # Flatten the axes array for easy iteration

for i, col in enumerate(plot_columns):
    ax = axes[i]
    # Leave out the 0 / -1 placeholder values of this column only.
    filtered_data = used_data[(used_data[col] != 0) & (used_data[col] != -1)]
    sns.boxplot(data=filtered_data, x='Passed', y=col, ax=ax, palette="Blues", order=['passed','unpassed'])
    if col == "major int'l/national competition":
        col = 'competition'  # shorter axis label
    ax.set_xlabel(col, fontsize=40)
    ax.set_ylabel('', fontsize=20)
    ax.tick_params(labelsize=20)
    ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
    # Customize the legend when the plot produced one.
    leg = ax.get_legend()
    if leg:
        leg.set_title('Pass Status', prop={'size': 24})
        leg.set_frame_on(True)
        for text in leg.get_texts():
            text.set_fontsize('20')

# Hide the grid cells that did not receive a plot.
for unused_ax in axes[len(plot_columns):]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
#plt.savefig('./figure/pred_boxplot.pdf')
plt.show()


In [None]:
fig, axes = plt.subplots(3, 5, figsize=(30, 18))  # Adjusted figsize for better fit
axes = axes.flatten()  # Flatten the axes array for easy iteration

real_data = merged_data_not_pred
pred_data = merged_data_pred_score
# merged_data_not_pred, merged_data_pred_score
exclude_cols = ['Passed', 'id','score','referees feedbacks', 'college of higher degree',"llm pred score","llm pred pass","weight score"]

# Seed the frame with the full row index. Starting from an empty DataFrame
# would let the first column define the index, and later columns would
# silently lose rows that were invalid for that first column only.
diff_data = pd.DataFrame(index=real_data.index)
for col in pred_data.columns:
    if col in exclude_cols:
        continue
    # Only rows whose reference value is a real score (not 0 / -1) are compared.
    valid_indices = (real_data[col] != 0) & (real_data[col] != -1)
    diff_data[col] = real_data.loc[valid_indices, col] - pred_data.loc[valid_indices, col]

# Histogram of the absolute difference for each compared column.
for i, col in enumerate(diff_data.columns):
    sns.histplot(diff_data[col].dropna().abs(), bins=8, kde=True, ax=axes[i], color="lightblue")
    axes[i].set_xlabel(col, fontsize=20)
    axes[i].set_ylabel('Num of Students', fontsize=20)
    axes[i].tick_params(labelsize=20)
    axes[i].yaxis.set_major_locator(plt.MaxNLocator(integer=True))
    axes[i].set_ylim(0, 320)  # common y-range so the panels are comparable

# Hide the grid cells that did not receive a plot.
for unused_ax in axes[len(diff_data.columns):]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
# plt.savefig('./figure/Total_v12_diff_boxplot.jpg')
plt.show()