# **Single major**

In [None]:
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np

# Absolute, machine-specific input paths; adjust them before running elsewhere.
classification_file = '/Users/gahyoungim/Library/CloudStorage/GoogleDrive-dotchgahyoun@gmail.com/내 드라이브/STEM hub-centrality/data/Subject Projected Network/resolution=0.00_community_1_projected_표준분류소계열_stats.xlsx'
stem_data_path = '/Users/gahyoungim/Library/CloudStorage/GoogleDrive-dotchgahyoun@gmail.com/내 드라이브/STEM hub-centrality/data/교육과정_대학(20240305)_preprocessed_STEM_fin_with_groups_final.xlsx'

# Load the standard-classification (sub-series) statistics sheet
classification_data = pd.read_excel(classification_file)
classification_data = classification_data[classification_data['표준분류소계열'] != 'N.C.E.']  # exclude the 'N.C.E.' rows

# Load the STEM curriculum table and add a combined key column built from the
# '학교명' and '학부·과(전공)명' columns (presumably school name + department name).
stem_data = pd.read_excel(stem_data_path)
stem_data['학교별 학과명'] = stem_data['학교명'] + ' ' + stem_data['학부·과(전공)명']

# Distinct '표준분류소계열' labels that remain after the N.C.E. filter
classification_list = classification_data['표준분류소계열'].unique()

In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import re
from scipy.spatial.distance import jensenshannon
import random

# Load the curriculum table from `stem_data_path`, build the combined
# '학교별 학과명' key column, and treat a missing credit value ('학점') as 3.
stem_data = pd.read_excel(stem_data_path)
stem_data['학교별 학과명'] = stem_data['학교명'] + ' ' + stem_data['학부·과(전공)명']
stem_data['학점'] = stem_data['학점'].fillna(3)

# Theme string normalisation function
def normalize_topic(topic):
    """Return a canonical comparison key for a cluster-topic label.

    Digits at the very end of the label are removed, then surrounding
    whitespace is stripped and the text is lower-cased, so numbered variants
    such as "Calculus 1" and "Calculus 2" share one key.

    Args:
        topic: Topic label. Missing values (None / NaN) and non-string
            scalars are accepted.

    Returns:
        The normalised string, or '' when ``topic`` is missing.
    """
    # A NaN label would otherwise make re.sub raise TypeError.
    if pd.isna(topic):
        return ''
    return re.sub(r'\d+$', '', str(topic)).strip().lower()

# Exclusion list: a topic is skipped when ANY of these strings occurs as a
# substring of its normalised (lower-cased) name, not only on an exact match.
excluded_words = {"none", "junior seminars", "career", "entrepreneurship", "internship",
                  "fieldwork", "capstone", "writing", "field trip", "project-based learning"}

# Initial seed subject selection function
def determine_initial_courses_single(subfield_data):
    """Pick up to five seed topics for one subfield.

    Credits are first summed per (department, topic) pair. Topics are then
    visited in ``value_counts`` order of those pairs, i.e. starting with the
    topic that appears in the most departments. A topic is kept when its
    normalised name has not been chosen yet and contains none of the
    ``excluded_words`` substrings.

    Args:
        subfield_data: DataFrame with the columns '학교별 학과명',
            'Cluster Topic' and '학점'.

    Returns:
        A tuple ``(selected_topics, mode_credit_sums)``:
        ``selected_topics`` maps each chosen topic to its credit value, and
        ``mode_credit_sums`` is a Series indexed by topic holding the first
        mode of the per-department credit sums (3 if no mode exists).
        For an empty frame both are empty.
    """
    # Guard against a subfield with no rows so callers always get usable,
    # empty containers instead of depending on groupby-on-empty behaviour.
    if subfield_data.empty:
        return {}, pd.Series(dtype=float)

    def _mode_or_default(credit_sums):
        # Compute the mode once per group; fall back to 3 credits if empty.
        modes = credit_sums.mode()
        return modes.iloc[0] if not modes.empty else 3

    unique_credit_sums = (
        subfield_data.groupby(['학교별 학과명', 'Cluster Topic'])['학점'].sum().reset_index()
    )
    topic_frequency = unique_credit_sums['Cluster Topic'].value_counts()
    mode_credit_sums = unique_credit_sums.groupby('Cluster Topic')['학점'].agg(_mode_or_default)

    selected_topics = {}
    selected_normalized_topics = set()

    for topic in topic_frequency.index:
        normalized_topic = normalize_topic(topic)
        if normalized_topic not in selected_normalized_topics and topic in mode_credit_sums:
            if not any(excluded in normalized_topic for excluded in excluded_words):
                selected_topics[topic] = mode_credit_sums[topic]
                selected_normalized_topics.add(normalized_topic)
        if len(selected_topics) >= 5:
            break
    return selected_topics, mode_credit_sums

# generate bipartite graph function
def create_bipartite_graph(subfield_data):
    """Build the department-topic graph for one subfield.

    Every row contributes an edge between the node "학교-<학교별 학과명>" and
    the row's 'Cluster Topic', with the row's '학점' stored as the edge
    ``weight``. When the same pair occurs on several rows, the weight of the
    last such row is the one that remains.

    Args:
        subfield_data: DataFrame with the columns '학교별 학과명',
            'Cluster Topic' and '학점'.

    Returns:
        An undirected ``nx.Graph``.
    """
    graph = nx.Graph()
    graph.add_weighted_edges_from(
        (f"학교-{record['학교별 학과명']}", record['Cluster Topic'], record['학점'])
        for _, record in subfield_data.iterrows()
    )
    return graph

# Function that projects the bipartite graph onto its Cluster Topic nodes
def project_cluster_topic_network(B):
    """Project the bipartite graph onto its topic nodes.

    Topic nodes are all nodes whose name does not start with the "학교-"
    prefix used for department nodes.

    Args:
        B: Graph with string node names, as built by ``create_bipartite_graph``.

    Returns:
        The result of ``nx.bipartite.weighted_projected_graph`` restricted to
        the topic nodes.
    """
    department_prefix = "학교-"
    topic_side = set()
    for node in B.nodes():
        if node.startswith(department_prefix):
            continue
        topic_side.add(node)
    return nx.bipartite.weighted_projected_graph(B, topic_side)

# JSD calculation function
def calculate_jsd(distribution1, distribution2):
    """Return the base-2 Jensen-Shannon divergence of two vectors.

    ``scipy.spatial.distance.jensenshannon`` returns the Jensen-Shannon
    *distance* (the square root of the divergence), so the result is squared.

    Args:
        distribution1: Numeric vector.
        distribution2: Numeric vector of the same length.

    Returns:
        The divergence as a float. If either vector sums to zero it cannot be
        normalised into a distribution (scipy would yield NaN), so 1.0 is
        returned instead.
    """
    if np.sum(distribution1) == 0 or np.sum(distribution2) == 0:
        return 1.0
    return jensenshannon(distribution1, distribution2, base=2) ** 2

# Generate subject distribution function
def create_distribution_vector(courses, all_courses):
    """Turn a topic-to-credits mapping into a normalised vector.

    Args:
        courses: Mapping from topic to credit value.
        all_courses: Ordered sequence of topics; it fixes the vector layout.

    Returns:
        A float array with one slot per entry of ``all_courses`` (0 for
        topics missing from ``courses``), divided by its sum. If the sum is
        zero the unscaled vector is returned.
    """
    credit_vector = np.zeros(len(all_courses))
    for slot, name in enumerate(all_courses):
        credit_vector[slot] = courses.get(name, 0)

    total = credit_vector.sum()
    if total == 0:
        return credit_vector
    return credit_vector / total

# Calculate network-wide average JSD function
def calculate_average_jsd(selected_topics, subfield_data):
    """Average the JSD between a candidate course set and every department.

    Both the candidate set and each department are turned into normalised
    credit distributions over all 'Cluster Topic' values in ``subfield_data``.

    Args:
        selected_topics: Mapping from topic to credit value for the candidate set.
        subfield_data: DataFrame with the columns '학교별 학과명',
            'Cluster Topic' and '학점'.

    Returns:
        Mean of the per-department divergences, or NaN if there are no
        departments.
    """
    all_courses = subfield_data['Cluster Topic'].unique()
    selected_distribution = create_distribution_vector(selected_topics, all_courses)
    jsd_scores = []
    for dept_name in subfield_data['학교별 학과명'].unique():
        dept_courses = subfield_data[subfield_data['학교별 학과명'] == dept_name]
        # Sum credits per topic. A department may have several rows under the
        # same topic, and a plain set_index(...).to_dict() would keep only the
        # last of them.
        dept_credits = dept_courses.groupby('Cluster Topic')['학점'].sum().to_dict()
        dept_distribution = create_distribution_vector(dept_credits, all_courses)
        jsd_scores.append(calculate_jsd(selected_distribution, dept_distribution))
    # np.mean([]) also gives NaN but emits a RuntimeWarning; return it directly.
    return np.mean(jsd_scores) if jsd_scores else float('nan')

# Single-mode only optimal result generating function
def optimized_combination_single(subfield):
    """Greedily assemble a single-major course list for one subfield.

    Rows of the module-level ``stem_data`` whose '표준분류소계열' equals
    ``subfield`` are used. Starting from the seed topics, topics are added one
    at a time until the credit total reaches
    ``min(0.85 * mean departmental credit sum, 85)`` or no topic qualifies.
    Progress and the final table are printed.

    Args:
        subfield: Value of the '표준분류소계열' column to process.

    Returns:
        DataFrame with the columns "Course" and "Credits".
    """
    subfield_data = stem_data[stem_data['표준분류소계열'] == subfield]
    initial_courses, mode_credit_sums = determine_initial_courses_single(subfield_data)
    selected_topics = initial_courses.copy()
    selected_normalized_topics = {normalize_topic(topic) for topic in selected_topics}

    B = create_bipartite_graph(subfield_data)
    cluster_topic_projection = project_cluster_topic_network(B)

    total_credits = sum(selected_topics.values())
    average_sum_of_credits = subfield_data.groupby('학교별 학과명')['학점'].sum().mean()
    # The stopping threshold does not change inside the loop, so compute it once.
    credit_target = min(0.85 * average_sum_of_credits, 85)
    jsd_ignored_once = False  # the JSD-ignoring fallback may run at most once

    def add_course(projection, mode_credit_sums, subfield_data, selected_topics, ignore_jsd=False):
        """Add the best remaining topic to ``selected_topics``; return True if one was added.

        A candidate replaces the current best when its summed projection
        weight towards the already selected topics is strictly larger and,
        unless ``ignore_jsd`` is set, its average JSD is strictly lower than
        the best JSD seen so far (initially that of the current selection).
        """
        best_jsd = calculate_average_jsd(selected_topics, subfield_data)
        best_topic, best_weight_sum = None, 0

        for topic in projection.nodes:
            normalized_topic = normalize_topic(topic)
            if normalized_topic in selected_normalized_topics:
                continue
            if any(excluded in normalized_topic for excluded in excluded_words):
                continue

            weight_sum = sum(
                projection[topic][neighbor]['weight']
                for neighbor in selected_topics
                if projection.has_edge(topic, neighbor)
            )
            # The weight test is required in every case, so run it first and
            # skip the much more expensive JSD evaluation for losing candidates.
            if weight_sum <= best_weight_sum:
                continue

            temp_selected_topics = selected_topics.copy()
            temp_selected_topics[topic] = mode_credit_sums.get(topic, 3)
            new_jsd = calculate_average_jsd(temp_selected_topics, subfield_data)

            if ignore_jsd or new_jsd < best_jsd:
                best_jsd, best_topic, best_weight_sum = new_jsd, topic, weight_sum

        if best_topic is None:
            return False

        selected_topics[best_topic] = mode_credit_sums.get(best_topic, 3)
        selected_normalized_topics.add(normalize_topic(best_topic))
        print(f"Added course '{best_topic}' with weight sum {best_weight_sum} and updated JSD: {best_jsd} (Ignore JSD: {ignore_jsd})")
        return True

    # Loop: run until the credit requirement is met
    while total_credits < credit_target:
        # First try to add a topic that also improves the JSD.
        added = add_course(cluster_topic_projection, mode_credit_sums, subfield_data, selected_topics)

        # If nothing qualified while still below half of the average credit
        # sum, allow exactly one addition that ignores the JSD criterion.
        if not added and total_credits < 0.5 * average_sum_of_credits and not jsd_ignored_once:
            added = add_course(cluster_topic_projection, mode_credit_sums, subfield_data, selected_topics, ignore_jsd=True)
            jsd_ignored_once = True

        if not added:
            print("No further course addition possible. Exiting loop.")
            break

        total_credits = sum(selected_topics.values())

    final_topics_df = pd.DataFrame({
        "Course": list(selected_topics.keys()),
        "Credits": list(selected_topics.values())
    })

    print(final_topics_df)
    print(f"Final Total Credits: {total_credits}")
    return final_topics_df

In [None]:
import itertools
import pandas as pd

# Subfields to process, in Python's default string sort order. 'N.C.E.' is
# excluded, and missing labels are skipped because sorted() cannot compare
# NaN with strings.
sorted_subfields = sorted(
    subfield
    for subfield in stem_data['표준분류소계열'].unique()
    if pd.notna(subfield) and subfield != "N.C.E."
)

# Compute the optimised combination for each subfield and save it to its own Excel file
for target_subfield in sorted_subfields:
    print(f"Processing subfield: {target_subfield}")

    # Compute the optimised course combination for this subfield
    result_df = optimized_combination_single(target_subfield)

    # Save the result to an Excel file
    filename = f'/Users/gahyoungim/Library/CloudStorage/GoogleDrive-dotchgahyoun@gmail.com/내 드라이브/Curriculum Analysis Project/K-STEM Curriculum Network Analysis/Interdisciplinary Majors/single major jsd base/{target_subfield}_single_major.xlsx'
    with pd.ExcelWriter(filename) as writer:
        result_df.to_excel(writer, sheet_name="Courses and Credits", index=False)

    print(f"Saved {filename}")

# **Double major**

In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import re
from scipy.spatial.distance import jensenshannon
import random
import itertools

# Load the curriculum table (note: a hard-coded path, different from the one
# used in the single-major section), build the combined '학교별 학과명' key
# column, and treat a missing credit value ('학점') as 3.
stem_data = pd.read_excel('/home/hedgehog/Interdisciplinary Majors/교육과정_대학(20240305)_preprocessed_STEM_fin_with_groups_final.xlsx')
stem_data['학교별 학과명'] = stem_data['학교명'] + ' ' + stem_data['학부·과(전공)명']
stem_data['학점'] = stem_data['학점'].fillna(3)

# Theme string normalisation function
def normalize_topic(topic):
    """Return a canonical comparison key for a cluster-topic label.

    Missing values (None / NaN) map to ''. Any other value is converted to
    ``str``, loses the digits at its very end, and is then stripped of
    surrounding whitespace and lower-cased.
    """
    if not pd.isna(topic):
        without_suffix = re.sub(r'\d+$', '', str(topic))
        return without_suffix.strip().lower()
    return ''

# Exclusion list: a topic is skipped when ANY of these strings occurs as a
# substring of its normalised (lower-cased) name, not only on an exact match.
excluded_words = {"none", "junior seminars", "career", "entrepreneurship", "internship",
                  "fieldwork", "capstone", "writing", "field trip", "project-based learning"}

# Initial subject selection function
def determine_initial_courses_single(subfield_data):
    """Pick up to five seed topics for one subfield.

    Credits are summed per (department, topic) pair. Topics are then visited
    in ``value_counts`` order of those pairs; a topic is kept when its
    normalised name is new and contains none of the ``excluded_words``
    substrings.

    Args:
        subfield_data: DataFrame with the columns '학교별 학과명',
            'Cluster Topic' and '학점'.

    Returns:
        ``(selected_topics, mode_credit_sums)``: a dict mapping each chosen
        topic to its credit value, and a Series indexed by topic with the
        first mode of the per-department credit sums (3 if there is no mode).
        Both are empty for an empty frame.
    """
    # Nothing to choose from: hand back empty containers of the usual types.
    if subfield_data.empty:
        return {}, pd.Series(dtype=float)

    def typical_credits(values):
        modes = values.mode()
        return 3 if modes.empty else modes[0]

    per_dept_topic = (
        subfield_data
        .groupby(['학교별 학과명', 'Cluster Topic'])['학점']
        .sum()
        .reset_index()
    )
    ranked_topics = per_dept_topic['Cluster Topic'].value_counts().index
    mode_credit_sums = per_dept_topic.groupby('Cluster Topic')['학점'].agg(typical_credits)

    selected_topics = {}
    seen_keys = set()

    for topic in ranked_topics:
        key = normalize_topic(topic)
        if key in seen_keys or topic not in mode_credit_sums:
            continue
        if any(banned in key for banned in excluded_words):
            continue

        selected_topics[topic] = mode_credit_sums[topic]
        seen_keys.add(key)
        # The size only changes on an insertion, so this is the only place
        # where the five-topic limit can be reached.
        if len(selected_topics) >= 5:
            break

    return selected_topics, mode_credit_sums

# Calculate network-wide average JSD function
def calculate_average_jsd(selected_topics, subfield_data):
    """Average the JSD between a candidate course set and every department.

    Each department's credits are summed per 'Cluster Topic' before being
    normalised into a distribution over all topics of ``subfield_data``.

    NOTE: a function with the same name is defined again further down in this
    file; when the whole file runs, that later definition replaces this one.

    Args:
        selected_topics: Mapping from topic to credit value for the candidate set.
        subfield_data: DataFrame with the columns '학교별 학과명',
            'Cluster Topic' and '학점'.

    Returns:
        ``np.mean`` of the per-department divergences.
    """
    topic_axis = subfield_data['Cluster Topic'].unique()
    candidate_vector = create_distribution_vector(selected_topics, topic_axis)

    def department_vector(dept_name):
        rows = subfield_data[subfield_data['학교별 학과명'] == dept_name]
        credits_by_topic = rows.groupby('Cluster Topic')['학점'].sum().to_dict()
        return create_distribution_vector(credits_by_topic, topic_axis)

    jsd_scores = [
        calculate_jsd(candidate_vector, department_vector(dept_name))
        for dept_name in subfield_data['학교별 학과명'].unique()
    ]
    return np.mean(jsd_scores)

# generate bipartite graph function
def create_bipartite_graph(subfield_data):
    """Build the department-topic graph for the given rows.

    Each row yields one edge from "학교-<학교별 학과명>" to the row's
    'Cluster Topic', carrying the row's '학점' as the ``weight`` attribute.
    If a pair repeats, the weight of the last matching row is kept.

    Args:
        subfield_data: DataFrame with the columns '학교별 학과명',
            'Cluster Topic' and '학점'.

    Returns:
        An undirected ``nx.Graph``.
    """
    weighted_edges = (
        (f"학교-{record['학교별 학과명']}", record['Cluster Topic'], record['학점'])
        for _, record in subfield_data.iterrows()
    )
    graph = nx.Graph()
    graph.add_weighted_edges_from(weighted_edges)
    return graph

# Function that projects the bipartite graph onto its Cluster Topic nodes
def project_cluster_topic_network(B):
    """Project the bipartite graph onto its topic nodes.

    Nodes whose name starts with "학교-" are department nodes; every other
    node is treated as a topic.

    Args:
        B: Graph with string node names, as built by ``create_bipartite_graph``.

    Returns:
        The result of ``nx.bipartite.weighted_projected_graph`` for the topic
        nodes.
    """
    department_prefix = "학교-"
    topic_side = set()
    for node in B.nodes():
        if not node.startswith(department_prefix):
            topic_side.add(node)
    return nx.bipartite.weighted_projected_graph(B, topic_side)

# JSD calculation function
def calculate_jsd(distribution1, distribution2):
    """Return the base-2 Jensen-Shannon divergence of two vectors.

    Args:
        distribution1: NumPy vector.
        distribution2: NumPy vector of the same length.

    Returns:
        1.0 when either vector sums to zero (it cannot be normalised into a
        distribution); otherwise the square of scipy's Jensen-Shannon
        distance.
    """
    for candidate in (distribution1, distribution2):
        if candidate.sum() == 0:
            return 1.0
    distance = jensenshannon(distribution1, distribution2, base=2)
    return distance ** 2


# Generate subject distribution function
def create_distribution_vector(courses, all_courses):
    """Turn a topic-to-credits mapping into a normalised vector.

    Args:
        courses: Mapping from topic to credit value.
        all_courses: Ordered sequence of topics; it fixes the vector layout.

    Returns:
        A float array with one entry per element of ``all_courses`` (0 where
        the topic is absent from ``courses``), scaled to sum to 1. If the sum
        is zero the unscaled vector is returned.
    """
    counts = np.zeros(len(all_courses))
    for position, course_name in enumerate(all_courses):
        if course_name in courses:
            counts[position] = courses[course_name]

    denominator = counts.sum()
    # Avoid a division by zero: leave an all-zero vector untouched.
    return counts if denominator == 0 else counts / denominator


# Calculate network-wide average JSD function
def calculate_average_jsd(selected_topics, subfield_data):
    """Average the JSD between a candidate course set and every department.

    This definition comes after another ``calculate_average_jsd`` in the same
    file and is therefore the one in effect once the file has run.

    Args:
        selected_topics: Mapping from topic to credit value for the candidate set.
        subfield_data: DataFrame with the columns '학교별 학과명',
            'Cluster Topic' and '학점'.

    Returns:
        Mean of the per-department divergences, or NaN if there are no
        departments.
    """
    all_courses = subfield_data['Cluster Topic'].unique()
    selected_distribution = create_distribution_vector(selected_topics, all_courses)
    jsd_scores = []
    for dept_name in subfield_data['학교별 학과명'].unique():
        dept_courses = subfield_data[subfield_data['학교별 학과명'] == dept_name]
        # Sum credits per topic. A department may have several rows under the
        # same topic, and set_index(...).to_dict() would keep only the last.
        dept_credits = dept_courses.groupby('Cluster Topic')['학점'].sum().to_dict()
        dept_distribution = create_distribution_vector(dept_credits, all_courses)
        jsd_scores.append(calculate_jsd(selected_distribution, dept_distribution))
    # np.mean([]) also gives NaN but emits a RuntimeWarning; return it directly.
    return np.mean(jsd_scores) if jsd_scores else float('nan')

# single-mode only optimisation function
def optimized_combination_single(subfield):
    """Greedily assemble a single-major course list for one subfield.

    Rows of the module-level ``stem_data`` whose '표준분류소계열' equals
    ``subfield`` are used. Starting from the seed topics, topics are added one
    at a time until the credit total reaches
    ``min(0.85 * mean departmental credit sum, 85)`` or no topic qualifies.
    Progress and the final table are printed.

    Args:
        subfield: Value of the '표준분류소계열' column to process.

    Returns:
        DataFrame with the columns "Course" and "Credits".
    """
    subfield_data = stem_data[stem_data['표준분류소계열'] == subfield]
    initial_courses, mode_credit_sums = determine_initial_courses_single(subfield_data)
    selected_topics = initial_courses.copy()
    selected_normalized_topics = {normalize_topic(topic) for topic in selected_topics}

    B = create_bipartite_graph(subfield_data)
    cluster_topic_projection = project_cluster_topic_network(B)

    total_credits = sum(selected_topics.values())
    average_sum_of_credits = subfield_data.groupby('학교별 학과명')['학점'].sum().mean()
    # The stopping threshold does not change inside the loop, so compute it once.
    credit_target = min(0.85 * average_sum_of_credits, 85)
    jsd_ignored_once = False  # the JSD-ignoring fallback may run at most once

    def add_course(projection, mode_credit_sums, subfield_data, selected_topics, ignore_jsd=False):
        """Add the best remaining topic to ``selected_topics``; return True if one was added.

        A candidate replaces the current best when its summed projection
        weight towards the already selected topics is strictly larger and,
        unless ``ignore_jsd`` is set, its average JSD is strictly lower than
        the best JSD seen so far (initially that of the current selection).
        """
        best_jsd = calculate_average_jsd(selected_topics, subfield_data)
        best_topic, best_weight_sum = None, 0

        for topic in projection.nodes:
            normalized_topic = normalize_topic(topic)
            if normalized_topic in selected_normalized_topics:
                continue
            if any(excluded in normalized_topic for excluded in excluded_words):
                continue

            # Sum of projection weights towards the topics already selected
            weight_sum = sum(
                projection[topic][neighbor]['weight']
                for neighbor in selected_topics
                if projection.has_edge(topic, neighbor)
            )
            # The weight test is required in every case, so run it first and
            # skip the much more expensive JSD evaluation for losing candidates.
            if weight_sum <= best_weight_sum:
                continue

            temp_selected_topics = selected_topics.copy()
            temp_selected_topics[topic] = mode_credit_sums.get(topic, 3)
            new_jsd = calculate_average_jsd(temp_selected_topics, subfield_data)

            if ignore_jsd or new_jsd < best_jsd:
                best_jsd, best_topic, best_weight_sum = new_jsd, topic, weight_sum

        if best_topic is None:
            return False

        selected_topics[best_topic] = mode_credit_sums.get(best_topic, 3)
        selected_normalized_topics.add(normalize_topic(best_topic))
        print(f"Added course '{best_topic}' with weight sum {best_weight_sum} and updated JSD: {best_jsd} (Ignore JSD: {ignore_jsd})")
        return True

    # Loop: run until the credit requirement is met
    while total_credits < credit_target:
        # First try to add a topic that also improves the JSD.
        added = add_course(cluster_topic_projection, mode_credit_sums, subfield_data, selected_topics)

        # If nothing qualified while still below half of the average credit
        # sum, allow exactly one addition that ignores the JSD criterion.
        if not added and total_credits < 0.5 * average_sum_of_credits and not jsd_ignored_once:
            added = add_course(cluster_topic_projection, mode_credit_sums, subfield_data, selected_topics, ignore_jsd=True)
            jsd_ignored_once = True

        if not added:
            print("No further course addition possible. Exiting loop.")
            break

        total_credits = sum(selected_topics.values())

    final_topics_df = pd.DataFrame({
        "Course": list(selected_topics.keys()),
        "Credits": list(selected_topics.values())
    })

    print(final_topics_df)
    print(f"Final Total Credits: {total_credits}")
    return final_topics_df

def optimized_combination_double(subfield1, subfield2):
    """Greedily assemble a double-major course list for two subfields.

    The seed topics of both subfields are merged, then the topic that yields
    the lowest mean of the two subfields' average JSDs is added repeatedly
    until the credit total reaches the target or no candidate is left.
    Data comes from the module-level ``stem_data``; progress and the final
    table are printed.

    Args:
        subfield1: First '표준분류소계열' value.
        subfield2: Second '표준분류소계열' value.

    Returns:
        DataFrame with the columns "Course" and "Credits".
    """
    random.seed(42)  # kept for compatibility; this function draws no random numbers

    # Extract the rows of the two subfields
    subfield_data1 = stem_data[stem_data['표준분류소계열'] == subfield1]
    subfield_data2 = stem_data[stem_data['표준분류소계열'] == subfield2]
    combined_data = pd.concat([subfield_data1, subfield_data2], ignore_index=True)

    # Select the initial seed courses (up to 5 from each subfield) and merge them
    initial_courses1, mode_credit_sums1 = determine_initial_courses_single(subfield_data1)
    initial_courses2, mode_credit_sums2 = determine_initial_courses_single(subfield_data2)
    selected_topics = {**initial_courses1, **initial_courses2}
    selected_normalized_topics = {normalize_topic(topic) for topic in selected_topics}

    def credits_for(topic):
        """Credit value for ``topic``: subfield 1's mode, else subfield 2's, else 3."""
        # Previously only subfield 1's modes were consulted, so topics that
        # exist only in subfield 2 always received the default of 3 credits.
        for modes in (mode_credit_sums1, mode_credit_sums2):
            value = modes.get(topic)
            if value is not None:
                return value
        return 3

    # Build the bipartite graph from the combined data and project it onto topics
    B = create_bipartite_graph(combined_data)
    cluster_topic_projection = project_cluster_topic_network(B)

    # Credit total so far and the stopping threshold
    total_credits = sum(selected_topics.values())
    avg_credits1 = subfield_data1.groupby('학교별 학과명')['학점'].sum().mean()
    avg_credits2 = subfield_data2.groupby('학교별 학과명')['학점'].sum().mean()
    # NOTE(review): the original wrapped this SUM in np.average(), which is a
    # no-op on a scalar. The value is kept as-is; confirm whether the mean of
    # the two averages was intended instead.
    credit_target = min(0.85 * (avg_credits1 + avg_credits2), 85)

    def add_course_to_double(ignore_jsd=False):
        """Add the candidate with the lowest mean JSD; return True if one was added."""
        # NOTE(review): with ignore_jsd=True every candidate replaces the
        # previous one, so the last eligible node wins. No caller passes True.
        best_jsd, best_topic = float('inf'), None

        for topic in cluster_topic_projection.nodes:
            normalized_topic = normalize_topic(topic)
            if (
                normalized_topic in selected_normalized_topics
                or any(excluded in normalized_topic for excluded in excluded_words)
            ):
                continue

            # Mean of the two subfields' average JSD with this topic added
            temp_selected_topics = selected_topics.copy()
            temp_selected_topics[topic] = credits_for(topic)
            jsd1 = calculate_average_jsd(temp_selected_topics, subfield_data1)
            jsd2 = calculate_average_jsd(temp_selected_topics, subfield_data2)
            avg_jsd = (jsd1 + jsd2) / 2

            if ignore_jsd or avg_jsd < best_jsd:
                best_jsd, best_topic = avg_jsd, topic

        if best_topic is None:
            return False

        selected_topics[best_topic] = credits_for(best_topic)
        selected_normalized_topics.add(normalize_topic(best_topic))
        print(f"Added course '{best_topic}' with new JSD: {best_jsd} (Ignore JSD: {ignore_jsd})")
        return True

    # Loop: run until the credit requirement is met
    while total_credits < credit_target:
        if not add_course_to_double():
            # The original re-ran the single-major optimisation here, discarded
            # its result and retried with unchanged state, which never
            # terminates. With no candidate left, stop instead.
            print("No valid double candidate. Exiting loop.")
            break

        total_credits = sum(selected_topics.values())

    # Output the final result
    final_topics_df = pd.DataFrame({"Course": list(selected_topics.keys()), "Credits": list(selected_topics.values())})
    print("Final Topics for Combined Subfields:")
    print(final_topics_df)
    print(f"Total Credits: {total_credits}")

    return final_topics_df


# All subfields in Python's default string sort order. 'N.C.E.' is excluded,
# and missing labels are skipped because sorted() cannot compare NaN with strings.
all_subfields = sorted(
    subfield
    for subfield in stem_data['표준분류소계열'].unique()
    if pd.notna(subfield) and subfield != "N.C.E."
)

# Every unordered pair of subfields
combinations = list(itertools.combinations(all_subfields, 2))

# Optimise each pair and save it to its own Excel file. Without this loop,
# subfield1 and subfield2 were never bound and the code raised NameError.
for subfield1, subfield2 in combinations:
    print(f"Processing combination: {subfield1} & {subfield2}")
    result_df = optimized_combination_double(subfield1, subfield2)
    filename = f"/home/hedgehog/Interdisciplinary Majors/double_major/{subfield1}_{subfield2}_double_major.xlsx"
    result_df.to_excel(filename, index=False)
    print(f"Saved {filename}")