In [1]:
import numpy as np

In [2]:
# Hyperparameters of the synthetic enrollment generator. #
# "sub" abbreviates subject (the key of one entry in the crawled catalogue).
K = 100    # dimensionality of every latent level vector
N_topic_p_sub = 10    # number of non-zero latent dimensions drawn for each subject
N_stu = 50000         # number of students to synthesize
N_semaster = 12            # number of semesters to simulate (identifier spelling kept as-is)
N_cou_p_stu_p_sem = np.array([4,5,6,7]) # candidate course loads; one value is drawn per student per semester
in_sub = 0.8        # probability that a course slot is filled from the student's own subject


In [3]:
# Load the crawled course catalogue (one entry per subject). #
import cPickle
file_real_course = "./data/courses_crawled_by_major"
# NOTE(review): unpickling can execute arbitrary code; only load catalogue files you trust.
# The context manager closes the handle as soon as the catalogue is read
# (the original left it open); the open mode is unchanged.
with open(file_real_course, "r") as f_real_course:
    data_real_course = cPickle.load(f_real_course)
print(len(data_real_course))
print(data_real_course.keys())
print(data_real_course["Korean (KOR)"])

238
[u'Korean (KOR)', u'Environmental Science (ENVSC)', u'Recreation, Park, and Tourism Management (RPTM)', u'Agricultural Science (AGSC)', u'Security and Risk Analysis (SRA)', u'American Studies (AMST)', u'Graphic Design (GD)', u'Occupational Therapy (OT)', u'Education (EDUC)', u'Communications (COMM)', u'School of Science Engineering and Technology (SSET)', u'Engineering (ENGR)', u'Science (SC)', u'Criminal Justice (CRIMJ)', u'Biorenewable Systems (BRS)', u'Latin (LATIN)', u'Physical Therapy (PT)', u'International Agriculture (INTAG)', u'Nuclear Engineering (NUCE)', u'Chemical Engineering (CHE)', u'Anthropology (ANTH)', u'Arts and Architecture Administrative (AAADM)', u'Mathematics (MATH)', u'Comparative Literature (CMLIT)', u'Finance (FIN)', u'Petroleum and Natural Gas Engineering (PNG)', u'Business Law (BLAW)', u'Arts and Architecture (AA)', u'Swahili (SWA)', u'Humanities and Social Sciences (HSS)', u'Japanese (JAPNS)', u'Industrial Engineering Technology (IET)', u'English as a Sec

In [4]:
# Synthesize Latent level #

def level_generate(N_sparse, N_range=K, scale=1.0, sparse_return=False):
    """Draw a random sparse, non-negative level vector.

    ``N_sparse`` distinct positions out of ``N_range`` receive uniform random
    weights, rescaled so that the largest weight equals ``scale``; all other
    positions are zero.

    Returns the dense vector of length ``N_range``, or, when ``sparse_return``
    is true, the pair ``(positions, weights)`` holding only the non-zeros.
    """
    positions = np.random.choice(N_range, N_sparse, replace=False)
    weights = np.random.random(N_sparse)
    # normalise to a maximum of 1 first, then apply the requested scale
    weights = scale * (weights / np.max(weights))
    if sparse_return:
        return positions, weights
    dense = np.zeros(N_range)
    dense[positions] = weights
    return dense

In [5]:
# Give every subject its own sparse latent profile, stored as (indices, weights). #
for sub_name in data_real_course:
    sub_entry = data_real_course[sub_name]
    sub_entry["level"] = level_generate(N_topic_p_sub, sparse_return=True)
print(data_real_course['Korean (KOR)']["level"])

(array([ 0, 71, 33, 80, 85, 25, 88, 57, 37, 29]), array([ 0.394271  ,  0.92088794,  0.60890168,  0.69639774,  0.55481966,
        0.54539037,  0.82607731,  0.56352906,  0.66974169,  1.        ]))


In [6]:
def courses_level_generate(ind, dist, N_cou, N_range=K, start=0.01):
    """Generate one dense level vector per course of a subject.

    Parameters
    ----------
    ind : numpy array
        Positions of the subject's non-zero latent dimensions.
    dist : numpy array
        Subject weights at those positions (same length as ``ind``).
    N_cou : int
        Number of course vectors to generate.
    N_range : int
        Length of every returned vector.
    start : float
        Lower bound of the uniform draw for the per-course scale factors.

    Returns
    -------
    list of numpy arrays
        ``N_cou`` vectors of length ``N_range``, ordered by ascending scale
        factor (the last one has factor 1). Each vector is non-zero only at
        ``ind``, where it equals ``factor * dist * noise`` with a fresh noise
        vector normalised to a maximum of 1.
    """
    # A subject without courses would otherwise crash in np.max on an empty array.
    if N_cou == 0:
        return []
    level_scale = np.sort(np.random.random(N_cou) * (1 - start) + start)
    level_scale = level_scale / np.max(level_scale)
    courses_level = []
    for scale_cou in level_scale:
        noise = np.random.random(ind.shape[0])
        noise = noise / np.max(noise)
        cou_level = np.zeros(N_range)
        cou_level[ind] = scale_cou * dist * noise
        courses_level.append(cou_level)
    return courses_level

In [7]:
# Sort each subject's courses by their "num" field and hand out the generated
# level vectors in that order (they come back ordered by ascending scale factor). #
for sub_name in data_real_course:
    sub_entry = data_real_course[sub_name]
    courses = sub_entry["courses"]
    courses_list = sorted(
        [[cou_name, courses[cou_name]["num"]] for cou_name in courses],
        key=lambda pair: pair[1])
    sub_entry["courses_list_ordered"] = [pair[0] for pair in courses_list]

    sub_level = sub_entry["level"]
    courses_level = courses_level_generate(ind=sub_level[0], dist=sub_level[1], N_cou=len(courses_list))

    for pair, cou_level in zip(courses_list, courses_level):
        courses[pair[0]]["level"] = cou_level
print(data_real_course["Korean (KOR)"])

{'url': u'/university-course-descriptions/undergraduate/kor/', 'courses': {u'KOR424': {'prereq_courses': [u'KOR120', u'KOR121', u'ASIA100', u'ASIA102', u'ASIA004', u'CMLIT004'], 'major': u'KOR', 'description': u'\n\t\t\t\tExploration of seminal Korean texts, including poetry, fiction, autobiography, and criticism, from the early twentieth century to the contemporary era. This course provides a comprehensive overview of modern Korean literature within a transnational context. As we learn how to critically analyze seminal Korean texts, we will locate them in the social, political, economic, and cultural conditions under which they were produced and received. In grappling with some of the fundamental issues they raise;including colonialism, migration, national division, war, gender relations, developmentalism, urbanization, democratization, and contemporary consumer culture;we will also seek to situate these writings in the Korean vernacular within the larger context of global modernity. 

In [8]:
# Synthesize students: a uniformly random subject plus a dense initial level. #
stu_dict = {}
subs_list = data_real_course.keys()
N_sub = len(subs_list)
for i in range(N_stu):
    sub_name = subs_list[np.random.randint(N_sub)]
    sub_entry = data_real_course[sub_name]
    # running head count of students enrolled in this subject
    sub_entry["N_stu"] = sub_entry.get("N_stu", 0) + 1
    # "level" is a list; the simulation later appends one vector per semester
    stu_dict[i] = {
        "sub": sub_name,
        "level": [level_generate(N_sparse=K, N_range=K, scale=0.2)],
    }
print(stu_dict[0])

{'sub': u'Geography (GEOG)', 'level': [array([ 0.00898088,  0.14823021,  0.17040438,  0.08990204,  0.00110816,
        0.1731696 ,  0.00266043,  0.02256423,  0.19370038,  0.00187357,
        0.0905741 ,  0.04309378,  0.06836085,  0.03513546,  0.01766123,
        0.01730429,  0.05302066,  0.05222098,  0.13863535,  0.17195675,
        0.11053703,  0.16247882,  0.08733896,  0.02371077,  0.03655564,
        0.07796913,  0.06134286,  0.15228704,  0.15495693,  0.00713681,
        0.06921099,  0.10159676,  0.06974416,  0.19670656,  0.06819331,
        0.04299711,  0.2       ,  0.11988428,  0.07771742,  0.04322466,
        0.04382109,  0.13049922,  0.0536835 ,  0.03561118,  0.01801422,
        0.19019254,  0.0605558 ,  0.04701785,  0.1958277 ,  0.17900365,
        0.1217917 ,  0.0791968 ,  0.12187358,  0.05649097,  0.17333939,
        0.10653765,  0.07054044,  0.11728074,  0.03621689,  0.07336477,
        0.11838328,  0.11246937,  0.07868593,  0.07119149,  0.02288742,
        0.15564257,  0.08

In [9]:
# Prepare empty per-semester enrollment and grade slots on every student record. #
# enroll_stu is a second name for stu_dict, not a copy.
enroll_stu = stu_dict
for stu in enroll_stu:
    stu_record = enroll_stu[stu]
    stu_record["cou_taken"] = [[] for _sem in range(N_semaster)]
    stu_record["grade"] = [[] for _sem in range(N_semaster)]
print(enroll_stu[0])

{'grade': [[], [], [], [], [], [], [], [], [], [], [], []], 'cou_taken': [[], [], [], [], [], [], [], [], [], [], [], []], 'sub': u'Geography (GEOG)', 'level': [array([ 0.00898088,  0.14823021,  0.17040438,  0.08990204,  0.00110816,
        0.1731696 ,  0.00266043,  0.02256423,  0.19370038,  0.00187357,
        0.0905741 ,  0.04309378,  0.06836085,  0.03513546,  0.01766123,
        0.01730429,  0.05302066,  0.05222098,  0.13863535,  0.17195675,
        0.11053703,  0.16247882,  0.08733896,  0.02371077,  0.03655564,
        0.07796913,  0.06134286,  0.15228704,  0.15495693,  0.00713681,
        0.06921099,  0.10159676,  0.06974416,  0.19670656,  0.06819331,
        0.04299711,  0.2       ,  0.11988428,  0.07771742,  0.04322466,
        0.04382109,  0.13049922,  0.0536835 ,  0.03561118,  0.01801422,
        0.19019254,  0.0605558 ,  0.04701785,  0.1958277 ,  0.17900365,
        0.1217917 ,  0.0791968 ,  0.12187358,  0.05649097,  0.17333939,
        0.10653765,  0.07054044,  0.11728074,  

In [10]:
# Semester/course affinity table per subject: entry [i_sem, i_cou] is a
# Gaussian-shaped bump in the gap between the normalised semester index and the
# normalised position of the course in "courses_list_ordered". #
SCALE_COU_SEM = 0.2
for sub in data_real_course:
    N_cou_p_sub = len(data_real_course[sub]["courses_list_ordered"])
    cou_sem_center = np.zeros([N_semaster, N_cou_p_sub])
    for i_sem in range(N_semaster):
        sem_pos = i_sem * 1.0 / N_semaster
        for i_cou in range(N_cou_p_sub):
            gap = sem_pos - i_cou * 1.0 / N_cou_p_sub
            cou_sem_center[i_sem, i_cou] = np.exp(- np.power(gap, 2.0) / SCALE_COU_SEM)
    data_real_course[sub]["cou_sem_center"] = cou_sem_center
print(data_real_course["Korean (KOR)"])

{'N_stu': 195, 'courses_list_ordered': [u'KOR120', u'KOR121', u'KOR424', u'KOR425', u'KOR494H', u'KOR099'], 'url': u'/university-course-descriptions/undergraduate/kor/', 'level': (array([ 0, 71, 33, 80, 85, 25, 88, 57, 37, 29]), array([ 0.394271  ,  0.92088794,  0.60890168,  0.69639774,  0.55481966,
        0.54539037,  0.82607731,  0.56352906,  0.66974169,  1.        ])), 'courses': {u'KOR424': {'prereq_courses': [u'KOR120', u'KOR121', u'ASIA100', u'ASIA102', u'ASIA004', u'CMLIT004'], 'major': u'KOR', 'description': u'\n\t\t\t\tExploration of seminal Korean texts, including poetry, fiction, autobiography, and criticism, from the early twentieth century to the contemporary era. This course provides a comprehensive overview of modern Korean literature within a transnational context. As we learn how to critically analyze seminal Korean texts, we will locate them in the social, political, economic, and cultural conditions under which they were produced and received. In grappling with some

In [11]:
# Flat lookup: course id -> the very same record object stored under its subject
# (no copy is made), tagged with the owning subject key. #
cou_dict = {}
for sub in data_real_course:
    is_korean = (sub == "Korean (KOR)")
    for cou_name, cou_record in data_real_course[sub]["courses"].items():
        cou_dict[cou_name] = cou_record
        cou_record["sub"] = sub

        # debug listing of one small subject
        if is_korean:
            print(cou_name)

def cou_taken_generate_sub(i_sem, i_sub, sub_dict, cou_taken, N_sample):
    """Sample courses of one subject for a student in semester ``i_sem``.

    Parameters
    ----------
    i_sem : int
        Semester index; selects the row of the subject's "cou_sem_center" table.
    i_sub : hashable
        Key of the subject in ``sub_dict``.
    sub_dict : dict
        Subject key -> record holding "courses", "courses_list_ordered" and
        "cou_sem_center".
    cou_taken : list of lists
        Course ids per semester; everything listed before ``i_sem`` is excluded.
    N_sample : int
        Maximum number of courses to draw.

    Returns
    -------
    list
        Up to ``N_sample`` distinct course ids not taken in earlier semesters,
        drawn without replacement with probabilities proportional to row
        ``i_sem`` of "cou_sem_center". Empty when nothing is left to take.

    Raises
    ------
    KeyError
        If ``i_sub`` (or one of the required record fields) is missing. The
        original bare ``except`` only printed and then failed later on an
        undefined local; the error is now raised at its source.
    """
    sub_entry = sub_dict[i_sub]

    already_taken = set()
    for sem_courses in cou_taken[:i_sem]:
        already_taken.update(sem_courses)
    cou_list_remain = list(set(sub_entry["courses"]) - already_taken)
    if len(cou_list_remain) == 0:
        return []

    # One pass to map course id -> column (first occurrence, like list.index),
    # instead of a linear list.index scan per remaining course.
    cou_pos = {}
    for pos, cou_name in enumerate(sub_entry["courses_list_ordered"]):
        cou_pos.setdefault(cou_name, pos)
    cols = [cou_pos[cou_name] for cou_name in cou_list_remain]

    cou_dist_remain = sub_entry["cou_sem_center"][i_sem, cols]
    cou_dist_remain = cou_dist_remain / np.sum(cou_dist_remain)
    N_draw = min(N_sample, len(cou_list_remain))
    return np.random.choice(cou_list_remain, N_draw, p=cou_dist_remain, replace=False).tolist()

def cou_taken_generate(i_sem, cou_taken, sub, sub_dict, N_cou_range=N_cou_p_stu_p_sem):
    """Sample the courses one student takes in semester ``i_sem``.

    A course load is drawn uniformly from ``N_cou_range`` and split by a
    multinomial draw into in-subject slots (probability ``in_sub``) and
    out-of-subject slots. In-subject courses come from ``sub``; each
    out-of-subject slot picks a uniformly random subject of ``sub_dict``
    (which may happen to be ``sub`` again) and draws one course from it.

    Parameters
    ----------
    i_sem : int
        Semester index.
    cou_taken : list of lists
        The student's course ids per semester; earlier semesters are excluded
        from sampling by ``cou_taken_generate_sub``.
    sub : hashable
        Key of the student's own subject in ``sub_dict``.
    sub_dict : dict
        Subject key -> subject record.
    N_cou_range : numpy array
        Candidate course loads.

    Returns
    -------
    list
        Distinct course ids for this semester. It can be shorter than the
        drawn load when a subject runs out of courses or when an
        out-of-subject draw repeats a course already picked this semester
        (such repeats are dropped; the original kept them as duplicates).
    """
    # Plain int: multinomial expects a scalar trial count, not a length-1 array.
    N_cou = int(np.random.choice(N_cou_range.tolist()))
    N_cou_sub, N_cou_out = np.random.multinomial(N_cou, [in_sub, 1.0 - in_sub])

    # in_sub course sampling #
    cou_taken_sem = cou_taken_generate_sub(i_sem, sub, sub_dict, cou_taken, N_sample=N_cou_sub)

    # out sub course sampling #
    sub_keys = list(sub_dict)
    for _slot in range(N_cou_out):
        i_sub = np.random.choice(sub_keys, 1)[0]
        for cou_name in cou_taken_generate_sub(i_sem, i_sub, sub_dict, cou_taken, N_sample=1):
            # the helper only excludes earlier semesters, so guard against
            # re-picking something already chosen for this semester
            if cou_name not in cou_taken_sem:
                cou_taken_sem.append(cou_name)

    return cou_taken_sem

def sigmoid(x):
    """Logistic function 1 / (1 + e**(-x)); applied elementwise to numpy arrays."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

def grade_generate_single_cou(cid, stu_level, cou_dict=cou_dict, learning_noise_scale=0.5, learning_noise_low=0.3, diverse_scale=1):
    """Simulate the numeric grade of one student in one course.

    A single uniform random number, shifted by ``learning_noise_low`` and
    multiplied by ``learning_noise_scale``, scales the course level into a
    noise vector that is added to ``stu_level``. Only dimensions where this
    noised level still falls short of the course level contribute (as negative
    values); their sum, times ``diverse_scale``, goes through ``sigmoid`` and
    is multiplied by 200. The summed shortfall is never positive, so the
    result is at most 100.
    """
    cou_level = cou_dict[cid]["level"]
    noise_factor = (np.random.random() - learning_noise_low) * learning_noise_scale
    learning_noise_level = noise_factor * cou_level

    shortfall = (stu_level + learning_noise_level - cou_level).clip(max=0.0)
    return sigmoid(np.sum(shortfall * diverse_scale)) * 200

# Smoke test: grade a random student in KOR120 using the initial level vector. #
stu = np.random.randint(N_stu)
grade_smoke = grade_generate_single_cou("KOR120", enroll_stu[stu]["level"][0])
print("%s %s" % (stu, grade_smoke))

def grade_generate(i_sem, stu, enroll_stu, cou_dict, cou_sem_assigned=None):
    """Generate the grades of one student for one semester.

    Parameters
    ----------
    i_sem : int
        Semester index; selects the student's level vector and, unless
        overridden, the course list.
    stu : hashable
        Key of the student in ``enroll_stu``.
    enroll_stu : dict
        Student key -> record with "cou_taken" and "level" lists.
    cou_dict : dict
        Course id -> course record; forwarded to
        ``grade_generate_single_cou`` (the original accepted this argument but
        silently used the module-level default instead).
    cou_sem_assigned : list or None
        Optional course list to grade instead of the recorded enrollment.

    Returns
    -------
    list
        One grade per course, in course order.
    """
    if cou_sem_assigned is not None:
        cou_sem = cou_sem_assigned
    else:
        cou_sem = enroll_stu[stu]["cou_taken"][i_sem]
    stu_level = enroll_stu[stu]["level"][i_sem]
    return [grade_generate_single_cou(cid, stu_level, cou_dict=cou_dict) for cid in cou_sem]

def level_update(i_sem, cou_taken_sem, grade_sem, level_last, cou_dict=cou_dict):
    """Return the student's level after a semester.

    For each course taken, the course level scaled by ``grade / 100`` is
    compared with the running level and the elementwise maximum is kept.
    ``level_last`` itself is not modified. ``i_sem`` is accepted but not used.
    """
    level_new = level_last.copy()
    for i_cou, cid in enumerate(cou_taken_sem):
        gained = cou_dict[cid]["level"] * grade_sem[i_cou] / 100
        level_new = np.maximum(level_new, gained)
    return level_new

KOR424
KOR425
KOR099
KOR494H
KOR121
KOR120
49872 97.8146868612


In [12]:
# Run the simulation semester by semester: enroll, grade, then raise levels. #
sub_dict = data_real_course
for i_sem in range(N_semaster):
    for stu in range(N_stu):
        stu_enroll = enroll_stu[stu]
        cou_sem = cou_taken_generate(i_sem, stu_enroll["cou_taken"], stu_enroll["sub"], sub_dict)
        stu_enroll["cou_taken"][i_sem] = cou_sem
        grade_sem = grade_generate(i_sem, stu, enroll_stu, cou_dict)
        stu_enroll["grade"][i_sem] = grade_sem

        levels = stu_enroll["level"]
        # the next-semester slot already exists, e.g. when this cell is run again
        slot_exists = len(levels) > (i_sem + 1)
        level_base = levels[i_sem] if slot_exists else levels[-1]
        level_next = level_update(i_sem, cou_sem, grade_sem, level_base)
        if slot_exists:
            # update #
            levels[i_sem + 1] = level_next
        else:
            # new semester #
            levels.append(level_next)

In [13]:
# Inspect one simulated student record (courses, grades, subject and level history).
print enroll_stu[1]

{'grade': [[77.856805084562779, 84.092686141475852, 39.992905707529303, 98.771669783975526, 20.387813448678294], [86.915327883826137, 90.694724195659987], [], [72.619817695667351, 66.668924706987454, 99.97277629053427], [77.78514798272488, 43.243026531149255], [], [89.149408815385314], [90.565540628380887], [91.818163995643587, 44.356005098537636, 72.042030636064808], [29.452782149105989], [38.918529779446317], [30.295454951188912]], 'cou_taken': [[u'DMD100', u'DMD300', u'DMD400', u'DANCE221', u'DSM295W'], [u'AAAS397', u'ANSC300'], [], [u'HORT402', u'SOILS422', u'ECON014'], [u'FIN409', u'NAVSC313'], [], [u'ECON417W'], [u'DSM295A'], [u'LTNST100', u'GLIS497', u'CHE450'], [u'APLNG083'], [u'IET321'], [u'EDUC495A']], 'sub': u'Digital Multimedia Design (DMD)', 'level': [array([ 0.02017451,  0.18811806,  0.02823922,  0.19389287,  0.07242007,
        0.11931224,  0.1679577 ,  0.09660561,  0.11730596,  0.00123127,
        0.01262039,  0.02670677,  0.16915498,  0.04118929,  0.00987113,
        0

# generate grade book #

In [14]:
# Invert the enrollment: per course and semester, who took it and with what grade. #
# NOTE: dict.copy() is shallow, so the course records (and the fields added
# here) are shared with cou_dict.
enroll_cou = cou_dict.copy()
for cid in enroll_cou:
    cou_record = enroll_cou[cid]
    cou_record["stu_taken"] = [[] for _sem in range(N_semaster)]
    cou_record["grade"] = [[] for _sem in range(N_semaster)]
for i_sem in range(N_semaster):
    for stu in enroll_stu:
        stu_record = enroll_stu[stu]
        for cid, grade in zip(stu_record["cou_taken"][i_sem], stu_record["grade"][i_sem]):
            enroll_cou[cid]["stu_taken"][i_sem].append(stu)
            enroll_cou[cid]["grade"][i_sem].append(grade)

In [15]:
# Inspect one course record after the per-semester student and grade lists were attached.
print enroll_cou["KOR120"]

{'prereq_courses': [], 'major': u'KOR', 'description': u'\n\t\t\t\tSurvey of Korean culture and society in historical contexts; exploration from antiquity to the contemporary period through diverse media. This course is designed as a multi-disciplinary introduction to Korean society. In surveying Korean culture from antiquity to the present, we will examine a wide range of primary sources from the past, including archaeological relics, written records, and works of art; as well as contemporary materials by Korean authors, directors, and other cultural producers, together with scholarly commentaries about these materials. Instruction and all materials will be in English. No preliminary knowledge of Korean history or language is required for taking this course. In the first part of the course that covers the origins of Korean \xbftradition,\xbf we will observe the formation and development of social relations, popular beliefs, and systems of thought that have shaped the Korean way of lif

In [16]:
# from score grade to ord #
# ord_dict: ordinal code -> lower edge of that code's band on a 0..1 scale.
# Codes and edges are paired purely by sort order, so edges must grow with the code.
ord_dict = {8: 0.90, 7: 0.80, 6: 0.70, 5: 0.60, 4: 0.50, 3: 0.40, 2: 0.30, 1: 0.1, 0: 0.0}
# reversed so that code 0 is "F" and code 8 is "A"
ord_dict_label = ["A", "A-", "B+", "B", "B-", "C+", "C", "D", "F"][::-1]
ord_dict_rev = sorted(ord_dict.keys())

ord_top_pct = sorted(ord_dict.values())
# width of every band: distance to the next edge, the last band ending at 1.0
ord_pct = [upper - lower for lower, upper in zip(ord_top_pct, ord_top_pct[1:] + [1.0])]
print(ord_dict_rev)
print(ord_top_pct)
print(ord_pct)
def grade2ord(grades):
    """Map the numeric grades of one course to letter grades.

    ``grades`` is a list of per-semester grade lists. All grades are pooled
    and sorted; the same number of ordinal codes is sampled (with replacement)
    from ``ord_dict_rev`` using probabilities ``ord_pct`` and sorted as well.
    Grades and codes are then paired rank by rank.

    Returns a dict from numeric grade to letter label. When the same numeric
    grade occurs more than once, the label of its highest-ranked occurrence wins.
    """
    grades_cum = sorted(g for sem_grades in grades for g in sem_grades)
    ords_cum = np.random.choice(ord_dict_rev, len(grades_cum), p=ord_pct, replace=True)
    ords_cum.sort()
    return dict((grade, ord_dict_label[code]) for grade, code in zip(grades_cum, ords_cum))

[0, 1, 2, 3, 4, 5, 6, 7, 8]
[0.0, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
[0.1, 0.19999999999999998, 0.10000000000000003, 0.09999999999999998, 0.09999999999999998, 0.09999999999999998, 0.10000000000000009, 0.09999999999999998, 0.09999999999999998]


In [17]:
# Attach letter grades to every course, mirroring the per-semester layout of "grade". #
for cid in enroll_cou:
    grades = enroll_cou[cid]["grade"]
    grade2ord_dict = grade2ord(grades)
    enroll_cou[cid]["ordinal"] = [
        [grade2ord_dict[g] for g in grades[i_sem]]
        for i_sem in range(N_semaster)
    ]

In [18]:
# Inspect the same course record again, now including the "ordinal" letter grades.
print enroll_cou["KOR120"]

{'prereq_courses': [], 'ordinal': [['A', 'A', 'C+', 'B+', 'B', 'A', 'F', 'B', 'B+', 'C', 'D', 'A', 'C+', 'B-', 'A-', 'B', 'F', 'A-', 'F', 'B', 'B-', 'A', 'B+', 'B-', 'D', 'F', 'B-', 'F', 'C+', 'D', 'F', 'B', 'B+', 'C', 'B+', 'C', 'F', 'C', 'B-', 'A', 'A', 'C', 'C+', 'D', 'D', 'B+', 'D', 'B', 'B+', 'C', 'B+', 'F', 'C+', 'A', 'B', 'C+', 'C', 'D', 'C+', 'B', 'D', 'B-', 'D', 'C+', 'A', 'C+', 'D', 'A', 'A', 'A', 'B-', 'C', 'C', 'B', 'B-', 'F', 'D', 'D', 'F', 'F', 'D', 'C', 'B', 'A', 'A', 'B', 'D', 'B', 'D', 'D', 'B+', 'D', 'D', 'B+', 'B+', 'F', 'A', 'B-', 'C', 'B+', 'A-', 'D', 'B+', 'D', 'B', 'B', 'D', 'C+', 'B-', 'D', 'B-', 'A-', 'C', 'A', 'B-', 'B-', 'B-', 'C', 'D', 'B+', 'D', 'A-', 'F', 'B+', 'C+', 'D', 'B+', 'A', 'C', 'B', 'F', 'A', 'C', 'C+', 'F', 'B', 'F', 'C+', 'A-', 'A', 'D', 'F', 'F', 'A-', 'B+', 'A-', 'B-', 'C', 'F', 'B-', 'D', 'D', 'B+', 'F', 'C', 'D', 'D', 'D', 'F', 'A-', 'B-', 'C+', 'D', 'D', 'D', 'C+', 'B+', 'C', 'D', 'D', 'D', 'F', 'B+', 'C', 'B-', 'C', 'C+', 'D', 'C', 'D', '

In [19]:
# Flatten the grade book into one table per semester with rows
# [student id, course id, letter grade]. #
enroll_table = []
for i_sem in range(N_semaster):
    enroll_table_sem = []
    for cid in enroll_cou:
        cou_record = enroll_cou[cid]
        for stu_id, letter in zip(cou_record["stu_taken"][i_sem], cou_record["ordinal"][i_sem]):
            enroll_table_sem.append([stu_id, cid, letter])
    enroll_table.append(enroll_table_sem)
    

In [20]:
# Peek at the first three [student id, course id, letter grade] rows of semester 0.
enroll_table[0][:3]

[[4076, u'PSYCH426', 'F'], [7217, u'PSYCH426', 'D'], [7238, u'PSYCH426', 'D']]

In [21]:
import csv
# One CSV file per semester, rows exactly as stored in enroll_table. #
for i_sem in range(N_semaster):
    csv_path = "data/enroll_table_sem_%02d.csv" % i_sem
    with open(csv_path, "w") as df:
        csv.writer(df, lineterminator='\n').writerows(enroll_table[i_sem])

In [22]:
# Persist the subject catalogue. sub_dict is the same object as data_real_course,
# so every field generated on it above is saved as well.
with open("data/sub_dict", "w") as df:
    cPickle.dump(sub_dict, df)

In [23]:
# Persist the flat course id -> course record lookup.
with open("data/cou_dict", "w") as df:
    cPickle.dump(cou_dict, df)

In [24]:
# Persist the student records. stu_dict is the same object as enroll_stu, so the
# simulated courses, grades and level history are included.
with open("data/stu_dict", "w") as df:
    cPickle.dump(stu_dict, df)

# prerequisite to sparse matrix graph #

In [25]:

# build prerequisite graph #
from scipy.sparse import csr_matrix

# mapping bidirection from course name to cid #
# cou_i2n lists course names in cou_dict iteration order; cou_n2i is its inverse.
cou_i2n = list(cou_dict)
cou_n2i = dict((cou_name, idx) for idx, cou_name in enumerate(cou_i2n))
cou_cnt = len(cou_i2n)
with open("data/cou_n2i_i2n", "w") as df:
    cPickle.dump([cou_n2i, cou_i2n], df)

# Edge list: row = course, column = one of its prerequisites.
# NOTE(review): a prerequisite id that is not a key of cou_dict raises KeyError here.
cou_dictionary = cou_n2i
graph_sparse_row = []
graph_sparse_col = []
for cid, cou_record in cou_dict.items():
    cid_target_new = cou_dictionary[cid]
    for cid_pre in cou_record["prereq_courses"]:
        graph_sparse_row.append(cid_target_new)
        graph_sparse_col.append(cou_dictionary[cid_pre])
graph_sparse_data = [1] * len(graph_sparse_row)

N_graph = len(cou_dict)
graph_sparse = csr_matrix((graph_sparse_data, (graph_sparse_row, graph_sparse_col)), shape=(N_graph, N_graph))

In [30]:
# Persist the prerequisite matrix as the first element of a two-item list.
# NOTE(review): the meaning of the second element (0) is not evident from this notebook.
with open("data/graph_prereq", "w") as df:
    cPickle.dump([graph_sparse, 0], df)

# mandatory courses #

In [32]:
# Inspect one subject record after the simulation.
print sub_dict["Korean (KOR)"]

{'N_stu': 195, 'courses_list_ordered': [u'KOR120', u'KOR121', u'KOR424', u'KOR425', u'KOR494H', u'KOR099'], 'url': u'/university-course-descriptions/undergraduate/kor/', 'level': (array([ 0, 71, 33, 80, 85, 25, 88, 57, 37, 29]), array([ 0.394271  ,  0.92088794,  0.60890168,  0.69639774,  0.55481966,
        0.54539037,  0.82607731,  0.56352906,  0.66974169,  1.        ])), 'courses': {u'KOR424': {'prereq_courses': [u'KOR120', u'KOR121', u'ASIA100', u'ASIA102', u'ASIA004', u'CMLIT004'], 'ordinal': [['B', 'B-', 'C', 'D', 'B', 'A', 'C', 'B', 'D', 'A-', 'F', 'A-', 'F', 'D', 'F', 'D', 'B+', 'D', 'F', 'D', 'B+', 'B', 'D', 'F', 'C+', 'D', 'D', 'A-', 'C+', 'B-', 'B', 'D', 'D', 'D', 'D', 'B+', 'D', 'C', 'C', 'B+', 'A-', 'C', 'D', 'B-', 'D', 'D', 'D', 'B+', 'B-', 'D', 'B-', 'B+', 'B', 'F', 'D', 'F', 'B-', 'D', 'F', 'F', 'C', 'D', 'F', 'B-', 'A-', 'B+', 'B-', 'D', 'F', 'F', 'C+', 'F', 'B+', 'B', 'A-', 'B-', 'F', 'B', 'C', 'B+', 'C', 'F', 'C+', 'D', 'D', 'D', 'C', 'D', 'A', 'A-', 'F', 'A-', 'F', '

In [33]:
def mandatory_generate(sub, sub_dict=sub_dict, cou_dict=cou_dict, N_mand_sub=20, N_mand_out_max=3):
    """Draw a random list of mandatory courses for subject ``sub``.

    Up to ``N_mand_sub`` distinct courses are sampled from the subject's own
    "courses_list_ordered", plus a random number of courses sampled from all of
    ``cou_dict``. That number is ``int(random * N_mand_out_max)``, i.e. between
    0 and ``N_mand_out_max - 1``.

    Returns the union of both samples as a list without duplicates.
    """
    sub_cou_list = sub_dict[sub]["courses_list_ordered"]
    N_in = min(N_mand_sub, len(sub_cou_list))
    mandatory_sub = np.random.choice(sub_cou_list, N_in, replace=False).tolist()

    N_mand_out = int(np.random.random() * N_mand_out_max)
    mandatory_out = np.random.choice(cou_dict.keys(), N_mand_out, replace=False).tolist()

    return list(set(mandatory_sub) | set(mandatory_out))

In [34]:
# One randomly drawn mandatory-course list per subject. #
graph_sub_mandatory = dict((sub, mandatory_generate(sub)) for sub in sub_dict)

In [36]:
# Show one subject's mandatory list, then persist the whole subject -> list mapping.
print graph_sub_mandatory["Korean (KOR)"]
with open("data/graph_sub_mandatory", "w") as f:
    cPickle.dump(graph_sub_mandatory, f)

[u'KOR424', u'KOR425', u'KOR494H', u'HIST150', u'KOR121', u'KOR099', u'KOR120', u'HUM150']


## course description dump ##

In [1]:
import cPickle
# Load the raw crawled catalogue again and show one subject record.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the handle once loading is done (the original left it open).
with open("./data/courses_crawled_by_major", "r") as f_courses:
    sub_dict = cPickle.load(f_courses)
for sub in sub_dict:
    print(sub_dict[sub])
    break

{'url': u'/university-course-descriptions/undergraduate/kor/', 'courses': {u'KOR424': {'prereq_courses': [u'KOR120', u'KOR121', u'ASIA100', u'ASIA102', u'ASIA004', u'CMLIT004'], 'major': u'KOR', 'description': u'\n\t\t\t\tExploration of seminal Korean texts, including poetry, fiction, autobiography, and criticism, from the early twentieth century to the contemporary era. This course provides a comprehensive overview of modern Korean literature within a transnational context. As we learn how to critically analyze seminal Korean texts, we will locate them in the social, political, economic, and cultural conditions under which they were produced and received. In grappling with some of the fundamental issues they raise;including colonialism, migration, national division, war, gender relations, developmentalism, urbanization, democratization, and contemporary consumer culture;we will also seek to situate these writings in the Korean vernacular within the larger context of global modernity. 

In [3]:
import warnings
# Collect description, credits, title and subject url of every course into one
# flat dict keyed by course id. A course id seen under more than one subject
# triggers a warning and the later entry replaces the earlier one.
cou_desc_dict = {}
for i_sub in sub_dict:
    sub_courses = sub_dict[i_sub]["courses"]
    sub_url = sub_dict[i_sub]["url"]
    for cou in sub_courses:
        if cou in cou_desc_dict:
            warnings.warn("duplicate cou name %s" % cou)
        cou_info = sub_courses[cou]
        cou_desc_dict[cou] = {
            "description": cou_info["description"],
            "credits": cou_info["credits"],
            "name": cou_info["title"],
            "url": sub_url,
        }
# Close the output file explicitly so the pickle is fully flushed to disk
# (the original passed an open() result that was never closed).
with open("./data/cou_desc_dict", 'w') as f_desc:
    cPickle.dump(cou_desc_dict, f_desc)
