In [None]:
# make description

from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

import pandas as pd
import numpy as np
# Load the raw timetable export.
table_df = pd.read_csv('./source/data/table.csv', encoding='utf-8')

# Labels assigned to the columns that are kept below.
col_name = [
    '개설학과전공', '학수번호', '분반', '교과목명',
    '강의언어', '이수구분', '선택영역', '학점/이론/실습',
    '학년(학기)', '대상과정', '교과목개요', '수업계획서',
    '주관학과', '교수명', '요일 및 강의시간', '강의실', '사이버강좌',
]

# Keep only rows whose raw column '1' is not null, and drop the first two and
# the last three columns. The explicit .copy() makes tmp_df an independent
# frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning on the result of chained indexing.
has_value = ~table_df.iloc[:, 2:-4]['1'].isnull()
tmp_df = table_df.iloc[:, 2:-3].loc[has_value].copy()
tmp_df.columns = col_name

# '학점/이론/실습' is a '/'-separated string; its first field, rounded to an
# integer, is stored as '학점'.
tmp_df['학점'] = tmp_df['학점/이론/실습'].apply(
    lambda x: round(float(x.split('/')[0].strip()))
)
for col in ['분반', '학년(학기)', '학점']:
    tmp_df[col] = tmp_df[col].astype(np.int8)

# Replace the remaining missing values with empty strings, renumber the rows
# from zero, and save the cleaned table.
tmp_df.fillna('', inplace=True)
tmp_df.reset_index(drop=True, inplace=True)
tmp_df.to_csv('./source/preprocessing_table.csv')


# Prompt (in Korean) asking for a roughly two-line description of a course,
# given only the course name.
prompt = PromptTemplate(
    template="수업 이름을 보고 수업에 대한 설명을 2줄 정도로 생성해줘: {course_name}.",
    input_variables=["course_name"],
)

# Chat model used by the chain.
llm = ChatOpenAI(model="gpt-3.5-turbo")

# Chain that combines the prompt template with the chat model.
chain = LLMChain(prompt=prompt, llm=llm)
def generate_description(course_name):
    """Return the result of running the module-level ``chain`` on *course_name*."""
    return chain.run(course_name)

# Generate one description per distinct course name, so rows that share a
# name reuse the same generated text.
course_name_list = list(tmp_df['교과목명'].unique())
course_description_list = [
    generate_description(name) for name in course_name_list
]

# Save the name -> description table on its own.
gpt_df = pd.DataFrame(
    {'교과목명': course_name_list, '교과목설명': course_description_list}
)
gpt_df.to_csv('./source/generate_course_description.csv')

# Attach the description to every row of the preprocessed table and save it.
# NOTE(review): this writes to './source/', while the later cell reads
# './source/data/generate_preprocessing_table.csv' -- confirm the file is
# moved or that one of the two paths should change.
tmp_df.merge(gpt_df, on='교과목명', how='inner').to_csv(
    './source/generate_preprocessing_table.csv'
)

In [4]:
import pandas as pd
import numpy as np
import json
import re

# Maps the Korean column labels to the English keys used when the table is
# renamed for JSON export.
rename_dict = {
    '개설학과전공': 'department_major',
    '학수번호': 'courseNumber',
    '분반': 'sectionNumber',
    '교과목명': 'courseName',
    '강의언어': 'lectureLanguage',
    '이수구분': 'courseClassification',
    '선택영역': 'electiveArea',
    '학년(학기)': 'yearSemester',
    '대상과정': 'targetCourse',
    '주관학과': 'mainDepartment',
    '교수명': 'professorName',
    # NOTE(review): no column with this exact label is visible in this file
    # (the schedule column is '요일 및 강의시간'), so this entry appears to
    # have no effect -- confirm before relying on 'dayTime'.
    '과목요일 및 강의 시간': 'dayTime',
    '강의실': 'classroom',
    '사이버강좌': 'onlineCourse',
    '학점': 'credits',
    '교과목설명': 'courseDescription',
    '강의요일': 'courseDay',
    '강의시간': 'courseTime',
}

# Load the merged course table; index_col=0 uses the first CSV column as the
# row index.
df = pd.read_csv('./source/data/generate_preprocessing_table.csv', index_col=0)

# Course names that receive the BSM flag and the MSC flag respectively.
BSM = ["미적분학1", "일반물리및시뮬레이션1", "공업수학1", "이산수학및프로그래밍", "통계학개론", "선형대수및프로그래밍"]
MSC = ["일변수미적분학", "공업수학1", "공업수학2", "선형대수", "확률및랜덤변수", "일반물리학1", "일반물리학2", "일반화학1", "고급프로그래밍활용"]

# Boolean membership columns. Series.isin is the vectorized form of testing
# `name in list` row by row.
df['BSM'] = df['교과목명'].isin(BSM)
df['MSC'] = df['교과목명'].isin(MSC)
def day_time_split(x):
    """Split a schedule string into its Hangul part and the remainder.

    Args:
        x: Schedule cell value. Anything that is not a ``str`` (for example
            a NaN float, a ``np.float64`` NaN, or ``None``) is treated as
            missing.

    Returns:
        A ``(day, time)`` tuple. ``day`` is ``x`` with every non-Hangul
        character removed; ``time`` is ``x`` with the ``day`` substring
        removed. Both elements are ``np.nan`` when ``x`` is missing.
    """
    # Guard on "not a string" rather than ``type(x) == float`` so that
    # np.float64 NaN and None are handled instead of raising in re.sub.
    if not isinstance(x, str):
        return np.nan, np.nan
    day = re.sub(r'[^가-힣]+', '', x)
    # str.replace only removes ``day`` where it occurs contiguously in ``x``;
    # when the Hangul characters are interleaved with other text, ``time``
    # is ``x`` unchanged.
    time = x.replace(day, '')
    return day, time
# Split each schedule string once, then spread the (day, time) pairs over two
# new columns.
tmp_list = df['요일 및 강의시간'].apply(day_time_split)
df['강의요일'] = [day for day, _ in tmp_list]
df['강의시간'] = [time for _, time in tmp_list]

# Rename the columns to their English keys, then remove the columns that are
# not exported.
final_df = df.rename(columns=rename_dict)
final_df = final_df.drop(
    columns=['학점/이론/실습', '교과목개요', '수업계획서', '요일 및 강의시간']
)

# Serialize through DataFrame.to_json (one object per row, keyed by index),
# parse it back, and write it out again with 4-space indentation.
result = final_df.to_json(orient='index')
parsed = json.loads(result)
with open('./source/data/course_data.json', 'w') as out_file:
    out_file.write(json.dumps(parsed, indent=4))