# 라이브러리 호출

In [1]:
import pandas as pd
import numpy as np
import os
import re

from konlpy.tag import *
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Hook tqdm into pandas so that `progress_apply` is available below.
tqdm.pandas()
# Tagger instances from konlpy.tag; the cells visible here only call `okt`.
okt = Okt()
kkma = Kkma()
komo = Komoran()
# Notebook display setting for the number of rows shown.
pd.set_option('display.max_row', 100)

# 경로 설정

## 절대 경로 설정

In [2]:
# Keep the part of the working directory that precedes the first 'run'
# substring, switch to forward slashes and drop the final character
# (presumably the trailing separator -- TODO confirm the folder layout).
cwd_prefix = os.getcwd().split('run')[0]
new_path = cwd_prefix.replace('\\', '/')[:-1]
os.chdir(new_path)
print(os.getcwd())

C:\Users\USER\Desktop\Everytime


## 경로 입력

In [3]:
# Relative data directories, resolved against the working directory set above.
eval_path, sche_path, honey_path = (
    './data/et_evaluation/',
    './data/et_schedule/',
    './data/et_honey_dict/',
)

# 데이터 불러오기

In [4]:
# Load the raw evaluations, turn the row index into a regular first column
# and name that column 'id'.
evaluation = pd.read_csv(eval_path + 'evaluation.csv', encoding='UTF-8-SIG').reset_index()
evaluation.columns = ['id', *evaluation.columns[1:]]
evaluation.head()

Unnamed: 0,id,과목명,교수명,전체평점,과제,조모임,학점비율,출결,시험횟수,개별평점,수강학기,수강평
0,0,17세기영문학,김태원,0.0,,,,,,,,
1,1,1960년대의저항문화,노재호,5.0,보통,없음,비율 채워줌,지정좌석,두 번,100.0,19년 2학기,정말 많은 걸 배워간 수업. 다른 분이 말씀하신 대로 처음엔 저항문화라는 게 한 학...
2,2,1960년대의저항문화,노재호,5.0,보통,없음,비율 채워줌,지정좌석,두 번,100.0,18년 1학기,"전체학생을 조로 나누고 매 수업 시작마다 의논주제를 제시하고, 주제에 대한 조별 ..."
3,3,1960년대의저항문화,노재호,5.0,보통,없음,비율 채워줌,지정좌석,두 번,100.0,18년 1학기,처음엔 저항문화를 왜 배우지..?라고 생각했는데 얻어가는 것도 많고 생각보다 재밌었...
4,4,1960년대의저항문화,노재호,5.0,보통,없음,비율 채워줌,지정좌석,두 번,100.0,18년 1학기,수업 : 15-20분 조별토론 + 강의시험 : 각 문단마다 텍스트나 음악 저자/가수...


# 데이터 기본 전처리

In [5]:
# Discard every row that has a missing value in any column.
evaluation = evaluation.dropna()

## 학점 비율

In [6]:
def change_grade(x):
    """Convert a grade-generosity label into a numeric score.

    Returns 6 for '학점느님', 2 for '비율 채워줌', -2 for '매우 깐깐함'
    and -6 for any other value.
    """
    scored_labels = (('학점느님', 6), ('비율 채워줌', 2), ('매우 깐깐함', -2))
    for label, score in scored_labels:
        if x == label:
            return score
    return -6

In [7]:
# Replace each grade-generosity label with its numeric score.
evaluation['학점비율'] = evaluation['학점비율'].apply(change_grade)

In [8]:
# Mean grade-generosity score per (course, professor), ordered by course name.
grade_score = (
    evaluation.groupby(['과목명', '교수명'])['학점비율']
    .mean()
    .reset_index()
    .sort_values(by='과목명')
)

## 전체 평점

In [9]:
# Mean overall rating per (course, professor); groups whose mean is exactly
# 0.0 are removed and the remaining rows are renumbered from 0.
total_score = (
    evaluation.groupby(['과목명', '교수명'])['전체평점']
    .mean()
    .reset_index()
    .loc[lambda frame: frame['전체평점'] != 0.0]
    .reset_index(drop=True)
)

## 강의평 개수

In [10]:
# Collect the distinct review texts of each (course, professor) pair and
# record how many distinct reviews there are.
evaluation_num = evaluation.groupby(['과목명', '교수명'])['수강평'].unique().reset_index()
evaluation_num['강의평개수'] = evaluation_num['수강평'].map(len)

## 강의 평가

In [11]:
# Matches runs of characters that are NOT a space, '.', ',', '?', '!',
# an ASCII letter or digit, a Hangul compatibility jamo (U+3131-U+3163)
# or a Hangul syllable (U+AC00-U+D7A3).  Written as a raw string without the
# needless backslashes so Python does not see invalid escape sequences, and
# compiled once instead of on every call.
_DISALLOWED_CHARS = re.compile(r'[^ .,?!a-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+')


def clean_string(text):
    """Strip every character outside the allowed set from *text*.

    Allowed characters are spaces, the punctuation marks ``. , ? !``,
    ASCII letters and digits, and Hangul jamo/syllables.  Everything else
    (including newlines and other symbols) is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _DISALLOWED_CHARS.sub('', text)

In [12]:
# Clean every review text, showing a progress bar while doing so.
evaluation['수강평'] = evaluation['수강평'].progress_apply(clean_string)

100%|█████████████████████████████████████████████████████████████████████████| 20471/20471 [00:00<00:00, 70294.24it/s]


In [13]:
# Run Okt's `morphs` (with stem=True) on every cleaned review and keep the
# returned token sequence in a new '토큰' column.
evaluation['토큰'] = evaluation['수강평'].progress_apply(lambda x : okt.morphs(x, stem = True))

100%|████████████████████████████████████████████████████████████████████████████| 20471/20471 [10:56<00:00, 31.17it/s]


In [14]:
# Collapse each token list into one space-separated string.
evaluation['토큰'] = evaluation['토큰'].str.join(' ')

In [15]:
# Per (course, professor): the distinct cleaned reviews, joined with spaces.
evaluation_frame1 = evaluation.groupby(['과목명', '교수명'])['수강평'].unique().reset_index()
evaluation_frame1['수강평'] = evaluation_frame1['수강평'].str.join(' ')

# Per (course, professor): the distinct token strings, joined with spaces.
evaluation_frame2 = evaluation.groupby(['과목명', '교수명'])['토큰'].unique().reset_index()
evaluation_frame2['토큰'] = evaluation_frame2['토큰'].str.join(' ')

## 과제

In [16]:
def change_homework(x):
    """Convert a homework-load label into a numeric score.

    Returns -3 for '많음', 0 for '보통' and 3 for any other value.
    """
    if x == '많음':
        return -3
    return 0 if x == '보통' else 3

In [17]:
# Replace each homework-load label with its numeric score.
evaluation['과제'] = evaluation['과제'].apply(change_homework)

In [18]:
# Mean homework score per (course, professor).
homework_score = (
    evaluation.groupby(['과목명', '교수명'])['과제']
    .mean()
    .reset_index()
)

## 조모임

In [19]:
def change_teamwork(x):
    """Convert a team-project label into a numeric score.

    Returns 3 for '없음' and -6 for any other value.
    """
    return 3 if x == '없음' else -6

In [20]:
# Replace each team-project label with its numeric score.
evaluation['조모임'] = evaluation['조모임'].apply(change_teamwork)

In [21]:
# Mean team-project score per (course, professor).
teamwork_score = (
    evaluation.groupby(['과목명', '교수명'])['조모임']
    .mean()
    .reset_index()
)

## 시험횟수

In [22]:
def change_count(x):
    """Convert an exam-count label into a number.

    Returns 0 for '없음', 1 for '한 번', 2 for '두 번', 3 for '세 번'
    and 4 for any other value.
    """
    known_counts = (('없음', 0), ('한 번', 1), ('두 번', 2), ('세 번', 3))
    for label, count in known_counts:
        if x == label:
            return count
    return 4

In [23]:
# Replace each exam-count label with its number.
evaluation['시험횟수'] = evaluation['시험횟수'].apply(change_count)

In [24]:
# Mean exam count per (course, professor).
exam_num = (
    evaluation.groupby(['과목명', '교수명'])['시험횟수']
    .mean()
    .reset_index()
)

## 데이터 병합

In [25]:
# Attach every per-course metric to grade_score by its (course, professor)
# key instead of by row position.  A column-wise concat aligns on the row
# index, and total_score is renumbered after its zero-rating filter, so a
# single filtered group would shift every later rating onto the wrong course.
# A left merge keeps all grade_score rows; courses missing from any metric
# frame get NaN in that column.
course_keys = ['과목명', '교수명']
result = grade_score
for metric_frame, metric_column in [
    (total_score, '전체평점'),
    (homework_score, '과제'),
    (teamwork_score, '조모임'),
    (exam_num, '시험횟수'),
    (evaluation_num, '강의평개수'),
    (evaluation_frame1, '수강평'),
    (evaluation_frame2, '토큰'),
]:
    result = result.merge(
        metric_frame[course_keys + [metric_column]],
        on=course_keys,
        how='left',
    )

In [26]:
# Drop incomplete rows and renumber the remaining ones from 0.
result = result.dropna().reset_index(drop=True)

In [27]:
# Persist the merged per-course table without the index column.
result.to_csv(honey_path + 'result.csv', index = False, encoding = 'UTF-8-SIG')

# 데이터 심화 전처리

In [28]:
# Reload the saved table, drop rows with any missing value and renumber
# the rows from 0.
result = (
    pd.read_csv(honey_path + 'result.csv', encoding='UTF-8-SIG')
    .dropna()
    .reset_index(drop=True)
)

In [29]:
# Start both keyword-count columns at zero; they are filled in further down.
for score_column in ('꿀수강', '배움수강'):
    result[score_column] = 0

In [30]:
# Keywords whose occurrences are counted towards the '꿀수강' column.
positive_honey_lst = [
    'God', 'HOnEy', 'honey', 'sofie', '갓', '갓향숙', '강추예욤', '개꿀', '개꿀강임',
    '개꿀스피치', '개띵강', '꿀강', '꿀맛', '꿀잼일', '대천사그래도', '비쁠나옴쁠몰',
    '빛신', '쁠몰같아', '쁠몰이셔', '쁠받', '수월하다쁠몰', '에쁠에제', '좋다천사이십',
    '핵꿀', '꿀', '닥추임', '더꿀', '명강꿀', '빛빛빛', '안빡셉니', '에쁠에쁠', '초초강추',
    '킹', '함꿀강', '후해짐', '힐링', '영접', '에이쁠', '학점느님', '퍼주실', '퍼줍니', '핵핵',
]

# Keywords whose occurrences are counted towards the '배움수강' column.
# Every entry is followed by a comma: without one, Python silently fuses two
# adjacent string literals into a single keyword (this previously produced
# '9학점입니다9학점만큼' and '컨설팅펌특효약', neither of which can match).
positive_study_lst = [
    '배울', '빡셈', '빡세다', '개빡', '공부량', '질높', '꼽는', '명강', '노고', '빡공',
    '사례분석', '9학점이라', '9학점같은', '9학점짜리', '9학점입니다',
    '9학점만큼', '9학점 수업', '아우라', '어려움', '어렵다', '어료워', '유의사항', '착취',
    '참뜻', '컨설턴트', '컨설팅계', '컨설팅쪽', '컨설팅펌',
    '특효약', '헬', '배우는', '너무 어려',
]

## 꿀 강의평가

###### 꿀 단어 추출

In [31]:
# Courses matching every "honey" criterion go to `hn`; all others go to `hl`.
honey_mask = (
    (result['학점비율'] == 6)
    & (result['조모임'] == 3)
    & (result['전체평점'] >= 4)
    & (result['시험횟수'] != 4)
    & (result['시험횟수'] != 3)
    & (result['강의평개수'] > 2)
)
hn = result[honey_mask]
hl = result[~honey_mask]

In [32]:
# Token strings of both groups, plus empty accumulators for the joined text.
hn_lst = list(hn['토큰'].values)
hl_lst = list(hl['토큰'].values)

hn_final = hl_final = ''

In [33]:
# Join the per-course token strings with a space.  Concatenating them with
# no separator fuses the last token of one course with the first token of
# the next, creating words that exist in no review.
hn_final = ' '.join(hn_lst)

In [34]:
# Join the per-course token strings with a space (progress bar kept).
# Concatenating them with no separator fuses the last token of one course
# with the first token of the next, and repeated `+=` copies the growing
# string on every iteration.
hl_final = ' '.join(tqdm(hl_lst))

100%|█████████████████████████████████████████████████████████████████████████████| 2390/2390 [00:03<00:00, 669.20it/s]


In [35]:
# Reduce each joined text to its sorted list of distinct space-separated tokens.
hn_final = sorted(set(hn_final.split(' ')))
hl_final = sorted(set(hl_final.split(' ')))

In [36]:
# Tokens that occur only in the "honey" group, with any leading run of
# '?', '!', '.', ',' or digits removed, de-duplicated and sorted.
# The strip is anchored at the start and greedy, so one pass is enough;
# the pattern is compiled once rather than once per token.
leading_noise = re.compile('^[?!.,0123456789]+')
honey_final = set(hn_final) - set(hl_final)
honey_final = sorted({leading_noise.sub('', word) for word in honey_final})

In [37]:
# Count, for each course, how often the honey keywords occur in its token
# string.  The counts are gathered in a list and assigned as a whole column:
# writing through `result[col][i] = ...` is chained indexing, which pandas
# warns about and which may update a temporary copy instead of `result`.
honey_counts = []
for tokens in tqdm(result['토큰']):
    cnt = 0
    for text in positive_honey_lst:
        cnt += tokens.count(text)
        # NOTE(review): this +2 is added once per listed bonus word for every
        # row, whether or not the word occurs in the text -- confirm that a
        # constant offset is the intended weighting.
        if text in ('개꿀', '힐링'):
            cnt += 2
    honey_counts.append(cnt)
result['꿀수강'] = honey_counts

100%|█████████████████████████████████████████████████████████████████████████████| 2640/2640 [00:03<00:00, 872.46it/s]


## 배움 강의평가

###### 배움 단어 추출

In [38]:
# Shared criteria: rating >= 4, more than 3 reviews, exam count >= 2.
demanding_base = (
    (result['전체평점'] >= 4)
    & (result['강의평개수'] > 3)
    & (result['시험횟수'] >= 2)
)
# `lot_outcome`: shared criteria plus a homework score of exactly -3.
lot_outcome = result[demanding_base & (result['과제'] == -3)]
# `not_outcome`: rows failing the shared criteria or having a homework score of 3.
# NOTE(review): the two homework tests differ (== -3 vs != 3), so rows with an
# intermediate homework score land in neither frame -- confirm this is intended.
not_outcome = result[~(demanding_base & (result['과제'] != 3))]

In [39]:
# Token strings of both groups, plus empty accumulators for the joined text.
lot_outcome_lst = list(lot_outcome['토큰'].values)
not_outcome_lst = list(not_outcome['토큰'].values)

lot_outcome_final = not_outcome_final = ''

In [40]:
# Join the per-course token strings with a space.  Concatenating them with
# no separator fuses the last token of one course with the first token of
# the next, creating words that exist in no review.
lot_outcome_final = ' '.join(lot_outcome_lst)
not_outcome_final = ' '.join(not_outcome_lst)

In [41]:
# Reduce each joined text to its sorted list of distinct space-separated tokens.
lot_outcome_final = sorted(set(lot_outcome_final.split(' ')))
not_outcome_final = sorted(set(not_outcome_final.split(' ')))

In [42]:
# Tokens that occur only in the `lot_outcome` group, with any leading run of
# '?', '!', '.', ',' or digits removed, de-duplicated and sorted.
# The strip is anchored at the start and greedy, so one pass is enough;
# the pattern is compiled once rather than once per token.
leading_noise = re.compile('^[?!.,0123456789]+')
study_final = set(lot_outcome_final) - set(not_outcome_final)
study_final = sorted({leading_noise.sub('', word) for word in study_final})

In [43]:
# Count, for each course, how often the study keywords occur in both its raw
# review text and its token string.  The counts are gathered in a list and
# assigned as a whole column: writing through `result[col][i] = ...` is
# chained indexing, which pandas warns about and which may update a
# temporary copy instead of `result`.
study_bonus_words = ('명강',  '9학점이라', '9학점같은', '9학점짜리', '9학점입니다', '9학점만큼', '9학점 수업', '빡공')
study_counts = []
for review, tokens in tqdm(zip(result['수강평'], result['토큰']), total=len(result)):
    cnt = 0
    for text in positive_study_lst:
        cnt += review.count(text)
        cnt += tokens.count(text)
        # NOTE(review): this +1 is added once per listed bonus word for every
        # row, whether or not the word occurs in the text -- confirm that a
        # constant offset is the intended weighting.
        if text in study_bonus_words:
            cnt += 1
    study_counts.append(cnt)
result['배움수강'] = study_counts

100%|█████████████████████████████████████████████████████████████████████████████| 2640/2640 [00:03<00:00, 832.93it/s]


# 최종 점수

In [44]:
# Keyword hits are normalised by the number of reviews; each score is then
# scaled by a factor that grows with the number of reviews.
review_cnt = result['강의평개수']
honey_mentions = result['꿀수강']/review_cnt
study_mentions = result['배움수강']/review_cnt

result['honey_score'] = (1 + review_cnt/100) * (
    result['전체평점'] + result['학점비율'] + result['과제'] + result['조모임'] + honey_mentions
)
result['study_score'] = (1 + review_cnt/20) * (
    result['전체평점'] - result['과제'] - result['시험횟수'] + study_mentions
)

In [45]:
# Renumber the rows from 0 and write the scored table without the index column.
result = result.reset_index(drop=True)
result.to_csv(honey_path + 'honeycombo.csv', index=False, encoding='UTF-8-SIG')