# Main

In [10]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from copy import deepcopy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [11]:
data = pd.read_csv('preprocess.csv')

data

Unnamed: 0,name,category
0,아동영어보육과,0
1,아동영어보육학과,0
2,유아특수보육전공,1
3,특수보육계열,1
4,컴퓨터보육과,2
...,...,...
27112,스마트이동체융합시스템공학과,3
27113,학과간협동과정항만물류시스템학과,3
27114,동북아물류시스템학과,3
27115,항만물류시스템,3


In [12]:
def category_str(data, classification=['인문', '사회', '교육', '공학', '자연', '의약', '예체능']):
    copy_data = deepcopy(data)
    for i in range(7):
        copy_data.loc[ copy_data['category']==i , 'category' ] = classification[i]
    return copy_data

In [13]:
def preprocess_name(text):
    result = ''
    
    # 빈칸제거
    result = text.strip()
    
    # 한글만 추출
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')  # 한글과 띄어쓰기를 제외한 모든 글자
    result = hangul.sub('', result)
    
    # 1차
    remove_words = ['학과', '전공', '학부', '대학', '계열', '단과대학없음', '프로그램', '과']
    result = result.strip().replace(' ', '').strip()
    
    if result in remove_words:
        return '0'
    else:
        return result

In [5]:
# data1 = category_str(data)

# data1

Unnamed: 0,name,category
0,아동영어보육과,인문
1,아동영어보육학과,인문
2,유아특수보육전공,사회
3,특수보육계열,사회
4,컴퓨터보육과,교육
...,...,...
27112,스마트이동체융합시스템공학과,공학
27113,학과간협동과정항만물류시스템학과,공학
27114,동북아물류시스템학과,공학
27115,항만물류시스템,공학


In [14]:
data2 = category_str(data, ['인문/사회', '인문/사회', '교육', '공학', '자연', '의약', '예체능'])

data2

Unnamed: 0,name,category
0,아동영어보육과,인문/사회
1,아동영어보육학과,인문/사회
2,유아특수보육전공,인문/사회
3,특수보육계열,인문/사회
4,컴퓨터보육과,교육
...,...,...
27112,스마트이동체융합시스템공학과,공학
27113,학과간협동과정항만물류시스템학과,공학
27114,동북아물류시스템학과,공학
27115,항만물류시스템,공학


In [15]:
x = data2['name'].to_list()

x

['아동영어보육과',
 '아동영어보육학과',
 '유아특수보육전공',
 '특수보육계열',
 '컴퓨터보육과',
 '가정보육과',
 '보육과',
 '보육상담전공',
 '보육학과',
 '사회복지보육과',
 '생활보육과',
 '아동미술보육전공',
 '아동보육과',
 '아동복지보육과',
 '영유아보육과',
 '유아교육과',
 '유아교육전공',
 '유아교육학과',
 '유아미술음악전공',
 '아동미술보육과',
 '아동미술복지보육과',
 '아동보육학습지도과',
 '아동생활보육과',
 '아동컴퓨터보육과',
 '아동보육전공',
 '방과후아동보육과',
 '아동보육복지과',
 '유아특수보육과',
 '아동미술보육계열',
 '아동보육계열',
 '유아보육과',
 '유아보육미술과',
 '사회복지보육전공',
 '특수보육과',
 '아동문학보육과',
 '아동미술복지보육전공',
 '유아교육과년제',
 '영유아보육복지과',
 '아동복지미술보육과',
 '유아교육과인문',
 '미술보육과',
 '아동보육복지학과',
 '영유아보육전공',
 '아동미디어미술보육과',
 '아동표현미술보육과',
 '유아교육학과년제',
 '영재미술보육전공',
 '보육복지과',
 '아동보육학과',
 '영유아보육학부',
 '어린이영어보육과',
 '아동보육복지전공',
 '사회복지과아동보육복지전공',
 '유아특수보육과년제',
 '아동보육복지상담과',
 '아동보육과인문사회',
 '아동미술보육과년제',
 '유아보육문화계열',
 '영유아보육학과',
 '아동복지보육과년제',
 '유아보육계열',
 '유아안전보육과',
 '아동건강보육과',
 '아동보육과년제',
 '아동문화계열',
 '아동상담보육과',
 '유아특수보육학과',
 '아동컴퓨터보육전공',
 '아동조형미술보육과',
 '아동미디어미술보육전공',
 '아동표현미술보육전공',
 '아동조형미술보육전공',
 '유아특수치료교육',
 '유아특수치료교육과',
 '유아특수재활과',
 '장애유아보육과',
 '특수아동재활과',
 '유아특수언어재활과',
 '유아특수재활복지과',
 '특수아동언어재활과',
 '유아특

In [16]:
model = SentenceTransformer('jhgan/ko-sroberta-multitask')
embeddings = model.encode(x)

embeddings.shape

(27117, 768)

In [17]:
scaler = StandardScaler()
scale_embeddings = scaler.fit_transform(embeddings)

scale_embeddings.shape

(27117, 768)

In [18]:
def pca_decide(n, data=scale_embeddings):
    pca = PCA(n_components=n)  # 주성분을 몇개로 할지 결정
    test = pca.fit_transform(data)
    df = pd.DataFrame(data=test, columns=[f"pca{num+1}" for num in range(n)])
    
    var = pca.explained_variance_
    ratio = pca.explained_variance_ratio_

    results = pd.DataFrame({'설명가능한 분산 비율(고윳값)':var, '기여율':ratio},
                         index = np.array([f"pca{num+1}" for num in range(n)]))
    results['누적기여율'] = results['기여율'].cumsum()
    
    if results['누적기여율'][-1] >= 0.8:
        return results
    

def find_dimension():
    for n in range(3, 100):
        result = pca_decide(n)
        try:
            if result['설명가능한 분산 비율(고윳값)'][-1] >= 0.7:
                print(result)
                return n
        except:
            pass

In [19]:
dimension = find_dimension()

print(f"{dimension} 차원")

       설명가능한 분산 비율(고윳값)       기여율     누적기여율
pca1          42.979980  0.055962  0.055962
pca2          36.224850  0.047166  0.103128
pca3          33.651028  0.043815  0.146943
pca4          26.627048  0.034669  0.181612
pca5          23.817713  0.031012  0.212624
...                 ...       ...       ...
pca74          2.142214  0.002789  0.791132
pca75          2.129264  0.002772  0.793905
pca76          2.003849  0.002609  0.796514
pca77          1.981020  0.002579  0.799093
pca78          1.954286  0.002545  0.801638

[78 rows x 3 columns]
78 차원


In [20]:
pca = PCA(n_components=dimension) # 주성분을 몇개로 할지 결정
test = pca.fit_transform(scale_embeddings)
X = pd.DataFrame(data=test, columns=[f"pca{num+1}" for num in range(dimension)])

X

Unnamed: 0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,...,pca69,pca70,pca71,pca72,pca73,pca74,pca75,pca76,pca77,pca78
0,-10.551517,-0.762789,-0.160792,7.201996,2.393227,2.654544,6.030200,1.441220,8.255738,-1.908265,...,2.143013,-0.718024,-1.507004,-0.134653,0.081577,-0.518107,0.248059,-1.234621,0.804927,2.152613
1,-8.657663,-1.747989,1.240416,6.341091,-2.442751,3.103964,3.576728,-2.316809,4.974391,-1.695308,...,1.724797,-0.422934,-2.202695,0.232058,-0.796056,0.229434,0.388235,-0.606766,0.350321,1.204953
2,-4.485754,-2.786312,-0.961343,9.138304,-3.725683,3.920298,-3.522280,1.608586,6.323410,-1.267140,...,1.852097,-1.569826,1.624124,4.441761,-0.743415,-1.454284,-0.290925,0.899456,-0.336895,-0.256461
3,-5.706591,-1.581082,-3.901403,8.444764,0.884110,1.012693,-1.870375,3.692609,4.373980,-1.916260,...,-1.376781,1.692728,4.194564,1.885287,-0.531814,-2.069133,0.434823,-0.386848,0.773655,-1.944326
4,-2.114214,-1.476669,5.808393,10.699862,5.400627,4.259531,4.669381,2.216633,6.183870,-2.881552,...,0.616768,-0.946981,-0.496896,2.135680,0.741738,-2.455003,0.093950,0.618799,0.457912,1.144505
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27112,9.792565,-3.351491,6.869936,0.304746,3.303158,-0.960769,-2.588494,4.522178,1.489480,7.108863,...,1.044940,1.081842,0.371189,0.721289,-1.155201,0.897518,0.660205,0.923022,-0.216323,0.580020
27113,4.715868,-4.266425,0.707220,-6.709475,1.166910,-0.377677,-6.187236,1.759525,6.464411,-0.776125,...,-0.898674,-0.836179,1.332489,0.509388,-1.053253,-0.001382,-0.220507,-0.889474,-0.382076,-0.124616
27114,-0.030062,-5.204030,1.515123,-13.278943,3.867889,-2.989587,-0.760812,1.778826,4.785962,0.246580,...,0.394332,-2.097805,0.678070,2.180882,-2.196235,-0.013791,-0.007041,0.144508,0.046608,-1.430938
27115,-0.031056,-2.405836,0.565072,-9.701786,11.256224,-3.153981,2.138429,5.574642,5.876943,0.915326,...,-0.675392,-2.468556,0.322701,0.578440,-1.040924,-1.260381,2.271997,-0.577835,-3.331146,-1.035971


In [21]:
y = data2['category']

y

0        인문/사회
1        인문/사회
2        인문/사회
3        인문/사회
4           교육
         ...  
27112       공학
27113       공학
27114       공학
27115       공학
27116       공학
Name: category, Length: 27117, dtype: object

In [22]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(21693, 78)
(5424, 78)
(21693,)
(5424,)


In [23]:
rfm = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=3)
rfm.fit(x_train.values, y_train.values)

RandomForestClassifier(max_depth=20, min_samples_leaf=3)

In [24]:
pred = rfm.predict(x_test.values)

print(y_test.values, '\n')
print(pred)

['공학' '자연' '교육' ... '공학' '예체능' '인문/사회'] 

['공학' '자연' '교육' ... '공학' '예체능' '인문/사회']


In [25]:
acc = accuracy_score(y_test, pred)

print('Acc: {:.2f}%'.format(acc*100))

Acc: 84.81%


# find fail

In [26]:
y_test

24037       공학
17785       자연
20375       교육
13784       자연
9070       예체능
         ...  
1686     인문/사회
14349       공학
14662       공학
7894       예체능
11852    인문/사회
Name: category, Length: 5424, dtype: object

In [27]:
pred

array(['공학', '자연', '교육', ..., '공학', '예체능', '인문/사회'], dtype=object)

In [28]:
y_test.index

Int64Index([24037, 17785, 20375, 13784,  9070, 12943,  6141, 26458, 24740,
            10734,
            ...
            25790, 24949,   395,  7318, 20697,  1686, 14349, 14662,  7894,
            11852],
           dtype='int64', length=5424)

In [29]:
index_list, leng = [], len(y_test)
for yt, pr, i in zip(y_test, pred, [l for l in range(leng)]):
    if yt != pr:
        index_list.append(i)

In [30]:
# 틀린 과목 리스트
x_name = []
for i in index_list:
    x_name.append(x[i])
    
x_name

['아동미술보육전공',
 '아동보육전공',
 '사회복지보육전공',
 '아동표현미술보육과',
 '유아교육학과년제',
 '영재미술보육전공',
 '아동복지보육과년제',
 '유아안전보육과',
 '아동건강보육과',
 '아동문화계열',
 '아동상담보육과',
 '유아특수언어재활과',
 '아동체육무용과',
 '미술심리보육과',
 '아동무용지도자과',
 '아동미술과',
 '환경교육과',
 '아동평생교육학부',
 '디자인계열',
 '디자인학전공',
 '디자인과',
 '디지털미디어과',
 '문예영상창작과',
 '방송극작',
 '방송매체과',
 '방송보도제작계열',
 '방송영상사진과',
 '신문방송과',
 '인터넷방송전공',
 '아동미디어과',
 '방송연예제작학부',
 '광고디자인영상과',
 '미디어커뮤니케이션과',
 '방송미디어학과',
 '방송제작전공',
 '방송영상디지털계열',
 '첨단방송영상학과',
 '미디어출판학과',
 '컴퓨터멀티미디어전공',
 '게임소프트웨어전공',
 '스마트폰미디어학과',
 '디지털방송콘텐츠학과',
 '게임제작계열',
 '인터넷게임전공',
 '컴퓨터정보게임계열',
 '멀티게임영상과',
 '게임콘텐츠전공',
 '영상게임콘텐츠과',
 '방송음향영상학부',
 '멀티미디어통신학과',
 '출판미디어과',
 '광고사인디자인전공',
 '시각디자인계열',
 '시각디자인과',
 '캐릭터시각디자인전공',
 '디자인학부산업디자인과시각디자인전공',
 '시각디자인과년제',
 '디자인학부시각디자인전공',
 '시각디자인과시각정보디자인전공',
 '광고멀티미디어디자인전공',
 '그래픽디자인',
 '디지털미디어디자인학과',
 '디지털컨텐트디자인전공',
 '영상디자인과',
 '캐릭터디자인전공',
 '광고영상디자인전공',
 '디지털디자인전공',
 '캐릭터디자인과',
 '컴퓨터광고디자인전공',
 '산업디자인계열컴퓨터그래픽디자인전공',
 '컴퓨터영상디자인과',
 '애니메이션디자인전공',
 '콘텐츠디자인전공',
 '영상디자인학부',
 '콘텐츠디자인계열',
 '미디어디자인과예체능',
 '광고홍보

# Seogang Test

In [31]:
seogang_class = [
    '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', 
    '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', 
    '예체능', '예체능',  '예체능', '예체능', '예체능', '예체능', 
    '자연', '자연', '자연', '자연', 
    '공학', '공학', '공학', '공학', '공학', '공학'
]

seogang_major = [
    '국어국문학', '사학', '철학', '종교학', '영문학부', '유럽문화', '독일문화', '프랑스문화', '중국문화', '일본문화', 
    '사회학', '정치외교학', '심리학', '경제학', '경영학', 
    '국제한국학', '아트&테크놀로지', '신문방송학', '미디어&엔터테인먼트', '글로벌 한국학', '커뮤니케이션학', 
    '수학', '물리학', '화학', '생명과학', 
    '전자공학', '화공생명공학', '컴퓨터공학', '기계공학', '인공지능학', '시스템반도체공학'
]

In [32]:
majors = []
for word in seogang_major:
    majors.append(preprocess_name(word))
    
majors

['국어국문학',
 '사학',
 '철학',
 '종교학',
 '영문학부',
 '유럽문화',
 '독일문화',
 '프랑스문화',
 '중국문화',
 '일본문화',
 '사회학',
 '정치외교학',
 '심리학',
 '경제학',
 '경영학',
 '국제한국학',
 '아트테크놀로지',
 '신문방송학',
 '미디어엔터테인먼트',
 '글로벌한국학',
 '커뮤니케이션학',
 '수학',
 '물리학',
 '화학',
 '생명과학',
 '전자공학',
 '화공생명공학',
 '컴퓨터공학',
 '기계공학',
 '인공지능학',
 '시스템반도체공학']

In [33]:
emb = model.encode(majors)

scale_emb = scaler.transform(emb)

scale_emb.shape

(31, 768)

In [34]:
tes = pca.transform(scale_emb)

tes.shape

(31, 78)

In [35]:
pre = rfm.predict(tes)

print(seogang_class, '\n')
print(pre)

['인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '예체능', '예체능', '예체능', '예체능', '예체능', '예체능', '자연', '자연', '자연', '자연', '공학', '공학', '공학', '공학', '공학', '공학'] 

['인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회'
 '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '예체능' '인문/사회'
 '인문/사회' '인문/사회' '인문/사회' '인문/사회' '자연' '공학' '자연' '공학' '공학' '공학' '공학' '공학'
 '공학']


In [36]:
Acc = accuracy_score(seogang_class, pre)

print('Acc: {:.2f}%'.format(Acc*100))

Acc: 77.42%
