In [1]:
import pandas as pd
import numpy as np
import re
import joblib
from copy import deepcopy
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('preprocess.csv')

data

Unnamed: 0,name,category
0,아동영어보육과,0
1,아동영어보육학과,0
2,유아특수보육전공,1
3,특수보육계열,1
4,컴퓨터보육과,2
...,...,...
27112,스마트이동체융합시스템공학과,3
27113,학과간협동과정항만물류시스템학과,3
27114,동북아물류시스템학과,3
27115,항만물류시스템,3


In [3]:
def category_str(data, classification=['인문', '사회', '교육', '공학', '자연', '의약', '예체능']):
    copy_data = deepcopy(data)
    for i in range(7):
        copy_data.loc[ copy_data['category']==i , 'category' ] = classification[i]
    return copy_data

In [4]:
def preprocess_name(text):
    result = ''
    
    # 빈칸제거
    result = text.strip()
    
    # 한글만 추출
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')  # 한글과 띄어쓰기를 제외한 모든 글자
    result = hangul.sub('', result)
    
    # 1차
    remove_words = ['학과', '전공', '학부', '대학', '계열', '단과대학없음', '프로그램', '과']
    result = result.strip().replace(' ', '').strip()
    
    if result in remove_words:
        return '0'
    else:
        return result

In [5]:
# data1 = category_str(data)

# data1

In [6]:
data2 = category_str(data, ['인문/사회', '인문/사회', '교육', '공학', '자연', '의약', '예체능'])

data2

Unnamed: 0,name,category
0,아동영어보육과,인문/사회
1,아동영어보육학과,인문/사회
2,유아특수보육전공,인문/사회
3,특수보육계열,인문/사회
4,컴퓨터보육과,교육
...,...,...
27112,스마트이동체융합시스템공학과,공학
27113,학과간협동과정항만물류시스템학과,공학
27114,동북아물류시스템학과,공학
27115,항만물류시스템,공학


In [7]:
x = data2['name'].to_list()

x

['아동영어보육과',
 '아동영어보육학과',
 '유아특수보육전공',
 '특수보육계열',
 '컴퓨터보육과',
 '가정보육과',
 '보육과',
 '보육상담전공',
 '보육학과',
 '사회복지보육과',
 '생활보육과',
 '아동미술보육전공',
 '아동보육과',
 '아동복지보육과',
 '영유아보육과',
 '유아교육과',
 '유아교육전공',
 '유아교육학과',
 '유아미술음악전공',
 '아동미술보육과',
 '아동미술복지보육과',
 '아동보육학습지도과',
 '아동생활보육과',
 '아동컴퓨터보육과',
 '아동보육전공',
 '방과후아동보육과',
 '아동보육복지과',
 '유아특수보육과',
 '아동미술보육계열',
 '아동보육계열',
 '유아보육과',
 '유아보육미술과',
 '사회복지보육전공',
 '특수보육과',
 '아동문학보육과',
 '아동미술복지보육전공',
 '유아교육과년제',
 '영유아보육복지과',
 '아동복지미술보육과',
 '유아교육과인문',
 '미술보육과',
 '아동보육복지학과',
 '영유아보육전공',
 '아동미디어미술보육과',
 '아동표현미술보육과',
 '유아교육학과년제',
 '영재미술보육전공',
 '보육복지과',
 '아동보육학과',
 '영유아보육학부',
 '어린이영어보육과',
 '아동보육복지전공',
 '사회복지과아동보육복지전공',
 '유아특수보육과년제',
 '아동보육복지상담과',
 '아동보육과인문사회',
 '아동미술보육과년제',
 '유아보육문화계열',
 '영유아보육학과',
 '아동복지보육과년제',
 '유아보육계열',
 '유아안전보육과',
 '아동건강보육과',
 '아동보육과년제',
 '아동문화계열',
 '아동상담보육과',
 '유아특수보육학과',
 '아동컴퓨터보육전공',
 '아동조형미술보육과',
 '아동미디어미술보육전공',
 '아동표현미술보육전공',
 '아동조형미술보육전공',
 '유아특수치료교육',
 '유아특수치료교육과',
 '유아특수재활과',
 '장애유아보육과',
 '특수아동재활과',
 '유아특수언어재활과',
 '유아특수재활복지과',
 '특수아동언어재활과',
 '유아특

In [8]:
model = SentenceTransformer('jhgan/ko-sroberta-multitask')
embeddings = model.encode(x)

embeddings.shape

(27117, 768)

In [9]:
scaler = StandardScaler()
scale_embeddings = scaler.fit_transform(embeddings)

scale_embeddings.shape

(27117, 768)

In [10]:
def pca_decide(n, data=scale_embeddings):
    pca = PCA(n_components=n)  # 주성분을 몇개로 할지 결정
    test = pca.fit_transform(data)
    df = pd.DataFrame(data=test, columns=[f"pca{num+1}" for num in range(n)])
    
    var = pca.explained_variance_
    ratio = pca.explained_variance_ratio_

    results = pd.DataFrame({'설명가능한 분산 비율(고윳값)':var, '기여율':ratio},
                         index = np.array([f"pca{num+1}" for num in range(n)]))
    results['누적기여율'] = results['기여율'].cumsum()
    
    if results['누적기여율'][-1] >= 0.8:
        return results
    

def find_dimension():
    for n in range(3, 100):
        result = pca_decide(n)
        try:
            if result['설명가능한 분산 비율(고윳값)'][-1] >= 0.7:
                print(result)
                return n
        except:
            pass

In [21]:
dimension = find_dimension()

print(f"{dimension} 차원")

       설명가능한 분산 비율(고윳값)       기여율     누적기여율
pca1          42.979969  0.055962  0.055962
pca2          36.224873  0.047166  0.103128
pca3          33.651028  0.043815  0.146943
pca4          26.627037  0.034669  0.181612
pca5          23.817713  0.031012  0.212624
...                 ...       ...       ...
pca74          2.162385  0.002816  0.791082
pca75          2.107677  0.002744  0.793826
pca76          2.024973  0.002637  0.796463
pca77          2.011824  0.002619  0.799082
pca78          1.967391  0.002562  0.801644

[78 rows x 3 columns]
78 차원


In [22]:
pca = PCA(n_components=dimension) # 주성분을 몇개로 할지 결정
test = pca.fit_transform(scale_embeddings)
X = pd.DataFrame(data=test, columns=[f"pca{num+1}" for num in range(dimension)])

X

Unnamed: 0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,...,pca69,pca70,pca71,pca72,pca73,pca74,pca75,pca76,pca77,pca78
0,-10.551537,-0.762775,-0.160745,7.201861,2.393254,2.654530,6.030195,1.441372,8.255816,-1.908117,...,1.049923,-1.320524,-1.524433,0.357707,0.437007,-0.565915,0.185978,0.971343,-1.348406,1.382653
1,-8.657743,-1.747990,1.240400,6.341300,-2.442683,3.103930,3.576960,-2.316741,4.974585,-1.694937,...,0.589120,-0.989491,-2.198092,0.590839,0.820283,0.680415,0.706436,0.680426,-0.941230,1.105813
2,-4.485785,-2.786307,-0.961320,9.138314,-3.725596,3.920368,-3.522317,1.608313,6.323623,-1.267553,...,2.292867,-0.988961,2.904807,3.130840,2.332872,-0.386782,-0.625766,0.476261,1.572197,-1.370518
3,-5.706595,-1.581038,-3.901399,8.444745,0.884113,1.012693,-1.870608,3.692781,4.374101,-1.916776,...,-0.366730,2.394121,5.103293,-0.536843,2.417065,-0.966940,0.683079,0.486850,2.501941,-1.943419
4,-2.114228,-1.476661,5.808383,10.699887,5.400667,4.259630,4.669299,2.216892,6.183907,-2.881601,...,0.144234,-1.237799,0.284154,1.829115,0.960550,-2.840800,-0.898652,1.493218,0.338334,-0.365020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27112,9.792550,-3.351503,6.869967,0.304764,3.303104,-0.960837,-2.588651,4.521898,1.489481,7.109532,...,1.725079,1.271450,-0.567695,-0.111219,0.883647,0.796527,0.354574,1.098843,-0.328075,0.063152
27113,4.715854,-4.266459,0.707244,-6.709499,1.166898,-0.377644,-6.187341,1.759470,6.464660,-0.776340,...,-0.275945,-0.230087,1.987767,-0.122264,0.829649,1.910535,0.391578,0.989922,-0.975805,-0.248757
27114,-0.030081,-5.204044,1.515163,-13.278986,3.867821,-2.989610,-0.761097,1.778814,4.786293,0.246323,...,0.558633,-0.716457,1.920569,1.483391,2.957964,1.674624,-0.832306,-1.511633,-0.321255,-0.687260
27115,-0.031049,-2.405849,0.565100,-9.701809,11.256230,-3.154013,2.138202,5.574687,5.877253,0.915709,...,-0.376467,-2.232781,1.813420,0.661504,2.585765,-0.084849,0.974733,-2.725560,-1.177815,1.648617


In [30]:
y = data2['category']

y

0        인문/사회
1        인문/사회
2        인문/사회
3        인문/사회
4           교육
         ...  
27112       공학
27113       공학
27114       공학
27115       공학
27116       공학
Name: category, Length: 27117, dtype: object

In [15]:
rfm = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=3)

rfm.fit(X.values, y.values)

joblib.dump(model, './rf_model.pkl')

In [16]:
seogang_class = [
    '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', 
    '인문/사회', '인문/사회', '인문/사회', '인문/사회', '인문/사회', 
    '예체능', '예체능',  '예체능', '예체능', '예체능', '예체능', 
    '자연', '자연', '자연', '자연', 
    '공학', '공학', '공학', '공학', '공학', '공학'
]

seogang_major = [
    '국어국문학', '사학', '철학', '종교학', '영문학부', '유럽문화', '독일문화', '프랑스문화', '중국문화', '일본문화', 
    '사회학', '정치외교학', '심리학', '경제학', '경영학', 
    '국제한국학', '아트&테크놀로지', '신문방송학', '미디어&엔터테인먼트', '글로벌 한국학', '커뮤니케이션학', 
    '수학', '물리학', '화학', '생명과학', 
    '전자공학', '화공생명공학', '컴퓨터공학', '기계공학', '인공지능학', '시스템반도체공학'
]

In [17]:
majors = []
for word in seogang_major:
    majors.append(preprocess_name(word))
    
majors

['국어국문학',
 '사학',
 '철학',
 '종교학',
 '영문학부',
 '유럽문화',
 '독일문화',
 '프랑스문화',
 '중국문화',
 '일본문화',
 '사회학',
 '정치외교학',
 '심리학',
 '경제학',
 '경영학',
 '국제한국학',
 '아트테크놀로지',
 '신문방송학',
 '미디어엔터테인먼트',
 '글로벌한국학',
 '커뮤니케이션학',
 '수학',
 '물리학',
 '화학',
 '생명과학',
 '전자공학',
 '화공생명공학',
 '컴퓨터공학',
 '기계공학',
 '인공지능학',
 '시스템반도체공학']

In [18]:
emb = model.encode(majors)

scale_emb = scaler.transform(emb)

scale_emb.shape

(31, 768)

In [23]:
tes = pca.transform(scale_emb)

tes.shape

(31, 78)

In [29]:
loaded_model = joblib.load('./rf_model.pkl')

score = loaded_model.score(tes, seogang_class)
pred = loaded_model.predict(tes)

print(pred)
print('정확도: {score:.3f}'.format(score=score))

['인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회'
 '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '인문/사회' '예체능' '인문/사회'
 '인문/사회' '인문/사회' '인문/사회' '자연' '자연' '인문/사회' '자연' '공학' '공학' '공학' '공학' '공학'
 '공학']
정확도: 0.806
