In [1]:
# 분야 2 | 유선 네트워크 장비의 신속한 조치를 위한 경보 유형 분류

# 아래의 코드는 다음과 같은 순서로 구성되어 있습니다.
# 1. 데이터 로드 및 전처리
# 2. 모델 학습 및 예측
# 3. 결과 제출

In [21]:
# 라이브러리
import os
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import ComplementNB
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [28]:
# 데이터 로드
search_path = os.getcwd()
file_tr = "Q2_train.csv"
file_te = "Q2_test.csv"
file_samp = "Q2_label_sample.csv"

for root, dirs, files in os.walk(search_path):
    if file_tr in files:
        file_path = os.path.join(root, file_tr)
        train_df = pd.read_csv(file_path)
    if file_te in files:
        file_path = os.path.join(root, file_te)
        test_df = pd.read_csv(file_path)
    if file_samp in files:
        file_path = os.path.join(root, file_samp)
        label = pd.read_csv(file_path)

In [23]:
# 메세지 전처리 함수
def ppr_msg(s):
    s = s.lower()
    s = s.replace('(', '').replace(')', '') # 괄호 제거
    s = s.replace(':', '').replace('+', '') # ':', '+' 제거
    s = s.replace('-', ' ').replace('_', ' ') # '-', '_' 구분자 공백으로 바꾸기

    # 의미 없는 말 제거, ABC 기지국 보편적인 약어 풀이를 사용하여
    # 의미에 따라 경보 메세지 전처리
    s = s.replace('remote defect indication', 'rdi') # ex. remote rdi로 바뀔 것, defect와 indication은 의미 없는 말
    s = s.replace('remote fault indication', 'remote rfi')
    s = s.replace('fan 48v fail', 'fan fail')
    s = s.replace('48v fail', 'input pwr conn mis')
    s = s.replace('abnormal condition laser forced', 'laser force')
    s = s.replace('battery', 'batt')
    s = s.replace('dc input a fail', 'dc input fail')
    s = s.replace('dcc', 'dcc communicate channel')
    s = s.replace('dcn', 'dcc comunication network') #communicate라고 해도 결과 안달라지는지 확인
    s = s.replace('down los', 'down loss signal')
    s = s.replace('ether', 'eth')
    s = s.replace('llcf', 'loss carry forward')
    s = s.replace(' fault', '')
    s = s.replace('failure', 'fail')
    s = s.replace(' alarm', '')
    s = s.replace('improper removal', 'remove')
    s = s.replace(' defect', '')
    s = s.replace('power', 'pwr')
    s = s.replace('links', 'link')
    s = s.replace(' with', '')
    s = s.replace('loss of connectivity', 'loss connectivity')
    s = s.replace('loss of signal', 'loss signal')
    s = s.replace('mep lsp loc', 'mep lsp loss connectivity')
    s = s.replace('lsp loc', 'lsp oam loss connectivity')
    s = s.replace('manual management removal', 'mng remove')
    s = s.replace('lsp loc', 'lsp loss connectivity')
    s = s.replace('rdi', 'remote rdi')
    s = s.replace('csf', 'client signal fail')
    s = s.replace('pwe ', 'pw eth ')
    s = s.replace('pw eth loc', 'pw eth loss connectivity')
    s = s.replace(' sfp', '')
    s = s.replace(' via', '')
    s = s.replace(' primary', '')
    s = s.replace('reachable', 'reach')
    s = s.replace(' of', '')
    s = s.replace('oamloss', 'oam loss')
    s = s.replace('opt los', 'opt loss signal')
    s = s.replace('opt mis', 'opt mismatch')
    s = s.replace('trk conn mis', 'trk conn mismatch')
    s = s.replace('portshutdown', 'port down')
    s = s.replace('protectionswitched', 'protect switch')
    s = s.replace('pw loc', 'pw loss continuity')
    s = s.replace('restarted', 'restart')
    s = s.replace('stm1 los', 'sync transport module loss signal')
    s = s.replace('lof', 'loss frame')
    s = s.replace('rmt', 'remote')
    s = s.replace('unexpected', 'unexpect')
    s = s.replace(' by', '')
    s = s.replace('ipc fail', 'ipc communicate fail')
    s = s.replace('unit rem', 'unit remove')
    s = s.replace('raise', '')
    s = s.replace('synchronization', 'sync')
    s = s.replace(' signals', '')

    return s

In [24]:
# 데이터 전처리
def ppr_data(q2_train, q2_test):
    # TODO: 데이터 전처리 코드 구현 ---------- #

    # alarmmsg 전처리
    for i in range(len(q2_train)):
      s = q2_train['alarmmsg_original'].iloc[i]
      q2_train['alarmmsg_original'].iloc[i] = ppr_msg(s)
    for i in range(len(q2_test)):
      s = q2_test['alarmmsg_original'].iloc[i]
      q2_test['alarmmsg_original'].iloc[i] = ppr_msg(s)

    # slotna 변수 생성
    q2_train['slotna'] = q2_train['slot'].isna().astype(int)
    q2_test['slotna'] = q2_test['slot'].isna().astype(int)

    # alarmno 자릿수
    for i in range(len(q2_train)):
        q2_train['alarmno'][i] = int(str(q2_train['alarmno'][i])[0:6])
    for i in range(len(q2_test)):
        q2_test['alarmno'][i] = int(str(q2_test['alarmno'][i])[0:6])
    
    # 필요 없는 변수 drop
    q2_train.drop(labels = ['slot', 'port', 'sva', 'root_cause_domain'], axis = 1, inplace = True)
    q2_test.drop(labels = ['slot', 'port', 'sva', 'root_cause_domain'], axis = 1, inplace = True)

    # 열 순서 정렬
    q2_train = q2_train[[col for col in q2_train.columns if col != 'root_cause_type'] + ['root_cause_type']]

    # time 변수 datetime으로 타입 변경
    q2_train['alarmtime'] = pd.to_datetime(q2_train['alarmtime'])
    q2_test['alarmtime'] = pd.to_datetime(q2_test['alarmtime'])

    # train, test 데이터 합치기
    q2_train['train'] = 1
    q2_test['train'] = 0
    q2_train = q2_train[[col for col in q2_train.columns if col != 'root_cause_type'] + ['root_cause_type']]
    data_total = pd.concat([q2_train, q2_test])

    # alarmtime 변수로 지속시간 변수 추가 및 time변수 삭제
    data = data_total.groupby('ticketno')['alarmtime'].agg(**{'min_time':'min', 'max_time':'max'}).reset_index()
    data['duration'] = data['max_time'] - data['min_time']
    for i in range(len(data['duration'])):
      data['duration'][i] = data['duration'][i].seconds / 60

    # 전표별 level 당 개수 변수 생성
    level = data_total.groupby(['ticketno','alarmlevel']).size().unstack(fill_value = 0).reset_index()
    data['level3'] = level[3]
    data['level4'] = level[4]
    data['level5'] = level[5]
    data['level7'] = level[7]
    data.drop(labels=['max_time'], axis = 1, inplace = True)

    # 전표별 alarmmsg 이어붙이기
    msg = data_total.groupby("ticketno")["alarmmsg_original"].apply(list).reset_index()

    # 전표별 데이터 합치기
    data = pd.merge(data, msg, how='left')
    data = pd.merge(data, data_total.groupby('ticketno')['alarmno'].agg(**{'alarmno':max}).reset_index())
    data = pd.merge(data, data_total.groupby('ticketno')['train'].agg(**{'train':max}).reset_index())
    data = pd.merge(data, data_total.groupby('ticketno')['slotna'].agg(**{'slotna': max}).reset_index())
    data = pd.merge(data, data_total.groupby('ticketno')['root_cause_type'].agg(**{'type':max}).reset_index())   

    data['alarmmsg_original'] = data['alarmmsg_original'].apply(lambda msg_list: ' '.join(msg_list))
    
    # data 시간별로 sort
    data.sort_values('min_time', inplace = True)

    # train, test 분리
    data_train = data[data['train'] == 1]
    data_test = data[data['train'] == 0]
    data_train.drop(labels = ['train'], axis = 1, inplace = True)
    data_test.drop(labels = ['train'], axis = 1, inplace = True)

    # x, y 변수 분리
    x_train_df = data_train.iloc[:, :-1]
    y_train_df = data_train['type']
    x_test_df = data_test.iloc[:, :-1]

    # ------------------------------------- #
    return x_train_df, y_train_df, x_test_df


x_train_df, y_train_df, x_test_df = ppr_data(train_df, test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q2_train['alarmmsg_original'].iloc[i] = ppr_msg(s)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q2_test['alarmmsg_original'].iloc[i] = ppr_msg(s)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q2_train['alarmno'][i] = int(str(q2_train['alarmno'][i])[0:6])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q2_t

In [25]:
# 2. 모델 학습 및 예측
class MyModel:
    def __init__(self) -> None:
        self.model = None

    def train(self, x_train, y_train):
        # TODO: 모델 학습 코드 구현 ---------- #
        # TfidfVertorizer로 메시지 수치화 후 Multinomial naive bayes 모델링

        # Create transformers
        message_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc.split(), analyzer = 'char', sublinear_tf = True, max_df=0.8)
        additional_scaler = MinMaxScaler()

        # Define column indices for the ColumnTransformer
        message_col_idx = 'alarmmsg_original'
        additional_cols = ['alarmno', 'duration', 'level3', 'level4', 'level5', 'level7', 'slotna']

        # Create the ColumnTransformer
        preprocessor = ColumnTransformer(
            transformers=[
                ('message', message_vectorizer, message_col_idx),
                ('additional', additional_scaler, additional_cols)
            ],
            remainder='passthrough'
        )

        
        # Create individual classifiers
        classifier1 = ComplementNB(alpha=11)
        classifier2 = CatBoostClassifier(n_estimators=80, random_seed = 425)  # Adjust hyperparameters


        # Create the VotingClassifier
        voting_classifier = VotingClassifier(
            estimators=[
                ('nb', classifier1),
                ('cb', classifier2)
            ],
            voting='soft',  # You can use 'hard' or 'soft' voting
            
        )
        

        # Create a pipeline with the ColumnTransformer and the classifier
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', voting_classifier)
        ])

        x_train = x_train[['alarmmsg_original'] + additional_cols]

        # Train the pipeline
        self.model = pipeline.fit(x_train, y_train)


        # --------------------------------- #
        pass

    def predict(self, x_test):
        # 1. ticketno 컬럼은 입력받은 값으로 채우고,
        # 2. pred 컬럼은 모두 'LinkCut' 값으로 채운 데이터프레임 생성
        pred_df = pd.DataFrame({'ticketno': x_test['ticketno'].values, 'root_cause_type': self.model.predict(x_test)})
        pred_df.sort_values('ticketno', inplace=True)
        pred_df.reset_index(drop=True, inplace=True)
        return pred_df

model = MyModel()
model.train(x_train_df, y_train_df)
y_pred = model.predict(x_test_df) # 0.935752



Learning rate set to 0.5
0:	learn: 0.4577124	total: 66ms	remaining: 5.21s
1:	learn: 0.2766524	total: 67.8ms	remaining: 2.64s
2:	learn: 0.1816883	total: 70ms	remaining: 1.8s
3:	learn: 0.1323200	total: 71.7ms	remaining: 1.36s
4:	learn: 0.0978221	total: 73.4ms	remaining: 1.1s
5:	learn: 0.0749183	total: 74.9ms	remaining: 924ms
6:	learn: 0.0613773	total: 76.3ms	remaining: 796ms
7:	learn: 0.0521487	total: 78.1ms	remaining: 703ms
8:	learn: 0.0460331	total: 83.7ms	remaining: 660ms
9:	learn: 0.0420598	total: 92.3ms	remaining: 646ms
10:	learn: 0.0392565	total: 94.1ms	remaining: 591ms
11:	learn: 0.0359321	total: 96ms	remaining: 544ms
12:	learn: 0.0326422	total: 97.8ms	remaining: 504ms
13:	learn: 0.0308531	total: 99.5ms	remaining: 469ms
14:	learn: 0.0279695	total: 101ms	remaining: 439ms
15:	learn: 0.0265470	total: 103ms	remaining: 414ms
16:	learn: 0.0251256	total: 106ms	remaining: 394ms
17:	learn: 0.0228155	total: 113ms	remaining: 389ms
18:	learn: 0.0209796	total: 115ms	remaining: 368ms
19:	learn:

In [26]:
print(list(y_pred['root_cause_type']).count('LinkCut'))
print(list(y_pred['root_cause_type']).count('PowerFail'))
print(list(y_pred['root_cause_type']).count('UnitFail'))

2571
1563
193


In [29]:
# 3. 결과 제출
# 본 코드는 제출되는 파일의 형태에 대한 가이드로, 반드시 아래 구조를 따를 필요 없이 자유롭게 코드를 작성해도 무방합니다.
# 제출 포맷에 대해선 data/Q2_label_sample.csv를 참조하세요.
#
# 분야 2의 경우, 전표(ticket) 하나에 하나의 근원장애(root_cause_type)을 매칭해야 합니다.
#   주의: 경보(alarm) 개수와 전표(ticket) 개수는 다르며, 예측할 대상은 전표입니다.
#   주의: ticketno 컬럼 기준으로 오름차순 정렬이 필요합니다.
# 분야 2의 제출 파일은 2개 컬럼 [ticketno, root_cause_type]을 가져야 합니다.

def submitResult(pred):
    try:
        global label
        # ticketno 순서와 개수가 일치하는지 확인
        if (label['ticketno'] == pred['ticketno']).all():
            print("Check: ticketno 순서와 샘플 수가 일치합니다.")
        else:
            print("Warning: 테스트 세트와 모델 예측의 ticketno가 일치하지 않습니다.")
            return

        pred.to_csv('Q2_submitResult_자릿수_ran=425_alpha=13_n_es=87_cnb.csv', index=False)
        print("Done : Q2_submitResult.csv 파일로 저장되었습니다.")
    except Exception as e:
        # 예외가 발생한 경우 오류 메시지 출력
        print("Error:", e)

submitResult(y_pred)

Check: ticketno 순서와 샘플 수가 일치합니다.
Done : Q2_submitResult.csv 파일로 저장되었습니다.
