In [74]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
seed_value = 777

In [76]:
import pandas as pd
import numpy as np
import random
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight


from sklearn.metrics import accuracy_score

from lightgbm import LGBMClassifier

import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

import joblib

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization
from tensorflow.keras.layers import MultiHeadAttention, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D
from tensorflow.keras import Input, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy

from tensorflow.keras.models import save_model
from tensorflow.keras.models import load_model

tf.random.set_seed(seed_value)
np.random.seed(seed_value)
random.seed(seed_value)

In [77]:
# column의 각 행이 동일한 데이터로 되어있는지 확인하는 함수
# ex) a,b,c로 되어있는 행에서 a=b=c인지 확인, 다르면 list에 append

def different_value_count(df, column):
    different_label = []

    for i in range(df.shape[0]):
        type_value = df.loc[df.index[i], column]
        if len(set(type_value)) > 1:
            different_label.append(df.index[i])

    print(len(different_label))

In [78]:
# column의 각 행이 동일한 데이터로 되어있는지 확인하는 함수
# ex) a,b,c로 되어있는 행에서 a=b=c인지 확인

def different_value_df(df, column):
    different_index = []

    for i in range(df.shape[0]):
        type_value = df.loc[df.index[i], column]
        if len(set(type_value)) > 1:
            different_index.append(df.index[i])

    different_label = pd.DataFrame(different_index, columns = ['ticketno'])
    different_label = pd.merge(different_label, df[column], how = 'inner', on = 'ticketno')

    return different_label

In [79]:
# root_cause_type의 각 행의 값이 동일하면 맨 처음 값으로 변경되는 함수
# ex) a=b=c ==> a로 변경

def replace_root_cause_type(row):
    if 'PowerFail' in row:
        return 'PowerFail'
    elif 'UnitFail' in row:
        return 'UnitFail'
    elif 'LinkCut' in row:
        return 'LinkCut'
    return row

In [80]:
def unpack_list(lst):
    """
    list를 str로 변환하는 함수

    Args:
        lst (list): str로 변환하고 싶은 list

    Returns:
        str로 변환된 list의 요소들을 ','로 연결시킨 문자열
    """

    return ','.join(str(x) for x in lst)

In [81]:
label_sample_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/Q2_label_sample.csv')
train_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/Q2_train.csv')
test_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/Q2_test.csv')

In [82]:
train_change_msg = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/train_map_3.csv')
test_change_msg = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/test_map_3.csv')

In [83]:
print(label_sample_org.shape)
print(train_org.shape)
print(test_org.shape)

(4327, 2)
(9322, 13)
(37671, 12)


In [84]:
print(train_change_msg.shape)
print(test_change_msg.shape)

(9322, 13)
(37671, 12)


# **train, test divide**

In [85]:
# 필요 컬럼들만 모아 새로운 df 생성

col = ['ticketno', 'alarmtime', 'alarmlevel', 'alarmmsg_original', 'root_cause_type']
train_divide = train_change_msg[col]
train_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original,root_cause_type
0,21122633.0,2022-12-01 00:02:24+09:00,5,ETHERNET-ERROR,LinkCut
1,21122633.0,2022-12-01 00:17:15+09:00,5,ETHERNET-ERROR,LinkCut
2,21122633.0,2022-12-01 00:32:11+09:00,5,ETHERNET-ERROR,LinkCut
3,21122633.0,2022-12-01 00:47:10+09:00,5,ETHERNET-ERROR,LinkCut
4,21122633.0,2022-12-01 01:02:24+09:00,5,ETHERNET-ERROR,LinkCut


In [86]:
# ticketno / alarmtime / alarmlevel 별로 정렬(sort)

train_divide = train_divide.sort_values(['ticketno', 'alarmtime', 'alarmlevel', 'root_cause_type'],
                                        ascending = [True, True, False, True])
train_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original,root_cause_type
1087,14753084.0,2022-12-02 13:13:48+09:00,7,ETHERNET-LINK-FAIL,PowerFail
1088,14753084.0,2022-12-02 13:13:48+09:00,7,ETHERNET-LINK-FAIL,PowerFail
1089,14753084.0,2022-12-02 13:13:48+09:00,7,ETHERNET-LINK-FAIL,PowerFail
1085,14753084.0,2022-12-02 13:13:48+09:00,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
1086,14753084.0,2022-12-02 13:13:48+09:00,4,POWER-SUPPLY-UNIT-FAIL,PowerFail


In [87]:
test_col = ['ticketno', 'alarmtime', 'alarmlevel', 'alarmmsg_original']

test_divide = test_change_msg[test_col]
test_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original
0,21812391.0,2022-12-25 00:02:16+09:00,5,LOSS-OF-SIGNAL
1,21775988.0,2022-12-25 00:02:51+09:00,5,LOSS-OF-SIGNAL
2,21792259.0,2022-12-25 00:03:22+09:00,4,ALARM-RECEIVE-POWER-HIGH
3,21812412.0,2022-12-25 00:03:33+09:00,5,BATTERY-ENVIRONMENT-FAIL
4,21812417.0,2022-12-25 00:03:39+09:00,5,LOSS-OF-SIGNAL


In [88]:
test_divide = test_divide.sort_values(['ticketno', 'alarmtime', 'alarmlevel'],
                                      ascending = [True, True, False])
test_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original
7438,15238899.0,2022-12-26 13:29:12+09:00,7,ETHERNET-LINK-FAIL
7439,15238899.0,2022-12-26 13:29:12+09:00,7,ETHERNET-LINK-FAIL
7436,15238899.0,2022-12-26 13:29:12+09:00,4,POWER-SUPPLY-UNIT-FAIL
7437,15238899.0,2022-12-26 13:29:12+09:00,4,POWER-SUPPLY-UNIT-FAIL
2769,15712444.0,2022-12-25 16:34:07+09:00,4,POWER-SUPPLY-UNIT-FAIL


# **중복제거**

## **train**

In [89]:
# ticketno 기준으로 중복된 행이 있으면 제일 처음꺼만 남기고 제거

column = ['ticketno','alarmlevel','alarmmsg_original','root_cause_type']
train_first = train_divide[column].drop_duplicates(keep = 'first')
train_first.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,root_cause_type
1087,14753084.0,7,ETHERNET-LINK-FAIL,PowerFail
1085,14753084.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
1090,14753084.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,PowerFail
1728,14771766.0,7,ETHERNET-LINK-FAIL,PowerFail
1726,14771766.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail


In [90]:
# ticketno를 그룹화하여 level과 msg 리스트로 만들기

train_first = pd.DataFrame(train_first.groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original', 'root_cause_type']].agg(list))
train_first.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14753084.0,"[7, 4, 5]","[ETHERNET-LINK-FAIL, POWER-SUPPLY-UNIT-FAIL, E...","[PowerFail, PowerFail, PowerFail]"
14771766.0,"[7, 4]","[ETHERNET-LINK-FAIL, POWER-SUPPLY-UNIT-FAIL]","[PowerFail, PowerFail]"
14777089.0,"[7, 4]","[ETHERNET-LINK-FAIL, POWER-SUPPLY-UNIT-FAIL]","[PowerFail, PowerFail]"
14790052.0,[4],[POWER-SUPPLY-UNIT-FAIL],[PowerFail]
14879922.0,"[7, 4, 5]","[ETHERNET-LINK-FAIL, POWER-SUPPLY-UNIT-FAIL, E...","[PowerFail, PowerFail, PowerFail]"


In [91]:
max_count_ticketno = max(train_first['alarmlevel'].apply(len))
max_count_ticketno

6

In [92]:
# root_cause_type의 변경 값이 없는지 확인
different_value_count(train_first, 'root_cause_type')

0


In [93]:
# map(lambda)를 사용하여 리스트 풀기
train_first = pd.DataFrame(train_first.applymap(lambda x: unpack_list(x)))

print(len(train_first))
train_first.head(10)

1114


Unnamed: 0_level_0,alarmlevel,alarmmsg_original,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14753084.0,745,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL,ETHE...","PowerFail,PowerFail,PowerFail"
14771766.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL","PowerFail,PowerFail"
14777089.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL","PowerFail,PowerFail"
14790052.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
14879922.0,745,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL,ETHE...","PowerFail,PowerFail,PowerFail"
14901137.0,745,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL,ETHE...","PowerFail,PowerFail,PowerFail"
14919180.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL","PowerFail,PowerFail"
14922559.0,445,"48V-FAIL,FAN-FAIL,48V-FAIL","PowerFail,PowerFail,PowerFail"
14999487.0,77,"OPTICAL-REMOVE,OPTICAL-LOSS-OF-SIGNAL","UnitFail,UnitFail"
15036874.0,7,NON-VOLATILE-RANDOM-ACCESS-MEMORY-FAIL,UnitFail


In [94]:
train_first_reset = train_first.reset_index()

train_first_reset.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,root_cause_type
0,14753084.0,745,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL,ETHE...","PowerFail,PowerFail,PowerFail"
1,14771766.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL","PowerFail,PowerFail"
2,14777089.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL","PowerFail,PowerFail"
3,14790052.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
4,14879922.0,745,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL,ETHE...","PowerFail,PowerFail,PowerFail"


In [95]:
# 새로운 데이터프레임 생성을 위한 빈 리스트
new_rows = []

# 데이터프레임 순회하며 ','로 분리된 값을 새로운 행으로 추가
for _, row in train_first_reset.iterrows():
    alarmlevels = row['alarmlevel'].split(',')
    alarmmsgs = row['alarmmsg_original'].split(',')
    root_cause_types = row['root_cause_type'].split(',')

    # 분리된 값들을 새로운 행으로 추가
    for i in range(len(alarmlevels)):
        new_row = {
            'ticketno': row['ticketno'],
            'alarmlevel': alarmlevels[i],
            'alarmmsg_original': alarmmsgs[i],
            'root_cause_type': root_cause_types[i]
        }
        new_rows.append(new_row)

# 새로운 데이터프레임 생성
train_level_desc = pd.DataFrame(new_rows)

# 결과 출력
train_level_desc.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,root_cause_type
0,14753084.0,7,ETHERNET-LINK-FAIL,PowerFail
1,14753084.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
2,14753084.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,PowerFail
3,14771766.0,7,ETHERNET-LINK-FAIL,PowerFail
4,14771766.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail


In [96]:
train_level_asc = train_level_desc.sort_values(['ticketno', 'alarmlevel', 'root_cause_type'])
train_level_asc.head(10)

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,root_cause_type
1,14753084.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
2,14753084.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,PowerFail
0,14753084.0,7,ETHERNET-LINK-FAIL,PowerFail
4,14771766.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
3,14771766.0,7,ETHERNET-LINK-FAIL,PowerFail
6,14777089.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
5,14777089.0,7,ETHERNET-LINK-FAIL,PowerFail
7,14790052.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
9,14879922.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
10,14879922.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,PowerFail


In [97]:
train_level_asc = pd.DataFrame(train_level_asc.groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original', 'root_cause_type']].agg(list))
train_level_asc.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14753084.0,"[4, 5, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-NO-RECEIVE-T...","[PowerFail, PowerFail, PowerFail]"
14771766.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[PowerFail, PowerFail]"
14777089.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[PowerFail, PowerFail]"
14790052.0,[4],[POWER-SUPPLY-UNIT-FAIL],[PowerFail]
14879922.0,"[4, 5, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-NO-RECEIVE-T...","[PowerFail, PowerFail, PowerFail]"


In [98]:
train_level_asc = pd.DataFrame(train_level_asc.applymap(lambda x: unpack_list(x)))

print(len(train_level_asc))
train_level_asc.head(10)

1114


Unnamed: 0_level_0,alarmlevel,alarmmsg_original,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14753084.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","PowerFail,PowerFail,PowerFail"
14771766.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","PowerFail,PowerFail"
14777089.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","PowerFail,PowerFail"
14790052.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
14879922.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","PowerFail,PowerFail,PowerFail"
14901137.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","PowerFail,PowerFail,PowerFail"
14919180.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","PowerFail,PowerFail"
14922559.0,445,"48V-FAIL,FAN-FAIL,48V-FAIL","PowerFail,PowerFail,PowerFail"
14999487.0,77,"OPTICAL-REMOVE,OPTICAL-LOSS-OF-SIGNAL","UnitFail,UnitFail"
15036874.0,7,NON-VOLATILE-RANDOM-ACCESS-MEMORY-FAIL,UnitFail


In [99]:
# 변경되는 값이 없으므로 root_cause_type을 한개의 값으로 변경

train_level_asc['root_cause_type'] = train_level_asc['root_cause_type'].apply(replace_root_cause_type)
train_level_asc.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14753084.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...",PowerFail
14771766.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL",PowerFail
14777089.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL",PowerFail
14790052.0,4,POWER-SUPPLY-UNIT-FAIL,PowerFail
14879922.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...",PowerFail


In [100]:
# FastText를 위한 전처리를 완료한 파일

train_level_asc.to_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/train_FT.csv', index = False)

## **test**

In [101]:
# ticketno 기준으로 중복된 행이 있으면 제일 처음꺼만 남기고 제거

col = ['ticketno','alarmlevel','alarmmsg_original']
test_first = test_divide[col].drop_duplicates(keep = 'first')
test_first.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original
7438,15238899.0,7,ETHERNET-LINK-FAIL
7436,15238899.0,4,POWER-SUPPLY-UNIT-FAIL
2769,15712444.0,4,POWER-SUPPLY-UNIT-FAIL
5965,15723187.0,5,ETHERNET-NO-RECEIVE-TRAFFIC
6082,15723187.0,4,POWER-SUPPLY-UNIT-FAIL


In [102]:
test_first = pd.DataFrame(test_first.groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original']].agg(list))
test_first.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1
15238899.0,"[7, 4]","[ETHERNET-LINK-FAIL, POWER-SUPPLY-UNIT-FAIL]"
15712444.0,[4],[POWER-SUPPLY-UNIT-FAIL]
15723187.0,"[5, 4]","[ETHERNET-NO-RECEIVE-TRAFFIC, POWER-SUPPLY-UNI..."
15737103.0,"[7, 4]","[ETHERNET-LINK-FAIL, POWER-SUPPLY-UNIT-FAIL]"
15737132.0,"[7, 4]","[ETHERNET-LINK-FAIL, POWER-SUPPLY-UNIT-FAIL]"


In [103]:
max_count_ticketno = max(test_first['alarmlevel'].apply(len))
max_count_ticketno

8

In [104]:
test_first = pd.DataFrame(test_first.applymap(lambda x: unpack_list(x)))

print(len(test_first))
test_first.head(10)

4327


Unnamed: 0_level_0,alarmlevel,alarmmsg_original
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1
15238899.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL"
15712444.0,4,POWER-SUPPLY-UNIT-FAIL
15723187.0,54,"ETHERNET-NO-RECEIVE-TRAFFIC,POWER-SUPPLY-UNIT-..."
15737103.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL"
15737132.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL"
16188560.0,4,BOOTING
16199806.0,4,POWER-SUPPLY-UNIT-FAIL
16201784.0,7455,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL,ETHE..."
16203726.0,475,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE..."
16237988.0,7,OPTICAL-REMOVE


In [105]:
test_first_reset = test_first.reset_index()

test_first_reset.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original
0,15238899.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL"
1,15712444.0,4,POWER-SUPPLY-UNIT-FAIL
2,15723187.0,54,"ETHERNET-NO-RECEIVE-TRAFFIC,POWER-SUPPLY-UNIT-..."
3,15737103.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL"
4,15737132.0,74,"ETHERNET-LINK-FAIL,POWER-SUPPLY-UNIT-FAIL"


In [106]:
# 새로운 데이터프레임 생성을 위한 빈 리스트
new_rows = []

# 데이터프레임 순회하며 ','로 분리된 값을 새로운 행으로 추가
for _, row in test_first_reset.iterrows():
    alarmlevels = row['alarmlevel'].split(',')
    alarmmsgs = row['alarmmsg_original'].split(',')


    # 분리된 값들을 새로운 행으로 추가
    for i in range(len(alarmlevels)):
        new_row = {
            'ticketno': row['ticketno'],
            'alarmlevel': alarmlevels[i],
            'alarmmsg_original': alarmmsgs[i]
        }
        new_rows.append(new_row)

# 새로운 데이터프레임 생성
test_level_desc = pd.DataFrame(new_rows)

# 결과 출력
test_level_desc.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original
0,15238899.0,7,ETHERNET-LINK-FAIL
1,15238899.0,4,POWER-SUPPLY-UNIT-FAIL
2,15712444.0,4,POWER-SUPPLY-UNIT-FAIL
3,15723187.0,5,ETHERNET-NO-RECEIVE-TRAFFIC
4,15723187.0,4,POWER-SUPPLY-UNIT-FAIL


In [107]:
test_level_asc = test_level_desc.sort_values(['ticketno', 'alarmlevel'])
test_level_asc.head(10)

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original
1,15238899.0,4,POWER-SUPPLY-UNIT-FAIL
0,15238899.0,7,ETHERNET-LINK-FAIL
2,15712444.0,4,POWER-SUPPLY-UNIT-FAIL
4,15723187.0,4,POWER-SUPPLY-UNIT-FAIL
3,15723187.0,5,ETHERNET-NO-RECEIVE-TRAFFIC
6,15737103.0,4,POWER-SUPPLY-UNIT-FAIL
5,15737103.0,7,ETHERNET-LINK-FAIL
8,15737132.0,4,POWER-SUPPLY-UNIT-FAIL
7,15737132.0,7,ETHERNET-LINK-FAIL
9,16188560.0,4,BOOTING


In [108]:
test_level_asc = pd.DataFrame(test_level_asc.groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original']].agg(list))
test_level_asc.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1
15238899.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]"
15712444.0,[4],[POWER-SUPPLY-UNIT-FAIL]
15723187.0,"[4, 5]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-NO-RECEIVE-T..."
15737103.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]"
15737132.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]"


In [109]:
test_level_asc = pd.DataFrame(test_level_asc.applymap(lambda x: unpack_list(x)))

print(len(test_level_asc))
test_level_asc.head(10)

4327


Unnamed: 0_level_0,alarmlevel,alarmmsg_original
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1
15238899.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL"
15712444.0,4,POWER-SUPPLY-UNIT-FAIL
15723187.0,45,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA..."
15737103.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL"
15737132.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL"
16188560.0,4,BOOTING
16199806.0,4,POWER-SUPPLY-UNIT-FAIL
16201784.0,4557,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA..."
16203726.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA..."
16237988.0,7,OPTICAL-REMOVE


In [110]:
# FastText를 위한 전처리를 완료한 파일

test_level_asc.to_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/test_FT.csv', index = False)

## **FastText**

In [111]:
train = train_level_asc.copy()
test = test_level_asc.copy()

In [112]:
msg_first = train['alarmmsg_original']
msg_first.head()

ticketno
14753084.0    POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...
14771766.0            POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL
14777089.0            POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL
14790052.0                               POWER-SUPPLY-UNIT-FAIL
14879922.0    POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...
Name: alarmmsg_original, dtype: object

In [113]:
# ','로 분리된 단어들을 ' '로 바꿔서 메세지 한개를 단어로 인식할 수 있도록 변경

msg_first = [x.replace(',', ' ') for x in msg_first]
msg_first = [x.replace('-', ' ') for x in msg_first]

In [114]:
tokenized_texts_first = [text.split() for text in msg_first]

In [115]:
len(tokenized_texts_first)

1114

In [116]:
words = tokenized_texts_first

model_first = FastText(words, window = 10, min_count = 0, workers = -1, sg = 1)



In [117]:
# 벡터화 된 단어 확인

model_first.wv['EHT-ERR']

array([-2.98308238e-04, -2.90916720e-03, -1.99219189e-03, -1.02379278e-03,
       -1.42693555e-03,  1.80005166e-03, -1.84901548e-03, -1.15518563e-03,
       -1.59849413e-03,  2.02659308e-03, -2.03274982e-03,  1.34935242e-03,
       -1.82648108e-03, -7.95911648e-04,  1.21864560e-03,  9.87796346e-04,
       -1.51766635e-05,  2.02852930e-03,  1.32197747e-03,  1.11352466e-03,
        3.36987323e-05, -3.55965283e-04, -8.08430428e-04,  1.24013529e-03,
       -5.95350808e-04, -6.18702790e-04, -5.54523489e-04, -3.18957586e-03,
       -1.41281204e-03,  1.36490143e-03,  1.78452348e-03, -1.13594579e-03,
       -8.43517308e-04,  4.47471160e-04,  1.25209952e-03, -1.35960011e-03,
       -1.27366674e-03,  1.81193915e-04, -6.87094172e-04,  1.11164898e-03,
       -1.16194750e-03,  1.81075142e-04,  1.45211595e-03,  2.53696321e-03,
        8.92959593e-04, -1.45842045e-04,  9.90469242e-04,  1.76626409e-03,
        3.33967881e-04,  4.40545962e-04, -1.85886992e-03, -6.03437249e-04,
       -4.10742476e-04,  

In [118]:
first_vectors = []

for tokens in tokenized_texts_first:
    vectors_first = [model_first.wv[word] for word in tokens if word in model_first.wv]
    if vectors_first:
        first_vector = np.mean(vectors_first, axis = 0)
    else:
        first_vector = np.zeros(model_first.vector_size)    # OOV 처리

    first_vectors.append(first_vector)

In [119]:
# 1114개

len(first_vectors)

1114

In [120]:
train['alarmmsg_original'] = first_vectors

In [121]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 1114 entries, 14753084.0 to 21811213.0
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   alarmlevel         1114 non-null   object
 1   alarmmsg_original  1114 non-null   object
 2   root_cause_type    1114 non-null   object
dtypes: object(3)
memory usage: 34.8+ KB


In [122]:
train.iloc[0, 1]

array([-1.20817829e-04, -1.29267061e-03,  2.08125668e-04,  5.92597062e-04,
       -2.97311810e-04,  7.19272357e-05,  6.80404191e-04,  4.14468610e-04,
        1.06011471e-03, -6.14564808e-04,  4.99021495e-04,  3.07530572e-04,
        9.06044559e-04, -6.41589868e-04,  4.94912034e-04, -3.43761669e-04,
       -2.74642487e-04, -3.65164800e-04, -5.25401381e-04,  1.75082168e-04,
       -9.22895444e-04, -6.86691259e-04,  1.19159017e-06,  4.16426075e-04,
       -5.82643261e-04, -7.72113199e-05, -3.92790571e-05,  5.97158214e-04,
       -4.24227292e-05, -7.33651919e-04,  8.99725594e-04, -8.43916496e-04,
       -2.73290818e-04,  3.03285633e-04,  2.62016227e-04,  5.42737136e-04,
        4.95166692e-04, -4.86351637e-04, -2.94965634e-04,  5.77447470e-04,
       -7.50771025e-04,  1.38299758e-04,  1.41923374e-04,  6.94612681e-04,
       -7.24953250e-04, -2.87758739e-04,  2.34114646e-04, -7.32119370e-05,
        1.04732921e-04, -4.62664204e-04, -9.12120959e-05, -2.20984439e-04,
        4.21395031e-04, -

## **Test embeding - 중복 제거 (keep = 'first')**

In [123]:
msg_test_first = test['alarmmsg_original']

In [124]:
msg_test_first = [x.replace(',', ' ') for x in msg_test_first]
msg_test_first = [x.replace('-', ' ') for x in msg_test_first]

In [125]:
tokenized_texts_first_test = [text.split() for text in msg_test_first]

In [126]:
first_vectors_test = []

for tokens in tokenized_texts_first_test:
    vectors_first_test = [model_first.wv[word] for word in tokens if word in model_first.wv]
    if vectors_first_test:
        first_vector_test = np.mean(vectors_first_test, axis = 0)
    else:
        first_vector_test = np.zeros(model_first.vector_size)  # Out-of-vocabulary 단어 처리

    first_vectors_test.append(first_vector_test)

In [127]:
# 4327개

len(first_vectors_test)

4327

In [128]:
test['alarmmsg_original'] = first_vectors_test

In [129]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 4327 entries, 15238899.0 to 26067480.0
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   alarmlevel         4327 non-null   object
 1   alarmmsg_original  4327 non-null   object
dtypes: object(2)
memory usage: 101.4+ KB


## **lightGMB**

In [130]:
class_labels = ['LinkCut', 'PowerFail', 'UnitFail']
class_weights = compute_class_weight('balanced', classes = class_labels, y = train_level_asc['root_cause_type'])

class_weight_dict = {class_labels[i]: class_weights[i] for i in range(len(class_labels))}

In [131]:
y = train_level_asc['root_cause_type']

In [132]:
X_train, X_validation, y_train, y_validation = train_test_split(first_vectors, train['root_cause_type'], stratify = y, test_size = 0.1, random_state = 777)
X_val, X_test, y_val, y_test = train_test_split(X_validation, y_validation, stratify = y_validation, test_size = 0.5, random_state = 777)

In [133]:
lgbm = LGBMClassifier(objective = 'multiclass', class_weight = class_weight_dict, random_state = 777, verbose = -1)

lgbm.fit(X_train, y_train)

# 테스트 데이터로 예측 수행
predict_val = lgbm.predict(X_val)
acc_score = accuracy_score(y_val, predict_val)

predict_test = lgbm.predict(X_test)
acc_score_test = accuracy_score(y_test, predict_test)

print('validation accuracy : {0:.4f}'.format(acc_score))
print('test accuracy : {0:.4f}'.format(acc_score_test))

validation accuracy : 1.0000
test accuracy : 0.9643


In [134]:
test_predict = lgbm.predict(first_vectors_test)
test_predict

array(['PowerFail', 'PowerFail', 'PowerFail', ..., 'LinkCut', 'PowerFail',
       'LinkCut'], dtype=object)

In [135]:
label_sample = label_sample_org.copy()

In [136]:
label_sample['root_cause_type'] = test_predict
label_sample.head()

Unnamed: 0,ticketno,root_cause_type
0,15238899.0,PowerFail
1,15712444.0,PowerFail
2,15723187.0,PowerFail
3,15737103.0,PowerFail
4,15737132.0,PowerFail


In [137]:
label_sample['root_cause_type'].value_counts()/len(label_sample['root_cause_type'])

LinkCut      0.636469
PowerFail    0.329327
UnitFail     0.034204
Name: root_cause_type, dtype: float64