In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSVs loaded below live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install fasttext

In [None]:
# Single seed shared by the TensorFlow, NumPy and stdlib `random` seeding calls in the import cell.
seed_value = 777

In [None]:
import fasttext

import pandas as pd
import numpy as np
import random
import statsmodels as sm
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight


from sklearn.metrics import accuracy_score

from lightgbm import LGBMClassifier

import gensim
from gensim.models import Word2Vec
from gensim.models import FastText

import joblib

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization
from tensorflow.keras.layers import MultiHeadAttention, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D
from tensorflow.keras import Input, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy, CategoricalCrossentropy

from tensorflow.keras.models import save_model
from tensorflow.keras.models import load_model

# Seed TensorFlow, NumPy and the stdlib RNG from the shared seed_value.
# NOTE: the fastText training call further down is not given this seed.
tf.random.set_seed(seed_value)
np.random.seed(seed_value)
random.seed(seed_value)

In [None]:
# Load the raw files from the mounted Drive folder: the label sample
# (the frame that later receives the predictions), the train set and the test set.
label_sample_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/Q2_label_sample.csv')
train_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/Q2_train.csv')
test_org = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/Q2_test.csv')

In [None]:
# Sanity check: (rows, columns) of each raw frame, in load order.
for raw_frame in (label_sample_org, train_org, test_org):
    print(raw_frame.shape)

(4327, 2)
(9322, 13)
(37671, 12)


In [None]:
# Load train_map_3.csv / test_map_3.csv; every later step works from these two frames.
# NOTE(review): presumably the raw data with remapped alarm messages -- confirm how these files were produced.
train_change_msg= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/train_map_3.csv')
test_change_msg = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/백지짱_분야2/Submit/data/test_map_3.csv')

In [None]:
# Sanity check: (rows, columns) of the two mapped frames.
for mapped_frame in (train_change_msg, test_change_msg):
    print(mapped_frame.shape)

(9322, 13)
(37671, 12)


# **중복 제거**

In [None]:
def different_value_count(df, column):
    """Print how many rows of ``df[column]`` contain more than one distinct value.

    Every cell of ``column`` must be an iterable of hashable items (for
    example a list ``[a, b, c]``). A row is counted when its items are not
    all equal, i.e. when ``a == b == c`` does not hold.

    Args:
        df: DataFrame to inspect.
        column: Name of the column whose cells are iterables.

    Returns:
        None. The number of mixed rows is written to stdout.
    """
    mixed_rows = [
        label
        for label in df.index
        if len(set(df.loc[label, column])) > 1
    ]
    print(len(mixed_rows))

In [None]:
def different_value_df(df, column):
    """Return the rows of ``df[column]`` that contain more than one distinct value.

    Every cell of ``column`` must be an iterable of hashable items. The index
    labels of rows whose items are not all equal are collected into a
    ``'ticketno'`` column and inner-merged with ``df[column]`` on
    ``'ticketno'``, so ``df`` has to expose ``'ticketno'`` as its index name.

    Args:
        df: DataFrame indexed by ``ticketno``.
        column: Name of the column whose cells are iterables.

    Returns:
        DataFrame with the columns ``ticketno`` and ``column``, one row per
        mixed ticket.
    """
    mixed_labels = [
        label
        for label in df.index
        if len(set(df.loc[label, column])) > 1
    ]
    mixed_frame = pd.DataFrame(mixed_labels, columns=['ticketno'])
    return pd.merge(mixed_frame, df[column], how='inner', on='ticketno')

In [None]:
def replace_root_cause_type(row):
    """Collapse a root-cause value to a single class name.

    The known class names are looked up in ``row`` with ``in`` (substring
    test for a string, membership test for a list) in a fixed priority
    order: 'PowerFail', then 'UnitFail', then 'LinkCut'. The first one found
    is returned, so 'PowerFail,PowerFail,PowerFail' becomes 'PowerFail'.

    Args:
        row: Value to collapse, typically a comma-joined string of labels.

    Returns:
        The first matching class name, or ``row`` unchanged when none of the
        three names occurs in it.
    """
    for label in ('PowerFail', 'UnitFail', 'LinkCut'):
        if label in row:
            return label
    return row

In [None]:
def unpack_list(lst):
    """
    Convert a list into a single string.

    Args:
        lst (list): list to convert to a string

    Returns:
        The elements of the list, each converted with str(), joined by ','
    """

    return ','.join(map(str, lst))

## **train, test divide**

In [None]:
# Build a new frame that holds only the columns needed downstream

col = ['ticketno', 'alarmtime', 'alarmlevel', 'alarmmsg_original','site', 'root_cause_type']
train_divide = train_change_msg[col]
train_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original,site,root_cause_type
0,21122633.0,2022-12-01 00:02:24+09:00,5,ETHERNET-ERROR,ACEN,LinkCut
1,21122633.0,2022-12-01 00:17:15+09:00,5,ETHERNET-ERROR,ACEN,LinkCut
2,21122633.0,2022-12-01 00:32:11+09:00,5,ETHERNET-ERROR,ACEN,LinkCut
3,21122633.0,2022-12-01 00:47:10+09:00,5,ETHERNET-ERROR,ACEN,LinkCut
4,21122633.0,2022-12-01 01:02:24+09:00,5,ETHERNET-ERROR,ACEN,LinkCut


In [None]:
# Sort by ticketno, then alarmtime, then alarmlevel (site and root_cause_type act as tie-breakers)

train_divide = train_divide.sort_values(['ticketno', 'alarmtime', 'alarmlevel', 'site', 'root_cause_type'])
train_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original,site,root_cause_type
1085,14753084.0,2022-12-02 13:13:48+09:00,4,POWER-SUPPLY-UNIT-FAIL,ABNE,PowerFail
1086,14753084.0,2022-12-02 13:13:48+09:00,4,POWER-SUPPLY-UNIT-FAIL,ABNE,PowerFail
1087,14753084.0,2022-12-02 13:13:48+09:00,7,ETHERNET-LINK-FAIL,ABNE,PowerFail
1088,14753084.0,2022-12-02 13:13:48+09:00,7,ETHERNET-LINK-FAIL,ABNE,PowerFail
1089,14753084.0,2022-12-02 13:13:48+09:00,7,ETHERNET-LINK-FAIL,ABNE,PowerFail


In [None]:
# Same column selection as for train, minus the root_cause_type target
test_col = ['ticketno', 'alarmtime', 'alarmlevel', 'alarmmsg_original','site']

test_divide = test_change_msg[test_col]
test_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original,site
0,21812391.0,2022-12-25 00:02:16+09:00,5,LOSS-OF-SIGNAL,AEAQ
1,21775988.0,2022-12-25 00:02:51+09:00,5,LOSS-OF-SIGNAL,ADZW
2,21792259.0,2022-12-25 00:03:22+09:00,4,ALARM-RECEIVE-POWER-HIGH,AECE
3,21812412.0,2022-12-25 00:03:33+09:00,5,BATTERY-ENVIRONMENT-FAIL,ACCN
4,21812417.0,2022-12-25 00:03:39+09:00,5,LOSS-OF-SIGNAL,AEAQ


In [None]:
# Sort by ticketno, then alarmtime, then alarmlevel (site acts as the tie-breaker)
test_divide = test_divide.sort_values(['ticketno', 'alarmtime', 'alarmlevel','site'])
test_divide.head()

Unnamed: 0,ticketno,alarmtime,alarmlevel,alarmmsg_original,site
7436,15238899.0,2022-12-26 13:29:12+09:00,4,POWER-SUPPLY-UNIT-FAIL,ABMY
7437,15238899.0,2022-12-26 13:29:12+09:00,4,POWER-SUPPLY-UNIT-FAIL,ABMY
7438,15238899.0,2022-12-26 13:29:12+09:00,7,ETHERNET-LINK-FAIL,ABMY
7439,15238899.0,2022-12-26 13:29:12+09:00,7,ETHERNET-LINK-FAIL,ABMY
2769,15712444.0,2022-12-25 16:34:07+09:00,4,POWER-SUPPLY-UNIT-FAIL,AEOK


## **train**

In [None]:
# Drop rows that are exact duplicates across the selected columns (alarmtime is
# left out, so repeated alarms of one ticket collapse), keeping the last occurrence

column = ['ticketno','alarmlevel','alarmmsg_original', 'site', 'root_cause_type']
train_last = train_divide[column].drop_duplicates(keep = 'last')
train_last.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site,root_cause_type
1086,14753084.0,4,POWER-SUPPLY-UNIT-FAIL,ABNE,PowerFail
1089,14753084.0,7,ETHERNET-LINK-FAIL,ABNE,PowerFail
1090,14753084.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,ABNE,PowerFail
1727,14771766.0,4,POWER-SUPPLY-UNIT-FAIL,ABLB,PowerFail
1729,14771766.0,7,ETHERNET-LINK-FAIL,ABLB,PowerFail


In [None]:
# Group by ticketno: every remaining column becomes the list of that ticket's values

train_last = (
    train_last
    .groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original', 'site', 'root_cause_type']]
    .agg(list)
)
train_last.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,"[4, 7, 5]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL, E...","[ABNE, ABNE, ABNE]","[PowerFail, PowerFail, PowerFail]"
14771766.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ABLB, ABLB]","[PowerFail, PowerFail]"
14777089.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ABLZ, ABLZ]","[PowerFail, PowerFail]"
14790052.0,[4],[POWER-SUPPLY-UNIT-FAIL],[ACRX],[PowerFail]
14879922.0,"[4, 7, 5]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL, E...","[ABNE, ABNE, ABNE]","[PowerFail, PowerFail, PowerFail]"


In [None]:
# Largest number of (de-duplicated) alarms attached to a single training ticket
max_count_ticketno = max(len(levels) for levels in train_last['alarmlevel'])
max_count_ticketno

10

In [None]:
# Check that root_cause_type never changes within a ticket (prints the number of tickets where it does)
different_value_count(train_last, 'root_cause_type')

0


In [None]:
# Check whether site changes within a ticket (prints the number of tickets where it does)

different_value_count(train_last, 'site')

8


In [None]:
# Tickets whose alarms come from more than one site

different_value_df(train_last, 'site')

Unnamed: 0,ticketno,site
0,15885517.0,"[AALT, ADTP, ADPA, AALT, ADTP, ADPA, AALT, ADT..."
1,17872258.0,"[ADMK, AFSV]"
2,18390914.0,"[AAPL, AFIC, AFIC, AAPL, AAPL]"
3,18392602.0,"[AFIC, AAPL, AAPL, AFIC, AFIC]"
4,18522636.0,"[ADRW, ADWR, ADWR]"
5,18843859.0,"[AEMN, AEMN, AECB, AEMN, AEMN]"
6,21257521.0,"[ADML, ADML, ADML, AEWR, ADML, AEBW, AEBW, AEBW]"
7,21667140.0,"[AEJN, AEJN, AEJN, AEPT, AEPT, AEPT]"


In [None]:
# Flatten every list cell into one comma-joined string
train_last = train_last.applymap(unpack_list)

print(train_last.shape[0])
train_last.head(10)

1114


Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,475,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"
14771766.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLB,ABLB","PowerFail,PowerFail"
14777089.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLZ,ABLZ","PowerFail,PowerFail"
14790052.0,4,POWER-SUPPLY-UNIT-FAIL,ACRX,PowerFail
14879922.0,475,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"
14901137.0,475,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"
14919180.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABOJ,ABOJ","PowerFail,PowerFail"
14922559.0,445,"FAN-FAIL,48V-FAIL,48V-FAIL","AEIK,AEIK,AEIK","PowerFail,PowerFail,PowerFail"
14999487.0,77,"OPTICAL-REMOVE,OPTICAL-LOSS-OF-SIGNAL","ADWN,ADWN","UnitFail,UnitFail"
15036874.0,7,NON-VOLATILE-RANDOM-ACCESS-MEMORY-FAIL,ADTI,UnitFail


In [None]:
# Move ticketno from the index back into a regular column so the loop below can read it per row
train_last_reset = train_last.reset_index()

train_last_reset.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site,root_cause_type
0,14753084.0,475,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"
1,14771766.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLB,ABLB","PowerFail,PowerFail"
2,14777089.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLZ,ABLZ","PowerFail,PowerFail"
3,14790052.0,4,POWER-SUPPLY-UNIT-FAIL,ACRX,PowerFail
4,14879922.0,475,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"


In [None]:
# One dict per individual alarm; turned into a DataFrame in a single step below
new_rows = []

# Walk the per-ticket rows and split each comma-joined cell back into its items
for _, row in train_last_reset.iterrows():
    alarmlevels, alarmmsgs, sites, root_cause_types = (
        row[field].split(',')
        for field in ('alarmlevel', 'alarmmsg_original', 'site', 'root_cause_type')
    )

    # The four lists are parallel: position i of each one belongs to the same alarm.
    # split() yields strings, so 'alarmlevel' is text from here on.
    for i, level in enumerate(alarmlevels):
        new_rows.append({
            'ticketno': row['ticketno'],
            'alarmlevel': level,
            'alarmmsg_original': alarmmsgs[i],
            'site': sites[i],
            'root_cause_type': root_cause_types[i],
        })

# Build the long-format frame (one row per alarm)
train_level_asc = pd.DataFrame(new_rows)

# Preview the result
train_level_asc.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site,root_cause_type
0,14753084.0,4,POWER-SUPPLY-UNIT-FAIL,ABNE,PowerFail
1,14753084.0,7,ETHERNET-LINK-FAIL,ABNE,PowerFail
2,14753084.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,ABNE,PowerFail
3,14771766.0,4,POWER-SUPPLY-UNIT-FAIL,ABLB,PowerFail
4,14771766.0,7,ETHERNET-LINK-FAIL,ABLB,PowerFail


In [None]:
# Order each ticket's alarms by alarm level (a string column at this point, so the
# comparison is lexicographic), then by site and root_cause_type
train_level_asc = train_level_asc.sort_values(
    by=['ticketno', 'alarmlevel', 'site', 'root_cause_type']
)
train_level_asc.head(10)

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site,root_cause_type
0,14753084.0,4,POWER-SUPPLY-UNIT-FAIL,ABNE,PowerFail
2,14753084.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,ABNE,PowerFail
1,14753084.0,7,ETHERNET-LINK-FAIL,ABNE,PowerFail
3,14771766.0,4,POWER-SUPPLY-UNIT-FAIL,ABLB,PowerFail
4,14771766.0,7,ETHERNET-LINK-FAIL,ABLB,PowerFail
5,14777089.0,4,POWER-SUPPLY-UNIT-FAIL,ABLZ,PowerFail
6,14777089.0,7,ETHERNET-LINK-FAIL,ABLZ,PowerFail
7,14790052.0,4,POWER-SUPPLY-UNIT-FAIL,ACRX,PowerFail
8,14879922.0,4,POWER-SUPPLY-UNIT-FAIL,ABNE,PowerFail
10,14879922.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,ABNE,PowerFail


In [None]:
# Collapse back to one row per ticketno; the lists now follow the level-sorted order
train_level_asc = (
    train_level_asc
    .groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original', 'site', 'root_cause_type']]
    .agg(list)
)
train_level_asc.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,"[4, 5, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-NO-RECEIVE-T...","[ABNE, ABNE, ABNE]","[PowerFail, PowerFail, PowerFail]"
14771766.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ABLB, ABLB]","[PowerFail, PowerFail]"
14777089.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ABLZ, ABLZ]","[PowerFail, PowerFail]"
14790052.0,[4],[POWER-SUPPLY-UNIT-FAIL],[ACRX],[PowerFail]
14879922.0,"[4, 5, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-NO-RECEIVE-T...","[ABNE, ABNE, ABNE]","[PowerFail, PowerFail, PowerFail]"


In [None]:
# Flatten every list cell into one comma-joined string
train_level_asc = train_level_asc.applymap(unpack_list)

print(train_level_asc.shape[0])
train_level_asc.head(10)

1114


Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"
14771766.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLB,ABLB","PowerFail,PowerFail"
14777089.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLZ,ABLZ","PowerFail,PowerFail"
14790052.0,4,POWER-SUPPLY-UNIT-FAIL,ACRX,PowerFail
14879922.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"
14901137.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNE,ABNE,ABNE","PowerFail,PowerFail,PowerFail"
14919180.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABOJ,ABOJ","PowerFail,PowerFail"
14922559.0,445,"FAN-FAIL,48V-FAIL,48V-FAIL","AEIK,AEIK,AEIK","PowerFail,PowerFail,PowerFail"
14999487.0,77,"OPTICAL-REMOVE,OPTICAL-LOSS-OF-SIGNAL","ADWN,ADWN","UnitFail,UnitFail"
15036874.0,7,NON-VOLATILE-RANDOM-ACCESS-MEMORY-FAIL,ADTI,UnitFail


In [None]:
# root_cause_type never changes within a ticket (checked above), so reduce the
# comma-joined repetition to a single class name
train_level_asc['root_cause_type'] = train_level_asc['root_cause_type'].map(replace_root_cause_type)
train_level_asc.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNE,ABNE,ABNE",PowerFail
14771766.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLB,ABLB",PowerFail
14777089.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLZ,ABLZ",PowerFail
14790052.0,4,POWER-SUPPLY-UNIT-FAIL,ACRX,PowerFail
14879922.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNE,ABNE,ABNE",PowerFail


## **test**

In [None]:
# Drop rows that are exact duplicates across the selected columns (alarmtime is
# left out), keeping the last occurrence -- same rule as for the training set

col = ['ticketno', 'alarmlevel', 'alarmmsg_original', 'site']
test_last = test_divide[col].drop_duplicates(keep = 'last')
test_last.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site
7437,15238899.0,4,POWER-SUPPLY-UNIT-FAIL,ABMY
7439,15238899.0,7,ETHERNET-LINK-FAIL,ABMY
2770,15712444.0,4,POWER-SUPPLY-UNIT-FAIL,AEOK
5965,15723187.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,ACTL
6083,15723187.0,4,POWER-SUPPLY-UNIT-FAIL,ACTL


In [None]:
# Group by ticketno: every remaining column becomes the list of that ticket's values
test_last = (
    test_last
    .groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original', 'site']]
    .agg(list)
)
test_last.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15238899.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ABMY, ABMY]"
15712444.0,[4],[POWER-SUPPLY-UNIT-FAIL],[AEOK]
15723187.0,"[5, 4]","[ETHERNET-NO-RECEIVE-TRAFFIC, POWER-SUPPLY-UNI...","[ACTL, ACTL]"
15737103.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ACQL, ACQL]"
15737132.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ACQL, ACQL]"


In [None]:
# Largest number of (de-duplicated) alarms attached to a single test ticket
max_count_ticketno = max(len(levels) for levels in test_last['alarmlevel'])
max_count_ticketno

42

In [None]:
# Check whether site changes within a ticket (prints the number of tickets where it does)
different_value_count(test_last, 'site')

447


In [None]:
# Tickets whose alarms come from more than one site
different_value_df(test_last, 'site')

Unnamed: 0,ticketno,site
0,18602743.0,"[AAPI, AAPI, AFIC, AAPI, AEYX, AAPI, AEYX, AAP..."
1,21819680.0,"[AEAZ, AEAA, AEAA, AEAZ, AEAA]"
2,21827851.0,"[ABFM, ADFL]"
3,21828572.0,"[AADN, AADO, AADN, AADO]"
4,21829232.0,"[AEPG, ADUO]"
...,...,...
442,22008868.0,"[ADKU, ADTE]"
443,22008913.0,"[ACAR, ACCK]"
444,22009895.0,"[ADDF, ADFE, ADFE, ADFE, ADFE, ADDF]"
445,22014653.0,"[AEBH, AEBH, AEBH, ACBY, ACBY]"


In [None]:
# Flatten every list cell into one comma-joined string
test_last = test_last.applymap(unpack_list)

print(test_last.shape[0])
test_last.head(10)

4327


Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15238899.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABMY,ABMY"
15712444.0,4,POWER-SUPPLY-UNIT-FAIL,AEOK
15723187.0,54,"ETHERNET-NO-RECEIVE-TRAFFIC,POWER-SUPPLY-UNIT-...","ACTL,ACTL"
15737103.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ACQL,ACQL"
15737132.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ACQL,ACQL"
16188560.0,4,BOOTING,AEHN
16199806.0,4,POWER-SUPPLY-UNIT-FAIL,ADFU
16201784.0,4755,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE...","ABNX,ABNX,ABNX,ABNX"
16203726.0,475,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL,ETHE...","ABOE,ABOE,ABOE"
16237988.0,7,OPTICAL-REMOVE,ABRV


In [None]:
# Move ticketno from the index back into a regular column so the loop below can read it per row
test_last_reset = test_last.reset_index()

test_last_reset.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site
0,15238899.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABMY,ABMY"
1,15712444.0,4,POWER-SUPPLY-UNIT-FAIL,AEOK
2,15723187.0,54,"ETHERNET-NO-RECEIVE-TRAFFIC,POWER-SUPPLY-UNIT-...","ACTL,ACTL"
3,15737103.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ACQL,ACQL"
4,15737132.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ACQL,ACQL"


In [None]:
# One dict per individual alarm; turned into a DataFrame in a single step below
new_rows = []

# Walk the per-ticket rows and split each comma-joined cell back into its items
for _, row in test_last_reset.iterrows():
    alarmlevels, alarmmsgs, sites = (
        row[field].split(',')
        for field in ('alarmlevel', 'alarmmsg_original', 'site')
    )

    # The three lists are parallel: position i of each one belongs to the same alarm.
    # split() yields strings, so 'alarmlevel' is text from here on.
    for i, level in enumerate(alarmlevels):
        new_rows.append({
            'ticketno': row['ticketno'],
            'alarmlevel': level,
            'alarmmsg_original': alarmmsgs[i],
            'site': sites[i],
        })

# Build the long-format frame (one row per alarm)
test_level_asc = pd.DataFrame(new_rows)

# Preview the result
test_level_asc.head()

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site
0,15238899.0,4,POWER-SUPPLY-UNIT-FAIL,ABMY
1,15238899.0,7,ETHERNET-LINK-FAIL,ABMY
2,15712444.0,4,POWER-SUPPLY-UNIT-FAIL,AEOK
3,15723187.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,ACTL
4,15723187.0,4,POWER-SUPPLY-UNIT-FAIL,ACTL


In [None]:
# Order each ticket's alarms by alarm level (a string column at this point, so the
# comparison is lexicographic), then by site
test_level_asc = test_level_asc.sort_values(
    by=['ticketno', 'alarmlevel', 'site']
)
test_level_asc.head(10)

Unnamed: 0,ticketno,alarmlevel,alarmmsg_original,site
0,15238899.0,4,POWER-SUPPLY-UNIT-FAIL,ABMY
1,15238899.0,7,ETHERNET-LINK-FAIL,ABMY
2,15712444.0,4,POWER-SUPPLY-UNIT-FAIL,AEOK
4,15723187.0,4,POWER-SUPPLY-UNIT-FAIL,ACTL
3,15723187.0,5,ETHERNET-NO-RECEIVE-TRAFFIC,ACTL
5,15737103.0,4,POWER-SUPPLY-UNIT-FAIL,ACQL
6,15737103.0,7,ETHERNET-LINK-FAIL,ACQL
7,15737132.0,4,POWER-SUPPLY-UNIT-FAIL,ACQL
8,15737132.0,7,ETHERNET-LINK-FAIL,ACQL
9,16188560.0,4,BOOTING,AEHN


In [None]:
# Collapse back to one row per ticketno; the lists now follow the level-sorted order
test_level_asc = (
    test_level_asc
    .groupby(['ticketno'])[['alarmlevel', 'alarmmsg_original', 'site']]
    .agg(list)
)
test_level_asc.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15238899.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ABMY, ABMY]"
15712444.0,[4],[POWER-SUPPLY-UNIT-FAIL],[AEOK]
15723187.0,"[4, 5]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-NO-RECEIVE-T...","[ACTL, ACTL]"
15737103.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ACQL, ACQL]"
15737132.0,"[4, 7]","[POWER-SUPPLY-UNIT-FAIL, ETHERNET-LINK-FAIL]","[ACQL, ACQL]"


In [None]:
# Flatten every list cell into one comma-joined string
test_level_asc = test_level_asc.applymap(unpack_list)

print(test_level_asc.shape[0])
test_level_asc.head(10)

4327


Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15238899.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABMY,ABMY"
15712444.0,4,POWER-SUPPLY-UNIT-FAIL,AEOK
15723187.0,45,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ACTL,ACTL"
15737103.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ACQL,ACQL"
15737132.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ACQL,ACQL"
16188560.0,4,BOOTING,AEHN
16199806.0,4,POWER-SUPPLY-UNIT-FAIL,ADFU
16201784.0,4557,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNX,ABNX,ABNX,ABNX"
16203726.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABOE,ABOE,ABOE"
16237988.0,7,OPTICAL-REMOVE,ABRV


# **preprocessing**

In [None]:
# Alarm messages of the training set: one comma-joined string per ticket
msg_train = train_level_asc['alarmmsg_original']
print(len(msg_train))

1114


ticketno
14753084.0    POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...
14771766.0            POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL
14777089.0            POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL
14790052.0                               POWER-SUPPLY-UNIT-FAIL
14879922.0    POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...
                                    ...                        
21792877.0                               OPTICAL-LOSS-OF-SIGNAL
21793984.0                               OPTICAL-LOSS-OF-SIGNAL
21799077.0    DATA-COMMUNICATE-CHANNEL-FAIL,PSEUDOWIRE-LOSS-...
21809789.0                               OPTICAL-LOSS-OF-SIGNAL
21811213.0                               OPTICAL-LOSS-OF-SIGNAL
Name: alarmmsg_original, Length: 1114, dtype: object

In [None]:
# Replace the ',' that separated the alarm messages of a ticket with a space
msg_train = [ticket_msgs.replace(',', ' ') for ticket_msgs in msg_train]
len(msg_train)

1114

In [None]:
# Alarm messages of the test set: one comma-joined string per ticket
msg_test = test_level_asc['alarmmsg_original']
print(len(msg_test))

4327


ticketno
15238899.0            POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL
15712444.0                               POWER-SUPPLY-UNIT-FAIL
15723187.0    POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...
15737103.0            POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL
15737132.0            POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL
                                    ...                        
22015278.0                             ALARM-RECEIVE-POWER-HIGH
22015300.0                                       LOSS-OF-SIGNAL
23818326.0      ETHERNET-FAIL,SERVER-SIGNAL-FAIL,LOSS-OF-SIGNAL
23819373.0                                   SERVER-SIGNAL-FAIL
26067480.0                                      LOSS-OF-CONNECT
Name: alarmmsg_original, Length: 4327, dtype: object

In [None]:
# Replace the ',' that separated the alarm messages of a ticket with a space
msg_test = [ticket_msgs.replace(',', ' ') for ticket_msgs in msg_test]
len(msg_test)

4327

# **train**

In [None]:
# Preview the per-ticket training frame before the fastText-specific text clean-up
train_level_asc.head()

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNE,ABNE,ABNE",PowerFail
14771766.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLB,ABLB",PowerFail
14777089.0,47,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-LINK-FAIL","ABLZ,ABLZ",PowerFail
14790052.0,4,POWER-SUPPLY-UNIT-FAIL,ACRX,PowerFail
14879922.0,457,"POWER-SUPPLY-UNIT-FAIL,ETHERNET-NO-RECEIVE-TRA...","ABNE,ABNE,ABNE",PowerFail


In [None]:
def remove_chars(text):
    """Return ``text`` with every '-' and every ',' replaced by a single space.

    Args:
        text (str): string to clean.

    Returns:
        str: the cleaned string; its length is unchanged because each
        character is swapped one-for-one.
    """
    return text.translate(str.maketrans({'-': ' ', ',': ' '}))

# Clean every alarm-message string of the training frame with remove_chars
train_level_asc['alarmmsg_original'] = train_level_asc['alarmmsg_original'].map(remove_chars)

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,457,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...,"ABNE,ABNE,ABNE",PowerFail
14771766.0,47,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL,"ABLB,ABLB",PowerFail
14777089.0,47,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL,"ABLZ,ABLZ",PowerFail
14790052.0,4,POWER SUPPLY UNIT FAIL,ACRX,PowerFail
14879922.0,457,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...,"ABNE,ABNE,ABNE",PowerFail
...,...,...,...,...
21792877.0,7,OPTICAL LOSS OF SIGNAL,ABZD,LinkCut
21793984.0,7,OPTICAL LOSS OF SIGNAL,ADOK,LinkCut
21799077.0,557,DATA COMMUNICATE CHANNEL FAIL PSEUDOWIRE LOSS ...,"ABUY,ABUY,ABUY",LinkCut
21809789.0,7,OPTICAL LOSS OF SIGNAL,ADKA,LinkCut


In [None]:
def convert_to_label(row):
    """Prefix a class name with the ``__label__`` marker used in the fastText input file.

    Args:
        row: class name (any value; it is formatted into the string).

    Returns:
        str: ``'__label__'`` followed by the formatted value.
    """
    return '__label__{}'.format(row)

# Rewrite the 'root_cause_type' column into the __label__-prefixed form
train_level_asc['root_cause_type'] = train_level_asc['root_cause_type'].map(convert_to_label)

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site,root_cause_type
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
14753084.0,457,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...,"ABNE,ABNE,ABNE",__label__PowerFail
14771766.0,47,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL,"ABLB,ABLB",__label__PowerFail
14777089.0,47,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL,"ABLZ,ABLZ",__label__PowerFail
14790052.0,4,POWER SUPPLY UNIT FAIL,ACRX,__label__PowerFail
14879922.0,457,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...,"ABNE,ABNE,ABNE",__label__PowerFail
...,...,...,...,...
21792877.0,7,OPTICAL LOSS OF SIGNAL,ABZD,__label__LinkCut
21793984.0,7,OPTICAL LOSS OF SIGNAL,ADOK,__label__LinkCut
21799077.0,557,DATA COMMUNICATE CHANNEL FAIL PSEUDOWIRE LOSS ...,"ABUY,ABUY,ABUY",__label__LinkCut
21809789.0,7,OPTICAL LOSS OF SIGNAL,ADKA,__label__LinkCut


In [None]:
# Label column first, text second, so each exported line starts with the __label__ token
col = ['root_cause_type', 'alarmmsg_original']
input_train_ft = train_level_asc[col]      # training input for fastText
input_train_ft.head()

Unnamed: 0_level_0,root_cause_type,alarmmsg_original
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1
14753084.0,__label__PowerFail,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...
14771766.0,__label__PowerFail,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL
14777089.0,__label__PowerFail,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL
14790052.0,__label__PowerFail,POWER SUPPLY UNIT FAIL
14879922.0,__label__PowerFail,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...


In [None]:
# Save as a txt file (fastText is trained from a text file); tab-separated,
# without the index or a header row, written relative to the working directory
output_file_path = 'input_train_ft.txt'
input_train_ft.to_csv(output_file_path, sep = '\t', index = False, header = False)

# **test**

In [None]:
# NOTE: redefinition of remove_chars with the same behaviour as the definition
# in the train section; kept so this cell still runs on its own.
def remove_chars(text):
    """Return ``text`` with every '-' and every ',' replaced by a single space.

    Args:
        text (str): string to clean.

    Returns:
        str: the cleaned string; its length is unchanged because each
        character is swapped one-for-one.
    """
    return text.translate(str.maketrans({'-': ' ', ',': ' '}))

# Clean every alarm-message string of the test frame with remove_chars
test_level_asc['alarmmsg_original'] = test_level_asc['alarmmsg_original'].map(remove_chars)

Unnamed: 0_level_0,alarmlevel,alarmmsg_original,site
ticketno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15238899.0,47,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL,"ABMY,ABMY"
15712444.0,4,POWER SUPPLY UNIT FAIL,AEOK
15723187.0,45,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...,"ACTL,ACTL"
15737103.0,47,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL,"ACQL,ACQL"
15737132.0,47,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL,"ACQL,ACQL"
...,...,...,...
22015278.0,4,ALARM RECEIVE POWER HIGH,AEMD
22015300.0,5,LOSS OF SIGNAL,AEAQ
23818326.0,455,ETHERNET FAIL SERVER SIGNAL FAIL LOSS OF SIGNAL,"AEKU,AEKU,AEKU"
23819373.0,5,SERVER SIGNAL FAIL,AEKU


In [None]:
# The test export holds the text column only (there is no label to write)
col = ['alarmmsg_original']
output_test_last = test_level_asc[col]
output_test_last.head()

Unnamed: 0_level_0,alarmmsg_original
ticketno,Unnamed: 1_level_1
15238899.0,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL
15712444.0,POWER SUPPLY UNIT FAIL
15723187.0,POWER SUPPLY UNIT FAIL ETHERNET NO RECEIVE TRA...
15737103.0,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL
15737132.0,POWER SUPPLY UNIT FAIL ETHERNET LINK FAIL


In [None]:
# Save as a txt file: one ticket per line, in the row order of test_level_asc,
# without the index or a header row, written relative to the working directory
output_file_path = 'output_test_ft.txt'
output_test_last.to_csv(output_file_path, sep = '\t', index = False, header = False)

# **fasttext - 모델 학습**

In [None]:
import fasttext

# Path of the training file. It is the same relative path the export cell wrote
# to ('input_train_ft.txt'), so training finds the file whatever the working
# directory is, instead of only when the notebook runs from /content.
train_data_path = 'input_train_ft.txt'

# Train the supervised text classifier with fasttext.train_supervised():
# 100 epochs, learning rate 1.0, word bigrams as extra features.
# NOTE(review): no seed is passed here, so the seeds set in the import cell do
# not apply to fastText -- confirm whether run-to-run reproducibility is needed.
model = fasttext.train_supervised(input = train_data_path, epoch = 100, lr = 1.0, wordNgrams = 2)

print(model)

# **fasttext - 모델 예측**

In [None]:
import fasttext


# Path of the test file. It is the same relative path the export cell wrote to
# ('output_test_ft.txt'), so prediction finds the file whatever the working
# directory is, instead of only when the notebook runs from /content.
test_data_path = 'output_test_ft.txt'

# Read the test messages, one ticket per line
with open(test_data_path, 'r', encoding='utf-8') as test_file:
    lines = test_file.readlines()

# Predict the top label for every line and collect the results in file order
predictions = []
for line in lines:
    line = line.strip()  # model.predict() rejects text containing a newline
    # To get the three best labels with their probabilities instead,
    # call model.predict(line, k=3).
    pred_label = model.predict(line)[0][0]
    predictions.append(pred_label)

# Print every prediction with its 1-based position
for idx, pred_label in enumerate(predictions):
    print(f"Test Sample {idx+1} Predicted Label: {pred_label}")

Test Sample 1 Predicted Label: __label__PowerFail
Test Sample 2 Predicted Label: __label__PowerFail
Test Sample 3 Predicted Label: __label__PowerFail
Test Sample 4 Predicted Label: __label__PowerFail
Test Sample 5 Predicted Label: __label__PowerFail
Test Sample 6 Predicted Label: __label__UnitFail
Test Sample 7 Predicted Label: __label__PowerFail
Test Sample 8 Predicted Label: __label__PowerFail
Test Sample 9 Predicted Label: __label__PowerFail
Test Sample 10 Predicted Label: __label__UnitFail
Test Sample 11 Predicted Label: __label__PowerFail
Test Sample 12 Predicted Label: __label__UnitFail
Test Sample 13 Predicted Label: __label__UnitFail
Test Sample 14 Predicted Label: __label__PowerFail
Test Sample 15 Predicted Label: __label__PowerFail
Test Sample 16 Predicted Label: __label__UnitFail
Test Sample 17 Predicted Label: __label__UnitFail
Test Sample 18 Predicted Label: __label__UnitFail
Test Sample 19 Predicted Label: __label__UnitFail
Test Sample 20 Predicted Label: __label__UnitFai

In [None]:
# Store the predictions in a DataFrame: 1-based sample number plus predicted label
pred_label = pd.DataFrame({
    "Test Sample": range(1, len(predictions) + 1),
    "Predicted Label": predictions,
})

Unnamed: 0,Test Sample,Predicted Label
0,1,__label__PowerFail
1,2,__label__PowerFail
2,3,__label__PowerFail
3,4,__label__PowerFail
4,5,__label__PowerFail
...,...,...
4322,4323,__label__LinkCut
4323,4324,__label__LinkCut
4324,4325,__label__LinkCut
4325,4326,__label__LinkCut


In [None]:
# Strip the '__label__' prefix so only the class name remains
pred_label['Predicted Label'] = pred_label['Predicted Label'].map(
    lambda label: label.replace('__label__', '')
)
pred_label.head()

Unnamed: 0,Test Sample,Predicted Label
0,1,PowerFail
1,2,PowerFail
2,3,PowerFail
3,4,PowerFail
4,5,PowerFail


In [None]:
# Work on a copy so the originally loaded label sample stays untouched
label_sample = label_sample_org.copy()

In [None]:
# Attach the predictions by ticketno rather than by row position.
# The predictions follow the row order of test_level_asc (indexed by ticketno),
# because the test file was exported from it; a plain column assignment would
# only be correct if label_sample happened to list its tickets in that same order.
pred_by_ticket = pd.Series(
    pred_label['Predicted Label'].to_numpy(),
    index=test_level_asc.index,
)
label_sample['root_cause_type'] = label_sample['ticketno'].map(pred_by_ticket)

# Fail loudly rather than silently keep tickets without a predicted label
missing_count = int(label_sample['root_cause_type'].isna().sum())
if missing_count:
    raise ValueError(
        f'{missing_count} ticketno value(s) in label_sample have no prediction'
    )

# Show the result
label_sample.head()

Unnamed: 0,ticketno,root_cause_type
0,15238899.0,PowerFail
1,15712444.0,PowerFail
2,15723187.0,PowerFail
3,15737103.0,PowerFail
4,15737132.0,PowerFail


In [None]:
# Share of each predicted class among all rows of label_sample
label_sample['root_cause_type'].value_counts()/len(label_sample['root_cause_type'])

LinkCut      0.658655
PowerFail    0.308990
UnitFail     0.032355
Name: root_cause_type, dtype: float64