### **Data Generation**

In [None]:
import pandas as pd
import numpy as np

#np.random.seed(3)

# Seeded generator and default sizes for the synthetic data set.
# Note: the generation cell further down re-assigns all of these before calling the generator.
rng = np.random.default_rng(3)
normal_data_count = 3001  # the generator adds normal_data_count - 1 anomaly-free devices
num_devices = 1000  # number of devices that receive an anomalous sequence
num_stations = 100  # size of the station pool ('Station-1' .. 'Station-100')
num_groups = 4  # each group has its own reference ("normal") station sequence
max_stations_per_device = 30  # length of each group's reference sequence


# Generate synthetic data containing every anomaly type. Use normal_data_count = 200 for the default case, or normal_data_count = 3001 to make the anomalies a minority class.
def modified_generate_synthetic_data_with_anomalies(rng,num_devices, num_stations, num_groups, max_stations_per_device,normal_data_count):
    """Build a shuffled frame of anomalous and normal device station sequences.

    Every one of the ``num_devices`` devices 'Device-1' .. 'Device-<num_devices>'
    receives exactly one anomaly, derived from the reference sequence of a
    randomly drawn group. Afterwards ``normal_data_count - 1`` additional
    devices are appended that carry the unmodified reference sequence of their
    group (groups are assigned round-robin).

    The anomaly type is chosen by a chain of independent draws, so the
    effective shares are conditional on the earlier branches being skipped:
    Different Order 20%, Added Stations 8%, Removed Stations 7.2%,
    Moved Subsequence ~13%, Repeated Subsequence ~10.4%, Repeated Station ~41.5%.

    Parameters
    ----------
    rng : numpy.random.Generator
        Source of all randomness; the order of draws is kept stable so a given
        seed always yields the same data.
    num_devices : int
        Number of anomalous devices to generate.
    num_stations : int
        Size of the station pool ('Station-1' .. 'Station-<num_stations>').
    num_groups : int
        Number of groups, each with its own reference sequence.
    max_stations_per_device : int
        Length of every reference sequence (must not exceed ``num_stations``
        because stations are sampled without replacement).
    normal_data_count : int
        One more than the number of normal devices to append.

    Returns
    -------
    (pandas.DataFrame, dict)
        The frame has columns 'Device_ID', 'Group', 'Station_Sequence'.
        The dict maps each anomalous device id to ``(anomaly_name, sequence)``.
    """
    device_ids = [f'Device-{i+1}' for i in range(num_devices)]
    station_ids = [f'Station-{i+1}' for i in range(num_stations)]

    def draw_group_stations():
        # One reference sequence per group, sampled without replacement.
        return {
            group_num: rng.choice(station_ids, size=max_stations_per_device, replace=False).tolist()
            for group_num in range(1, num_groups + 1)
        }

    data = []
    anomalies = {}
    # Drawn lazily right after the first group draw so the random stream (and
    # therefore the data produced for a given seed) stays unchanged.
    group_stations = None

    for device_id in device_ids:
        group = rng.integers(1, num_groups + 1)

        if group_stations is None:
            group_stations = draw_group_stations()

        common_stations = group_stations[group]

        # Anomaly 1: same stations, different order
        if rng.random() < 0.2:
            anomalous_sequence = rng.choice(common_stations, size=max_stations_per_device, replace=False).tolist()
            anomalies[device_id] = ("Different Order", anomalous_sequence)

        # Anomaly 2: two extra stations inserted at a random position
        elif rng.random() < 0.1:
            additional_stations = rng.choice(station_ids, size=2, replace=False).tolist()
            insertion_index = rng.integers(0, len(common_stations) + 1)
            anomalous_sequence = common_stations[:insertion_index] + additional_stations + common_stations[insertion_index:]
            anomalies[device_id] = ("Added Stations", anomalous_sequence)

        # Anomaly 3: two stations removed (the sample is also re-ordered)
        elif rng.random() < 0.1:
            anomalous_sequence = rng.choice(common_stations, size=max_stations_per_device - 2, replace=False).tolist()
            anomalies[device_id] = ("Removed Stations", anomalous_sequence)

        # Anomaly 4: a contiguous subsequence moved to another location
        elif rng.random() < 0.2:
            subsequence_length = max_stations_per_device // 5
            subsequence_start_idx = rng.integers(0, len(common_stations) - subsequence_length + 1)
            subsequence_end_idx = subsequence_start_idx + subsequence_length
            subsequence = common_stations[subsequence_start_idx:subsequence_end_idx]

            common_stations_without_subsequence = [station for station in common_stations if station not in subsequence]

            # NOTE(review): if insertion_index equals subsequence_start_idx the
            # result is identical to the reference sequence yet is still
            # labelled as an anomaly; left as-is to keep seeded output stable.
            insertion_index = rng.integers(0, len(common_stations_without_subsequence) + 1)
            anomalous_sequence = common_stations_without_subsequence[:insertion_index] + subsequence + common_stations_without_subsequence[insertion_index:]
            anomalies[device_id] = ("Moved Subsequence", anomalous_sequence)

        # Anomaly 5: a contiguous subsequence inserted one extra time
        elif rng.random() < 0.2:
            subsequence_length = max_stations_per_device // 5
            subsequence_start_idx = rng.integers(0, max_stations_per_device - subsequence_length + 1)
            subsequence = common_stations[subsequence_start_idx:subsequence_start_idx + subsequence_length]

            insertion_idx = rng.integers(0, max_stations_per_device + 1)

            anomalous_sequence = (
                common_stations[:insertion_idx]
                + subsequence
                + common_stations[insertion_idx:]
            )

            anomalies[device_id] = ("Repeated Subsequence", anomalous_sequence)

        # Anomaly 6: a single station inserted one extra time (all remaining devices)
        else:
            repeated_station = rng.choice(common_stations)
            insertion_idx = rng.integers(0, max_stations_per_device + 1)

            anomalous_sequence = (
                common_stations[:insertion_idx]
                + [repeated_station]
                + common_stations[insertion_idx:]
            )

            anomalies[device_id] = ("Repeated Station", anomalous_sequence)

        data.append([device_id, group, anomalies[device_id][1]])

    # With num_devices == 0 the loop never ran, so the reference sequences
    # still have to be drawn before the normal devices can be built.
    if group_stations is None:
        group_stations = draw_group_stations()

    # Add normal new devices without any anomaly (normal_data_count - 1 of them)
    for i in range(1, normal_data_count):
        group_num = i % num_groups + 1
        device_id = f'Device-{num_devices + i}'
        # Copy so rows do not share one mutable list per group.
        data.append([device_id, group_num, list(group_stations[group_num])])

    # Randomize the order of devices
    rng.shuffle(data)

    columns = ['Device_ID', 'Group', 'Station_Sequence']
    df = pd.DataFrame(data, columns=columns)

    print("Normal Station Sequences for Each Group:")
    for group_num, common_sequence in group_stations.items():
        print(f"Group {group_num}: {common_sequence}")

    return df, anomalies

In [None]:
# import pandas as pd
# import numpy as np

# Generate synthetic data with all anomalies
# Seeds tried so far: 3; 4 (good for 201); 6 (good for 1001, somewhat okay for 2001); 23 (last used, 30/11/23)
rng = np.random.default_rng(23)
normal_data_count = 2001  # alternative that was tried: 3001
num_devices = 1000
num_stations = 100
num_groups = 4
max_stations_per_device = 30
df, anomalies = modified_generate_synthetic_data_with_anomalies(
    rng=rng,
    num_devices=num_devices,
    num_stations=num_stations,
    num_groups=num_groups,
    max_stations_per_device=max_stations_per_device,
    normal_data_count=normal_data_count,
)

# Space-separated station names, e.g. "Station-92 Station-30 ..."
df['Station_Text'] = df['Station_Sequence'].map(' '.join)

# Same text with the "Station-" prefix stripped, leaving only the station numbers
df['Station_No'] = df['Station_Text'].str.replace('Station-', '', regex=False)

# Devices missing from the anomaly dictionary are the normal ones
df['Anomaly_Type'] = [
    anomalies[device_id][0] if device_id in anomalies else 'Normal'
    for device_id in df['Device_ID']
]

display(df)
print(df.shape)

Normal Station Sequences for Each Group:
Group 1: ['Station-92', 'Station-30', 'Station-78', 'Station-47', 'Station-53', 'Station-15', 'Station-70', 'Station-38', 'Station-17', 'Station-62', 'Station-42', 'Station-84', 'Station-33', 'Station-96', 'Station-46', 'Station-45', 'Station-10', 'Station-2', 'Station-76', 'Station-50', 'Station-9', 'Station-63', 'Station-7', 'Station-19', 'Station-31', 'Station-68', 'Station-52', 'Station-86', 'Station-20', 'Station-36']
Group 2: ['Station-69', 'Station-41', 'Station-49', 'Station-47', 'Station-14', 'Station-84', 'Station-48', 'Station-38', 'Station-95', 'Station-68', 'Station-67', 'Station-25', 'Station-4', 'Station-86', 'Station-85', 'Station-70', 'Station-66', 'Station-89', 'Station-8', 'Station-20', 'Station-33', 'Station-83', 'Station-94', 'Station-12', 'Station-40', 'Station-43', 'Station-82', 'Station-1', 'Station-63', 'Station-61']
Group 3: ['Station-97', 'Station-72', 'Station-41', 'Station-13', 'Station-98', 'Station-20', 'Station-30

Unnamed: 0,Device_ID,Group,Station_Sequence,Station_Text,Station_No,Anomaly_Type
0,Device-1416,1,"[Station-92, Station-30, Station-78, Station-4...",Station-92 Station-30 Station-78 Station-47 St...,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
1,Device-789,1,"[Station-92, Station-36, Station-30, Station-7...",Station-92 Station-36 Station-30 Station-78 St...,92 36 30 78 47 53 15 70 38 17 62 42 84 33 96 4...,Repeated Station
2,Device-1516,1,"[Station-92, Station-30, Station-78, Station-4...",Station-92 Station-30 Station-78 Station-47 St...,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-797,2,"[Station-82, Station-83, Station-38, Station-3...",Station-82 Station-83 Station-38 Station-33 St...,82 83 38 33 14 49 66 8 94 40 63 85 43 70 86 47...,Removed Stations
4,Device-1392,1,"[Station-92, Station-30, Station-78, Station-4...",Station-92 Station-30 Station-78 Station-47 St...,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
...,...,...,...,...,...,...
2995,Device-1273,2,"[Station-69, Station-41, Station-49, Station-4...",Station-69 Station-41 Station-49 Station-47 St...,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2996,Device-1786,3,"[Station-97, Station-72, Station-41, Station-1...",Station-97 Station-72 Station-41 Station-13 St...,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
2997,Device-538,3,"[Station-97, Station-72, Station-41, Station-1...",Station-97 Station-72 Station-41 Station-13 St...,97 72 41 13 98 20 36 30 47 78 44 27 74 1 92 33...,Repeated Station
2998,Device-2235,4,"[Station-46, Station-97, Station-100, Station-...",Station-46 Station-97 Station-100 Station-60 S...,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal


(3000, 6)


In [None]:
# Re-display the generated frame (same content as the previous cell's output).
df

Unnamed: 0,Device_ID,Group,Station_Sequence,Station_Text,Station_No,Anomaly_Type
0,Device-1416,1,"[Station-92, Station-30, Station-78, Station-4...",Station-92 Station-30 Station-78 Station-47 St...,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
1,Device-789,1,"[Station-92, Station-36, Station-30, Station-7...",Station-92 Station-36 Station-30 Station-78 St...,92 36 30 78 47 53 15 70 38 17 62 42 84 33 96 4...,Repeated Station
2,Device-1516,1,"[Station-92, Station-30, Station-78, Station-4...",Station-92 Station-30 Station-78 Station-47 St...,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-797,2,"[Station-82, Station-83, Station-38, Station-3...",Station-82 Station-83 Station-38 Station-33 St...,82 83 38 33 14 49 66 8 94 40 63 85 43 70 86 47...,Removed Stations
4,Device-1392,1,"[Station-92, Station-30, Station-78, Station-4...",Station-92 Station-30 Station-78 Station-47 St...,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
...,...,...,...,...,...,...
2995,Device-1273,2,"[Station-69, Station-41, Station-49, Station-4...",Station-69 Station-41 Station-49 Station-47 St...,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2996,Device-1786,3,"[Station-97, Station-72, Station-41, Station-1...",Station-97 Station-72 Station-41 Station-13 St...,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
2997,Device-538,3,"[Station-97, Station-72, Station-41, Station-1...",Station-97 Station-72 Station-41 Station-13 St...,97 72 41 13 98 20 36 30 47 78 44 27 74 1 92 33...,Repeated Station
2998,Device-2235,4,"[Station-46, Station-97, Station-100, Station-...",Station-46 Station-97 Station-100 Station-60 S...,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal


# Implementation

Create the following:
* Event Set (E), Necessary Event Set (En), and Loop Pattern Set (LP); then run the sequence processing subroutine on the LP set to obtain the final loop patterns.

In [None]:
# Keep only the columns the detector needs: device id, station-number text and the ground-truth label.
complete_data = df.loc[:, ['Device_ID', 'Station_No', 'Anomaly_Type']]
complete_data

Unnamed: 0,Device_ID,Station_No,Anomaly_Type
0,Device-1416,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
1,Device-789,92 36 30 78 47 53 15 70 38 17 62 42 84 33 96 4...,Repeated Station
2,Device-1516,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-797,82 83 38 33 14 49 66 8 94 40 63 85 43 70 86 47...,Removed Stations
4,Device-1392,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
...,...,...,...
2995,Device-1273,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2996,Device-1786,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
2997,Device-538,97 72 41 13 98 20 36 30 47 78 44 27 74 1 92 33...,Repeated Station
2998,Device-2235,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal


In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed random_state; features are (Device_ID, Station_No), the target is Anomaly_Type.
X_train, X_test, y_train, y_test = train_test_split(complete_data.loc[:, ['Device_ID', 'Station_No']], complete_data.loc[:, ['Anomaly_Type']], test_size=0.30, random_state=42)

# Re-attach the labels so each split is a single frame again.
complete_train_data = pd.concat([X_train, y_train], axis=1)
complete_test_data = pd.concat([X_test,y_test], axis = 1)

# Later cells index rows by position (0..n-1), so drop the shuffled original index.
complete_test_data = complete_test_data.reset_index(drop=True)
complete_train_data = complete_train_data.reset_index(drop=True)

In [None]:
# Alias (not a copy) of the training split.
train_data = complete_train_data
train_data

Unnamed: 0,Device_ID,Station_No,Anomaly_Type
0,Device-1315,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal
1,Device-2534,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
2,Device-1320,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-655,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Repeated Station
4,Device-1129,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
...,...,...,...
2095,Device-1521,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2096,Device-2321,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2097,Device-1525,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2098,Device-2771,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal


Filter the training data so that only rows whose `Anomaly_Type` is 'Normal' remain.

In [None]:
# Keep only the 'Normal' training rows.
data = train_data[train_data['Anomaly_Type'] == 'Normal']
# Reset to a 0..n-1 index because the following cells access rows as data['Station_No'][i].
data = data.reset_index(drop=True)
data

Unnamed: 0,Device_ID,Station_No,Anomaly_Type
0,Device-1315,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal
1,Device-2534,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
2,Device-1320,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-1129,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
4,Device-2328,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
...,...,...,...
1401,Device-1414,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
1402,Device-1521,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
1403,Device-2321,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
1404,Device-1525,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal


In [None]:
E = []  # Event Set: every distinct station number seen in the normal training data
En = []  # Necessary Event Set: events that occur in every normal training sequence

Creating the Event Set and Necessary Event Set

In [None]:
# Tokenise every normal training sequence exactly once.
sequences = [station_no.split(' ') for station_no in data['Station_No']]

# Event Set E: distinct events in first-seen order. Rebuilt from scratch so
# re-running this cell cannot accumulate stale or duplicate entries.
E = list(dict.fromkeys(event for sequence in sequences for event in sequence))
K = len(E) # Cardinality of Event Set E

# Necessary Event Set En: events that occur in every training sequence.
# Set membership avoids re-splitting each row once per event.
sequence_sets = [set(sequence) for sequence in sequences]
En = [event for event in E if all(event in seen for seen in sequence_sets)]

In [None]:
# Show the event set together with its size.
print(E, len(E))

['46', '97', '100', '60', '84', '8', '49', '65', '13', '77', '5', '7', '36', '55', '69', '40', '88', '66', '83', '24', '76', '14', '4', '72', '30', '79', '86', '6', '41', '95', '98', '20', '47', '78', '44', '27', '74', '1', '92', '33', '63', '32', '42', '64', '56', '28', '26', '12', '10', '90', '53', '15', '70', '38', '17', '62', '96', '45', '2', '50', '9', '19', '31', '68', '52', '48', '67', '25', '85', '89', '94', '43', '82', '61'] 74


### Subroutine 1

In [None]:
def iterate_substrings(stations):
  """Return every contiguous slice of `stations` that has at least two items.

  Slices are ordered by increasing length and, within one length, by
  increasing start position. Inputs shorter than two items give an empty list.
  """
  total = len(stations)
  return [
      stations[start:start + width]
      for width in range(2, total + 1)
      for start in range(total - width + 1)
  ]

In [None]:
def frequency_of_subseq(di, D): # di is the subsequence and D is the main sequence
  """Count the (possibly overlapping) positions at which `di` occurs contiguously in `D`."""
  width = len(di)
  last_start = len(D) - width
  return sum(1 for start in range(last_start + 1) if D[start:start + width] == di)

In [None]:
# Candidate Pattern Extracting
Map = {}
for i in range(len(data)):
  substring_set = iterate_substrings(data['Station_No'][i].split(' '))
  for sub_seq in substring_set:
    num = frequency_of_subseq(sub_seq, data['Station_No'][i].split(' '))
    sub_seq_str = ' '.join(sub_seq) # Combining the array into a single string to store it inside a dictionary
    if num > 1 and sub_seq_str not in Map:
      Map[sub_seq_str] = 0
      Map[sub_seq_str] = Map[sub_seq_str] + len(sub_seq)*num # Calculating the importance of a sequence

C = sorted(Map.items(), key=lambda x:x[1])
print(C, len(C)) # Ordered Candidate Pattern Set

[] 0


### Subroutine 2

In [None]:
# Pattern Learning Subroutine
LP = [] # Loop Pattern Set
for i in range(len(C)):
  if C[i] != -1:
    for j in range(i+1, len(C)):
      if frequency_of_subseq(C[i], C[j]) != 0: # if C[i] is a subset of C[j]
        C[j] = -1
    if C[i] != -1:
      LP.append(C[i])
LP_dict = {item[0]: item[1] for item in LP}  # Convertin LP (List) to LP_dict(Dictionary) for simpler deletion in subroutine 2 (2nd Part)
print(LP_dict, len(LP_dict))

{} 0


### Subroutine 3

In [None]:
def change_subseq_if_exists(di, D): # di is the subsequence and D is the main sequence
  """Collapse every contiguous occurrence of `di` inside `D` into one token.

  The replacement token is the space-joined text of `di` (e.g. ['1', '2'] ->
  '1 2'), which is the same spelling the loop patterns use as dictionary keys
  and relation-table labels. `D` is modified in place and also returned.
  An empty `di` leaves `D` untouched.
  """
  target_length = len(di)
  if target_length == 0:
    return D

  merged_token = ' '.join(di)
  i = 0
  # Re-evaluate len(D) on every step because D shrinks after each replacement.
  while i + target_length <= len(D):
    if D[i:i + target_length] == di:
      D[i:i + target_length] = [merged_token]
    i += 1

  return D

In [None]:
def replace_pattern(LP, sigma): # LP is a (key, importance) tuple and sigma is the main sequence
  """Collapse occurrences of the pattern `LP[0]` in `sigma`; single-token patterns are a no-op.

  NOTE(review): when a plain string is passed instead of a tuple, LP[0] is its
  first character, so nothing is ever replaced - confirm that callers passing
  strings rely on this.
  """
  pattern_tokens = LP[0].split(' ')
  if len(pattern_tokens) <= 1:
    return sigma
  return change_subseq_if_exists(pattern_tokens, sigma)

In [None]:
# Sequence Processing Subroutine
def sequence_processing_subroutine(sig, S, E):
  if len(S) > 0:
    sigma = [[]]*len(S)
    sigma[0] = sig
    SL = []
    for i in range(1, len(S)):
      sigma[i] = replace_pattern(S[i], sigma[i-1])

    return sigma[len(S)-1]
  else:
    return sig

### Part 2 of Subroutine 2


In [None]:
def is_sequential(lpi, SL): # lpi is a pattern from LP; SL is the processed (split) sequence
  """Return True when `lpi` occurs contiguously in `SL` at a start index below len(SL) - 1."""
  width = len(lpi)
  for start in range(len(SL) - 1):
    if SL[start:start + width] == lpi:
      return True
  return False

In [None]:
# Drop every loop pattern that is not found contiguously in some processed training sequence.
for station_no in data['Station_No']:
  SL = sequence_processing_subroutine(station_no.split(' '), LP, E)
  for pattern_key, _importance in LP:
    if is_sequential(pattern_key.split(' '), SL):
      continue
    # The pattern may already have been removed while handling an earlier row.
    if LP_dict.get(pattern_key) is not None:
      del LP_dict[pattern_key]
print(LP_dict)

{}


In [None]:
# Surviving loop patterns as a list of their space-joined keys.
LP_list = list(LP_dict.keys())
LP_list

[]

### Subroutine 4

In [None]:
# Build a Relation Table
P = list(set(E).union(set(LP_list))) # First Define a Pattern Set which is the union of the loop pattern set(LP_list) and the basic event set(E).
T = [[None]*len(P)]*len(P)
df_T = pd.DataFrame(T, columns=P, index=P) # Initialize the column names and row names with the events in the Pattern Set
# Now build a pattern Table T
# Pattern Table Building Subroutine

for i in range(len(data)):
  SL = sequence_processing_subroutine(data['Station_No'][i].split(' '), P, E)  # SL contains the elements in ' data['Station_No'][i].split(' ') ' this sequence
  for j in range(1, len(SL)):
    if SL[j-1] == SL[j]:
      df_T[SL[j-1]][SL[j]] = '*'

    if df_T[SL[j-1]][SL[j]] == None or df_T[SL[j-1]][SL[j]] == '->' and df_T[SL[j]][SL[j-1]] == None:
      df_T[SL[j-1]][SL[j]] = '->'

    if df_T[SL[j-1]][SL[j]] == None or df_T[SL[j-1]][SL[j]] == '->' and df_T[SL[j]][SL[j-1]] == '->':
      df_T[SL[j-1]][SL[j]] == '||'
      df_T[SL[j]][SL[j-1]] == '||'

print(df_T)

       7    85    62    10    97    44    92     5    46    26  ...    95  \
7   None  None  None  None  None  None  None    ->  None  None  ...  None   
85  None  None  None  None  None  None  None  None  None  None  ...  None   
62  None  None  None  None  None  None  None  None  None  None  ...  None   
10  None  None  None  None  None  None  None  None  None  None  ...  None   
97  None  None  None  None  None  None  None  None    ->  None  ...  None   
..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
79  None  None  None  None  None  None  None  None  None  None  ...  None   
12  None  None  None  None  None  None  None  None  None    ->  ...  None   
2   None  None  None    ->  None  None  None  None  None  None  ...  None   
83  None  None  None  None  None  None  None  None  None  None  ...  None   
30  None  None  None  None  None  None    ->  None  None  None  ...  None   

      98    45     1    24    79    12     2    83    30  
7   None  None  

### Subroutine 5

In [None]:
# Anomaly Detection of any sequence sigma
def Anomaly_Detection(sigma, LP, E):
  """Classify the token sequence `sigma` as 'Normal' or 'Abnormal'.

  Reads the module-level `En`, `P` and `df_T`; the `LP` argument is not used.
  A sequence is 'Abnormal' when it misses a necessary event from `En`,
  contains an event outside `E`, or has a consecutive pair whose relation
  table cell is empty.
  """
  observed = set(sigma)

  # Event-level check: all necessary events present, no unknown events.
  if not set(En).issubset(observed) or not observed.issubset(set(E)):
    return 'Abnormal'

  # Transition-level check: every adjacent pair must have been seen in training.
  SL = sequence_processing_subroutine(sigma, P, E)
  for previous, current in zip(SL, SL[1:]):
    if df_T[previous][current] is None:
      return 'Abnormal'

  return 'Normal'

In [None]:
# Alias (not a copy): in-place edits to test_data also change complete_test_data.
test_data = complete_test_data
test_data

Unnamed: 0,Device_ID,Station_No,Anomaly_Type
0,Device-2759,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal
1,Device-1517,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2,Device-2232,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-1614,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
4,Device-80,76 13 78 69 98 12 27 56 63 26 36 30 32 28 41 4...,Different Order
...,...,...,...
895,Device-2440,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
896,Device-2041,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
897,Device-1276,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
898,Device-817,97 72 41 13 98 20 30 47 78 44 27 74 63 1 92 33...,Repeated Station


In [None]:
# Predicted labels live in their own frame so the ground truth in test_data stays untouched.
new_data = test_data[['Device_ID', 'Station_No']].copy()

# Assign the whole column at once: element-wise chained assignment
# (new_data['Anomaly_Type'][i] = ...) raises SettingWithCopyWarning and may
# silently write to a temporary copy.
new_data['Anomaly_Type'] = [
    Anomaly_Detection(station_no.split(' '), LP_list, E)
    for station_no in new_data['Station_No']
]

new_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['Anomaly_Type'][i] = Anomaly_Detection(new_data['Station_No'][i].split(' '), LP_list, E)


Unnamed: 0,Device_ID,Station_No,Anomaly_Type
0,Device-2759,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal
1,Device-1517,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2,Device-2232,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-1614,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
4,Device-80,76 13 78 69 98 12 27 56 63 26 36 30 32 28 41 4...,Abnormal
...,...,...,...
895,Device-2440,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
896,Device-2041,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
897,Device-1276,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
898,Device-817,97 72 41 13 98 20 30 47 78 44 27 74 63 1 92 33...,Abnormal


In [None]:
# Collapse every specific anomaly type into the single class 'Abnormal' so the
# ground truth uses the same two labels as the detector. One .loc assignment
# replaces the row-by-row chained assignment, which can write to a copy.
test_data.loc[test_data['Anomaly_Type'] != 'Normal', 'Anomaly_Type'] = 'Abnormal'
test_data

Unnamed: 0,Device_ID,Station_No,Anomaly_Type
0,Device-2759,46 97 100 60 84 8 49 65 13 77 5 7 36 55 69 40 ...,Normal
1,Device-1517,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
2,Device-2232,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
3,Device-1614,97 72 41 13 98 20 30 47 78 44 27 74 1 92 33 63...,Normal
4,Device-80,76 13 78 69 98 12 27 56 63 26 36 30 32 28 41 4...,Abnormal
...,...,...,...
895,Device-2440,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
896,Device-2041,69 41 49 47 14 84 48 38 95 68 67 25 4 86 85 70...,Normal
897,Device-1276,92 30 78 47 53 15 70 38 17 62 42 84 33 96 46 4...,Normal
898,Device-817,97 72 41 13 98 20 30 47 78 44 27 74 63 1 92 33...,Abnormal


In [None]:
# Ground truth (already collapsed to Normal/Abnormal) and the detector's predictions.
true_labels = test_data['Anomaly_Type']
predicted_labels = new_data['Anomaly_Type']

In [None]:
# Number of test rows where the prediction agrees with the ground truth.
count = sum(1 for truth, guess in zip(true_labels, predicted_labels) if truth == guess)
print(count)

899


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 99.89%
