In [26]:
import pandas as pd
from difflib import SequenceMatcher
import itertools
import re
import plotly.express as px
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
columns = ['ticket_no','title','type','configuration','level']

data = [
  [1, "Server: ctil-sqlssis02.ctil.local Disk Free Space: F:", "Space Issue","ctil-sqlssis02.ctil.local","parent"], 
  [2, "Server: UKCPPBNODE4.rws.com Disk Free Space: G:", "Space Issue","cUKCPPBNODE4.rws.com","parent"], 
  [3, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "","LDSQLMID01.rimes.dir","child"], 
  [4, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","UKCPPBNODE4.rws.com","parent"], 
  [5, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","ctil-sqlssis01.ctil.local","parent"], 
  [6, "RCA: BSS-DES-SQL07\BSSERP  Availability Group [AlwaysOn replica is not in the PRIMARY or SECONDARY]", "HA-DR","BSS-DES-SQL07\BSSERP","parent"], 
  [7, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","","child"], 
  [8, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","ctil-sqlssis02.ctil.local","parent"], 
  [9, "PagerDuty: Server: BSS-DEP-SQL07\BSSERP AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","","parent"], 
  [10, "Server: LDSQLMID01.rimes.dir Disk Free Space: F:", "Space Issue","LDSQLMID01.rimes.dir","parent"],   
  [11, "Server: TGSQLCLUST02.trakglobal.local Disk Free Space: Cluster Disk 16_K:", "Space Issue","TGSQLCLUST02.trakglobal.local","parent"], 
  [12, "Server: SOUSQ01L.ITSLAW.CO.UK Disk Free Space: G:", "Space Issue","SOUSQ01L.ITSLAW.CO.UK","parent"], 
  [13, "RCA - AlwaysOn replica is not in the PRIMARY or SECONDARY role on  PUBLICSQLF", "Service Failure","","parent"], 
  [14, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "Service Failure","","parent"], 
  [15, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","","child"], 
  [16, "PagerDuty: Server: LDSQLMID01.rimes.dir AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "","LDSQLMID01.rimes.dir","parent"], 
  [17, "PagerDuty: Server:  AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","","parent"], 
  [18, "PagerDuty: Server: UKCPPBNODE4.rws.com AlwaysOn replica is not in the PRIMARY or SECONDARY role - custom", "HA-DR","UKCPPBNODE4.rws.com ","parent"], 
]

In [3]:
df = pd.DataFrame(data=data, columns=columns)
df.set_index('ticket_no',inplace=True)
df.title = df.title.str.lower()
df

Unnamed: 0_level_0,title,type,configuration,level
ticket_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,server: ctil-sqlssis02.ctil.local disk free sp...,Space Issue,ctil-sqlssis02.ctil.local,parent
2,server: ukcppbnode4.rws.com disk free space: g:,Space Issue,cUKCPPBNODE4.rws.com,parent
3,pagerduty: server: alwayson replica is not in...,,LDSQLMID01.rimes.dir,child
4,pagerduty: server: alwayson replica is not in...,HA-DR,UKCPPBNODE4.rws.com,parent
5,pagerduty: server: alwayson replica is not in...,HA-DR,ctil-sqlssis01.ctil.local,parent
6,rca: bss-des-sql07\bsserp availability group ...,HA-DR,BSS-DES-SQL07\BSSERP,parent
7,pagerduty: server: alwayson replica is not in...,HA-DR,,child
8,pagerduty: server: alwayson replica is not in...,HA-DR,ctil-sqlssis02.ctil.local,parent
9,pagerduty: server: bss-dep-sql07\bsserp always...,HA-DR,,parent
10,server: ldsqlmid01.rimes.dir disk free space: f:,Space Issue,LDSQLMID01.rimes.dir,parent


In [4]:
# DataFrame of only parent tickets
df2 = df[df['level']=='parent']

In [5]:
# group by titles and type to see if any are ambiguous
grouped_df = df2.groupby(['title','type'],as_index=False).count()
grouped_df.rename(columns={'level':'count'},inplace=True)
grouped_df

Unnamed: 0,title,type,configuration,count
0,pagerduty: server: alwayson replica is not in...,HA-DR,4,4
1,pagerduty: server: alwayson replica is not in...,Service Failure,1,1
2,pagerduty: server: bss-dep-sql07\bsserp always...,HA-DR,1,1
3,pagerduty: server: ldsqlmid01.rimes.dir always...,,1,1
4,pagerduty: server: ukcppbnode4.rws.com alwayso...,HA-DR,1,1
5,rca - alwayson replica is not in the primary o...,Service Failure,1,1
6,rca: bss-des-sql07\bsserp availability group ...,HA-DR,1,1
7,server: ctil-sqlssis02.ctil.local disk free sp...,Space Issue,1,1
8,server: ldsqlmid01.rimes.dir disk free space: f:,Space Issue,1,1
9,server: sousq01l.itslaw.co.uk disk free space: g:,Space Issue,1,1


In [6]:
grouped_grouped_df = grouped_df.groupby('title').count()
count_df = grouped_grouped_df[grouped_grouped_df['type']>1]
count_df[['count']]

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
pagerduty: server: alwayson replica is not in the primary or secondary role - custom,2


In [7]:
# Check if we need .  and \ and any others
df2['configuration_stripped'] = df2['configuration'].str.split(pat=".").str[0]

In [8]:
def f(title, replace1, replace2):
    x = title.replace(replace1, '').replace(replace2, '')
    return x

In [9]:
df2['title_stripped'] = df2.apply(lambda x: f(x['title'], x['configuration'], x['configuration_stripped']),axis=1)

In [10]:
df2['title_stripped'] = df2['title_stripped'] \
.apply(lambda x: x.replace('server: ','')) \
.apply(lambda x: x.replace('pagerduty: ','')) \
.apply(lambda x: x.replace('rca: ','')) \
.apply(lambda x: x.replace('rca - ','')) \
.apply(lambda x: x.strip())
df2['title_stripped']

ticket_no
1                                   disk free space: f:
2               ukcppbnode4.rws.com disk free space: g:
4     alwayson replica is not in the primary or seco...
5     alwayson replica is not in the primary or seco...
6     bss-des-sql07\bsserp  availability group [alwa...
8     alwayson replica is not in the primary or seco...
9     bss-dep-sql07\bsserp alwayson replica is not i...
10             ldsqlmid01.rimes.dir disk free space: f:
11    tgsqlclust02.trakglobal.local disk free space:...
12            sousq01l.itslaw.co.uk disk free space: g:
13    alwayson replica is not in the primary or seco...
14    alwayson replica is not in the primary or seco...
16    ldsqlmid01.rimes.dir alwayson replica is not i...
17    alwayson replica is not in the primary or seco...
18    ukcppbnode4.rws.com alwayson replica is not in...
Name: title_stripped, dtype: object

In [11]:
# group by titles and type to see if any are ambiguous
grouped_df = df2.groupby(['title_stripped','type'],as_index=False).count()
grouped_df.rename(columns={'level':'count'})
with pd.option_context('display.max_colwidth', None):
  display(grouped_df)

Unnamed: 0,title_stripped,type,title,configuration,level,configuration_stripped
0,alwayson replica is not in the primary or secondary role - custom,HA-DR,4,4,4,4
1,alwayson replica is not in the primary or secondary role - custom,Service Failure,1,1,1,1
2,alwayson replica is not in the primary or secondary role on publicsqlf,Service Failure,1,1,1,1
3,bss-dep-sql07\bsserp alwayson replica is not in the primary or secondary role - custom,HA-DR,1,1,1,1
4,bss-des-sql07\bsserp availability group [alwayson replica is not in the primary or secondary],HA-DR,1,1,1,1
5,disk free space: f:,Space Issue,1,1,1,1
6,ldsqlmid01.rimes.dir alwayson replica is not in the primary or secondary role - custom,,1,1,1,1
7,ldsqlmid01.rimes.dir disk free space: f:,Space Issue,1,1,1,1
8,sousq01l.itslaw.co.uk disk free space: g:,Space Issue,1,1,1,1
9,tgsqlclust02.trakglobal.local disk free space: cluster disk 16_k:,Space Issue,1,1,1,1


In [12]:
grouped_grouped_df = grouped_df.groupby('title_stripped').count()
count_df = grouped_grouped_df[grouped_grouped_df['type']>1]
count_df = count_df[['type']]
count_df.rename(columns={'type':'count'}, inplace='True')
count_df

Unnamed: 0_level_0,count
title_stripped,Unnamed: 1_level_1
alwayson replica is not in the primary or secondary role - custom,2


In [13]:
df2['title_stripped'] = df2['title_stripped'].str.split(pat=".local").str[-1]
df2['title_stripped'] = df2['title_stripped'].str.split(pat=".com").str[-1]
df2['title_stripped'] = df2['title_stripped'].str.split(pat=".co.uk").str[-1]
df2['title_stripped'] = df2['title_stripped'].str.strip()
with pd.option_context('display.max_colwidth', None):
  display(df2['title_stripped'])

ticket_no
1                                                                                disk free space: f:
2                                                                                disk free space: g:
4                                  alwayson replica is not in the primary or secondary role - custom
5                                  alwayson replica is not in the primary or secondary role - custom
6     bss-des-sql07\bsserp  availability group [alwayson replica is not in the primary or secondary]
8                                  alwayson replica is not in the primary or secondary role - custom
9             bss-dep-sql07\bsserp alwayson replica is not in the primary or secondary role - custom
10                                                          ldsqlmid01.rimes.dir disk free space: f:
11                                                               disk free space: cluster disk 16_k:
12                                                                               

In [14]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

common_titles = list(df2['title_stripped'])

cols_to_add = len(common_titles)
for count, value in enumerate(common_titles):
    df2['similarity_col_' + str(count)] = df2['title_stripped'].apply(lambda x: similar(x,value))

In [15]:
common_titles_matrix = df2.copy()
common_titles_matrix = common_titles_matrix[['title_stripped']].drop_duplicates()
common_titles = list(common_titles_matrix['title_stripped'])

cols_to_add = len(common_titles)
for count, value in enumerate(common_titles):
    common_titles_matrix[str(value)] = common_titles_matrix['title_stripped'].apply(lambda x: similar(x,value))
    
common_titles_matrix.set_index('title_stripped',inplace=True)

In [21]:
common_titles_matrix

Unnamed: 0_level_0,disk free space: f:,disk free space: g:,alwayson replica is not in the primary or secondary role - custom,bss-des-sql07\bsserp availability group [alwayson replica is not in the primary or secondary],bss-dep-sql07\bsserp alwayson replica is not in the primary or secondary role - custom,ldsqlmid01.rimes.dir disk free space: f:,disk free space: cluster disk 16_k:,alwayson replica is not in the primary or secondary role on publicsqlf,ldsqlmid01.rimes.dir alwayson replica is not in the primary or secondary role - custom
title_stripped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
disk free space: f:,1.0,0.947368,0.190476,0.141593,0.171429,0.644068,0.666667,0.2,0.209524
disk free space: g:,0.947368,1.0,0.190476,0.141593,0.171429,0.610169,0.666667,0.177778,0.209524
alwayson replica is not in the primary or secondary role - custom,0.214286,0.214286,1.0,0.641509,0.860927,0.266667,0.28,0.882353,0.860927
bss-des-sql07\bsserp availability group [alwayson replica is not in the primary or secondary],0.159292,0.088496,0.641509,1.0,0.788889,0.19403,0.155039,0.618182,0.655556
bss-dep-sql07\bsserp alwayson replica is not in the primary or secondary role - custom,0.171429,0.171429,0.860927,0.788889,1.0,0.238095,0.231405,0.764331,0.848837
ldsqlmid01.rimes.dir disk free space: f:,0.644068,0.610169,0.285714,0.179104,0.253968,1.0,0.48,0.108108,0.460317
disk free space: cluster disk 16_k:,0.666667,0.666667,0.26,0.186047,0.231405,0.48,1.0,0.245283,0.264463
alwayson replica is not in the primary or secondary role on publicsqlf,0.222222,0.2,0.852941,0.618182,0.738854,0.27027,0.188679,1.0,0.738854
ldsqlmid01.rimes.dir alwayson replica is not in the primary or secondary role - custom,0.190476,0.190476,0.860927,0.644444,0.837209,0.47619,0.247934,0.764331,1.0


In [43]:
fig = px.imshow(common_titles_matrix)
fig.update_xaxes(side="top")
fig.update_layout(width=400, height=400, margin=dict(l=10, r=10, b=10, t=10))
fig.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)
fig.show()
print(list(range(common_titles_matrix.shape[0])))

[0, 1, 2, 3, 4, 5, 6, 7, 8]


In [17]:
# Concatenate/cluster stripped titles
columns = list(common_titles_matrix.columns)

sub = []
for row, ind in common_titles_matrix.iterrows():
    for pos, col in enumerate(columns):
        if ind[col] < 1 and ind[col] > 0.8:
            sub.append([row,columns[pos]])
            
sub2 = []
for i in sub:
    sub2.append(sorted(i))
sub2.sort()

sub3 = list(sub2 for sub2,_ in itertools.groupby(sub2))
sub3

[['alwayson replica is not in the primary or secondary role - custom',
  'alwayson replica is not in the primary or secondary role on  publicsqlf'],
 ['alwayson replica is not in the primary or secondary role - custom',
  'bss-dep-sql07\\bsserp alwayson replica is not in the primary or secondary role - custom'],
 ['alwayson replica is not in the primary or secondary role - custom',
  'ldsqlmid01.rimes.dir alwayson replica is not in the primary or secondary role - custom'],
 ['bss-dep-sql07\\bsserp alwayson replica is not in the primary or secondary role - custom',
  'ldsqlmid01.rimes.dir alwayson replica is not in the primary or secondary role - custom'],
 ['disk free space: f:', 'disk free space: g:']]

In [25]:
from pprint import pprint
# https://stackoverflow.com/questions/38862657/find-value-greater-than-level-python-pandas
def long_substr(data):
    substrs = lambda x: {x[i:i+j] for i in range(len(x)) for j in range(len(x) - i + 1)}
    s = substrs(data[0])
    for val in data[1:]:
        s.intersection_update(substrs(val))
    return max(s, key=len)

sub4 = []
for i in sub3:
    sub4.append(long_substr(i))
sub4 = [i.strip() for i in sub4]

sub4 = list(set(sub4))
pprint(sub4)

['alwayson replica is not in the primary or secondary role',
 'alwayson replica is not in the primary or secondary role - custom',
 'disk free space:']


In [19]:
def ratios(title):
    df3 = df2[df2['title_stripped'].str.contains(title)][['type','title']].groupby('type').count()
    df3.rename(columns={'title':'count'},inplace=True)
    return df3

sub4 = [i.strip() for i in sub4]

sub4 = list(set(sub4))

for i in sub4:
    print(i)
    df4 = ratios(i)
    print(df4['count'].max())
    print(df4['count'].sum())
    print()

alwayson replica is not in the primary or secondary role
6
9

alwayson replica is not in the primary or secondary role - custom
6
8

disk free space:
5
5



In [20]:
# Heatmap to show
# group cols - we want a linkage between related stripped titles.
# def relation_matrix(titles):

titles = sub4
df_x = pd.DataFrame(titles)

for i in titles:
    print()    
    print('compare:',i)
    for j in titles:
        print(j)
    # common_titles_matrix[str(value)] = common_titles_matrix['title_stripped'].apply(lambda x: similar(x,value))
        df_x[str(i)] = similar(j,i)
        print(similar(j,i))
    
    
df_x



compare: alwayson replica is not in the primary or secondary role
alwayson replica is not in the primary or secondary role
1.0
alwayson replica is not in the primary or secondary role - custom
0.9256198347107438
disk free space:
0.2222222222222222

compare: alwayson replica is not in the primary or secondary role - custom
alwayson replica is not in the primary or secondary role
0.9256198347107438
alwayson replica is not in the primary or secondary role - custom
1.0
disk free space:
0.19753086419753085

compare: disk free space:
alwayson replica is not in the primary or secondary role
0.25
alwayson replica is not in the primary or secondary role - custom
0.2222222222222222
disk free space:
1.0


Unnamed: 0,0,alwayson replica is not in the primary or secondary role,alwayson replica is not in the primary or secondary role - custom,disk free space:
0,alwayson replica is not in the primary or seco...,0.222222,0.197531,1.0
1,alwayson replica is not in the primary or seco...,0.222222,0.197531,1.0
2,disk free space:,0.222222,0.197531,1.0


In [None]:
# Apply tag for max(similarity_col)