In [1]:
import pandas as pd

In [2]:
# Load the two sequence sets. The first file is read with pandas defaults (its
# first row supplies the column names); later cells expect a `seq` column in it.
unknotted = pd.read_csv("./Rossmann_unknotted.csv")
# For the second file the first line is skipped and the column is named 'seq'
# explicitly -- presumably that line is a header row; confirm against the file.
knotted = pd.read_csv("./SPOUT_knotted.csv", skiprows=1, names=['seq'])

# Row counts, knotted first.
knotted.shape[0], unknotted.shape[0]

(140298, 108146)

In [3]:
def count_pattern(pattern, multiple_matches=False, unknotted_df=None, knotted_df=None):
    """Count regex matches of ``pattern`` in the unknotted and knotted sequences.

    Parameters
    ----------
    pattern : str
        Regular expression applied to every entry of the ``seq`` column.
    multiple_matches : bool, default False
        If True, return the total number of (non-overlapping) matches summed
        over all sequences, so one sequence can contribute more than once.
        If False, return the number of sequences with at least one match.
    unknotted_df, knotted_df : pandas.DataFrame, optional
        Frames holding a ``seq`` column. When omitted, the notebook-level
        ``unknotted`` and ``knotted`` frames are used.

    Returns
    -------
    tuple
        ``(unknotted_count, knotted_count)`` -- unknotted first, which is the
        reverse of the ``len(knotted), len(unknotted)`` order shown at load time.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if unknotted_df is None:
        unknotted_df = unknotted
    if knotted_df is None:
        knotted_df = knotted

    def _tally(frame):
        seqs = frame['seq']
        if multiple_matches:
            return seqs.str.count(pattern).sum()
        # A presence test does not need every match counted: contains() stops
        # at the first hit. na=False treats a missing sequence as "no match",
        # exactly as the comparison `count(...) > 0` does.
        return seqs.str.contains(pattern, regex=True, na=False).sum()

    return _tally(unknotted_df), _tally(knotted_df)
        

# Total number of 'SLN' matches over all sequences, returned as (unknotted, knotted).
count_pattern(pattern=r'SLN', multiple_matches=True)

(8399, 67579)

In [4]:
# Number of sequences with at least one 'G.E..GL' match, returned as (unknotted, knotted).
count_pattern(pattern=r'G.E..GL', multiple_matches=False)

(1058, 42933)

## Sequence-logo motifs as features

In [5]:
# Regex motifs used as candidate features ('.' stands for any single character).
# In the counts recorded for this cell, the first ten motifs are more frequent
# in the knotted set and the last ten in the unknotted set.
patterns = [
    r'N.G...R',
    r'SLN',
    r'G.E..GL',
    r'DD...GGG.GM',
    r'LL..P.YTRP',
    r'S.GD.VL.GGE',
    r'EQ.YR',
    r'[RK][AV]S[MA]G',
    r'CG.YEG.D.R',
    r'I..WR',
    r'G.FA....D',
    r'GSV',
    r'G.G.DE',
    r'SGG.DS',
    r'D.S.....A',
    r'DL.V.G..G',
    r'E.R.P.LD',
    r'GISG...H',
    r'HNR.R',
    r'NGE.YN',
]

# One line per motif: the pattern followed by its (unknotted, knotted)
# counts of sequences containing at least one match.
for p in patterns:
    print(p, count_pattern(p))

N.G...R (3605, 71485)
SLN (7722, 64787)
G.E..GL (1058, 42933)
DD...GGG.GM (0, 15943)
LL..P.YTRP (0, 10906)
S.GD.VL.GGE (0, 12972)
EQ.YR (188, 11297)
[RK][AV]S[MA]G (292, 9697)
CG.YEG.D.R (0, 10905)
I..WR (2106, 14859)
G.FA....D (4623, 366)
GSV (22886, 12818)
G.G.DE (8643, 439)
SGG.DS (22158, 136)
D.S.....A (28378, 6717)
DL.V.G..G (5748, 46)
E.R.P.LD (5053, 19)
GISG...H (2503, 23)
HNR.R (2148, 38)
NGE.YN (4372, 3)


In [6]:
# Class label column: 0 = unknotted, 1 = knotted.
unknotted['knot'] = 0
knotted['knot'] = 1

# Stack both classes (knotted rows first) under a fresh 0..N-1 index, then
# shuffle the rows reproducibly. The shuffled frame keeps those pre-shuffle
# row numbers as its index.
df = (
    pd.concat([knotted, unknotted], ignore_index=True)
    .sample(frac=1, random_state=42)
)
df

Unnamed: 0,seq,knot
49580,MNMLFLDPHDIGSPLPRADARVRHVLKVLKKGPGDELEAGTPDGML...,1
161330,MFRSKYIFETKRTKFFRYFKNTLFVLTTFLLIYLLLGISFIRFSDD...,0
65505,MTRRYYCPSLPQFGGNVVLQGHEAQHAIRVMRVKPDDAITLFDGQG...,1
75843,MLSEKEKNKIRSLSPNRRIEILYGLLRSNEKELSDFEAKDFLKAQF...,1
21837,MIKVYVIGKPKTKFIKTGLEQYLKWTSKYDRVELITLPLSDDLNKI...,1
...,...,...
119879,MADLIALDDPRDERLRDYTDLRDVQLRQSVERERGIYIAEGTKVIE...,1
103694,MDLRIALYQPDIPGNTGTILRMAACLGFAVDLIEPAGFDVSDRSLK...,1
131932,METSKSSARQRADSIRPYRCKNLIAVLENPIDIRNIGTVIRNVNAL...,1
146867,MSYRRDLTTQEAVIARMTATMARRGPDAGGVWIDRHVALGHRRLAV...,0


In [7]:
# One 0/1 indicator column per motif: feature<i> is 1 when patterns[i] matches
# somewhere in the sequence, else 0. str.contains() stops at the first hit
# instead of counting every match; na=False scores a missing sequence as 0,
# which is what the comparison `str.count(p) > 0` yields as well.
for i, p in enumerate(patterns):
    df['feature' + str(i)] = df['seq'].str.contains(p, regex=True, na=False).astype(int)
df

Unnamed: 0,seq,knot,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature10,feature11,feature12,feature13,feature14,feature15,feature16,feature17,feature18,feature19
49580,MNMLFLDPHDIGSPLPRADARVRHVLKVLKKGPGDELEAGTPDGML...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
161330,MFRSKYIFETKRTKFFRYFKNTLFVLTTFLLIYLLLGISFIRFSDD...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65505,MTRRYYCPSLPQFGGNVVLQGHEAQHAIRVMRVKPDDAITLFDGQG...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75843,MLSEKEKNKIRSLSPNRRIEILYGLLRSNEKELSDFEAKDFLKAQF...,1,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
21837,MIKVYVIGKPKTKFIKTGLEQYLKWTSKYDRVELITLPLSDDLNKI...,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,MADLIALDDPRDERLRDYTDLRDVQLRQSVERERGIYIAEGTKVIE...,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103694,MDLRIALYQPDIPGNTGTILRMAACLGFAVDLIEPAGFDVSDRSLK...,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
131932,METSKSSARQRADSIRPYRCKNLIAVLENPIDIRNIGTVIRNVNAL...,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
146867,MSYRRDLTTQEAVIARMTATMARRGPDAGGVWIDRHVALGHRRLAV...,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0


## Logistic regression

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
# Logistic-regression classifier using the liblinear solver and a fixed random_state.
model = LogisticRegression(
    random_state=0,
    solver='liblinear',
)

In [10]:
# Feature matrix: every column except the raw sequence and the label. Selecting
# by name instead of by position keeps `knot` out of the inputs even if the
# column order of df changes, and fails loudly if either column is missing.
x = df.drop(columns=['seq', 'knot'])
# Target vector: 1 = knotted, 0 = unknotted.
y = df['knot']

# NOTE: the model is fit on the complete data set; no rows are held out here.
model.fit(x, y);

In [11]:
# Confusion matrix of the fitted model, predicting on the same x it was fit on.
# Rows are the true classes, columns the predicted classes.
conf = pd.DataFrame(
    confusion_matrix(y, model.predict(x)),
    index=pd.Index(['unknotted', 'knotted'], name="actual"),
    columns=['no-knot predicted', 'knot predicted'],
)
# Plain attribute stored on the frame object; it does not change the rendered table.
conf.name = "Confusion matrix"

conf

Unnamed: 0_level_0,no-knot predicted,knot predicted
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
unknotted,102507,5639
knotted,35058,105240


In [12]:
# Disabled experiment: two extra numeric columns (count of 'G' characters and
# sequence length). Left commented out, so they are not part of df.
#df['nG'] = df.seq.str.count('G')
#df['length'] = df.seq.str.len()

In [13]:
# Accuracy = correctly classified samples (main diagonal of the confusion
# matrix) divided by all samples. `conf` was built from predictions on the data
# the model was fit on, so this is training-set accuracy, not a held-out estimate.
conf.to_numpy().trace() / conf.to_numpy().sum()

0.8361924618827583