<a href="https://colab.research.google.com/github/MahSrb/API/blob/main/TokenVec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pathlib
import pickle

import pandas as pd

# Export locations. BASE_DIR is the parent of the current working directory;
# everything the notebook writes goes under BASE_DIR / "exports".
BASE_DIR = pathlib.Path().resolve().parent
EXPORT_DIR = BASE_DIR / "exports"
EXPORT_DIR.mkdir(exist_ok=True, parents=True)

TRAINING_DATA_PATH = EXPORT_DIR / "training-data.pkl"
TOKENIZER_EXPORT_PATH = EXPORT_DIR / "tokenizer.json"
# Defined exactly once: a previous "metadata.pkl" assignment to this same name
# was overwritten two lines later and never took effect.
METADATA_EXPORT_PATH = EXPORT_DIR / "metadata.json"

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the conventional mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Turn the Google Drive "share" link into a direct-download URL: the file id
# is the second-to-last path segment of the share link.
share_link = 'https://drive.google.com/file/d/1TkdipCKlDRPJGzVcavRnMYdjWOK9ev31/view?usp=sharing'
file_id = share_link.split('/')[-2]
url = 'https://drive.google.com/uc?id=' + file_id

# Load the CSV straight from that URL and show a preview.
dataset = pd.read_csv(url)
print(dataset)

                                                Sentence  Label   LP  NSPA  \
0                      " or pg_sleep  (  __TIME__  )  --      1   33     6   
1       AND 1  =  utl_inaddr.get_host_address   (    ...      1  218    35   
2       select * from users where id  =  '1' or @ @1 ...      1   90    20   
3       select * from users where id  =  1 or 1#"  ( ...      1   85    18   
4       select name from syscolumns where id   =     ...      1  109    18   
...                                                  ...    ...  ...   ...   
30603              DELETE FROM door WHERE grow = 'small'      0   37     6   
30604                               DELETE FROM tomorrow      0   20     2   
30605                       SELECT wide ( s )  FROM west      0   28     6   
30606       SELECT * FROM  ( SELECT slide FROM breath )       0   44     9   
30607                           SELECT TOP 3 * FROM race      0   24     5   

           RSPA  NSPE      RSPE  NK  KWS       ROC  
0      0.1

In [None]:
# Pull the two columns used for training out as plain Python lists:
# "Label" holds the class id, "Sentence" the raw query text.
label_column = dataset["Label"]
sentence_column = dataset["Sentence"]
labels = label_column.tolist()
texts = sentence_column.tolist()

In [None]:
# Spot-check one record as a (label, sentence) pair.
sample_row = 120
(labels[sample_row], texts[sample_row])

(1, '\x18 or 3  =  3 --')

In [None]:
# Class id -> human-readable class name.
label_legend = {
    1: "injection",
    0: "notinjection",
}

# Reverse lookup: class name (as a string) -> class id.
label_legend_invertor = dict(
    (str(class_name), class_id) for class_id, class_name in label_legend.items()
)
label_legend_invertor

{'injection': 1, 'notinjection': 0}

In [None]:
# Translate every numeric label into its class name via label_legend.
labels_as_string = list(map(label_legend.__getitem__, labels))

# Round-trip check: map one class name back to its numeric id.
label_legend_invertor[labels_as_string[30000]]

0

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer   

In [None]:
# Vocabulary cap handed to Tokenizer(num_words=...) and stored in the exports.
MAX_NUM_WORDS =500

In [None]:
# Build the vocabulary from every sentence, then encode each sentence as a
# list of integer word ids.
# NOTE(review): Tokenizer is created with its default `filters`/`lower`
# settings; presumably that strips much of the SQL punctuation (quotes, `=`,
# `--`) before tokenising -- confirm this is intended for injection detection.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [None]:
# Keep a handle on the fitted tokenizer's word -> id mapping for inspection.
word_index = tokenizer.word_index

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fixed length every encoded sentence is padded to (passed as `maxlen` to
# pad_sequences) and recorded in the exported metadata.
MAX_SEQ_LENGTH = 300

In [None]:
# Bring every encoded sentence to MAX_SEQ_LENGTH entries; the displayed result
# shows the zero padding at the start of each row.
x = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
x

array([[  0,   0,   0, ..., 118,  33, 363],
       [  0,   0,   0, ...,   5, 364, 365],
       [  0,   0,   0, ...,   3, 107,  15],
       ...,
       [  0,   0,   0, ...,   1,  29,   2],
       [  0,   0,   0, ...,   2,   1,   2],
       [  0,   0,   0, ...,  38,  20,   2]], dtype=int32)

In [None]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [None]:
# NumPy view of the class names, one string per dataset row.
labels_as_str_array = np.array(labels_as_string)
labels_as_str_array

array(['injection', 'injection', 'injection', ..., 'notinjection',
       'notinjection', 'notinjection'], dtype='<U12')

In [None]:
# One-hot encode the numeric labels: label 1 -> [0., 1.], label 0 -> [1., 0.].
y = to_categorical(dataset["Label"])
y

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Bundle the train/test split with the preprocessing settings and both label
# maps so the whole thing can be pickled as a single object.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_word=MAX_NUM_WORDS,
    max_seq_length=MAX_SEQ_LENGTH,
    label_legend=label_legend,
    label_legend_inverted=label_legend_invertor,
)

# JSON form of the fitted tokenizer. Writing it to TOKENIZER_EXPORT_PATH is
# currently disabled (the line below is commented out).
tokenizer_json = tokenizer.to_json()
#TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

In [None]:
# METADATA_EXPORT_PATH points at "metadata.json", which a later cell writes as
# JSON text. Dumping a binary pickle into that same file put the wrong format
# behind a .json name and was then overwritten anyway, so store the pickle
# under the sibling ".pkl" name instead.
metadata_pickle_path = METADATA_EXPORT_PATH.with_suffix(".pkl")
with open(metadata_pickle_path, 'wb') as f:
  pickle.dump(training_data, f)

In [None]:
# Persist the training bundle as a binary pickle at TRAINING_DATA_PATH.
with TRAINING_DATA_PATH.open('wb') as pickle_file:
    pickle.dump(training_data, pickle_file)

In [None]:
# Read the bundle back from disk to confirm the export round-trips.
# `data` starts out as an empty dict and is replaced by the unpickled object.
data = {}
with TRAINING_DATA_PATH.open('rb') as pickle_file:
    data = pickle.load(pickle_file)

In [None]:
# Display the reloaded bundle (arrays, settings and label maps) for inspection.
data

{'X_test': array([[  0,   0,   0, ..., 282, 319,  19],
        [  0,   0,   0, ...,   0,   0,   0],
        [  0,   0,   0, ...,   0,   0,   0],
        ...,
        [  0,   0,   0, ..., 100,   5, 127],
        [  0,   0,   0, ..., 233, 233,   5],
        [  0,   0,   0, ...,   1,   2,   4]], dtype=int32),
 'X_train': array([[  0,   0,   0, ...,   2, 117,  36],
        [  0,   0,   0, ..., 125, 120, 104],
        [  0,   0,   0, ...,   7, 101,   2],
        ...,
        [  0,   0,   0, ...,   6,  74,  28],
        [  0,   0,   0, ...,   0,   0,   0],
        [  0,   0,   0, ...,   1,  78, 453]], dtype=int32),
 'label_legend': {0: 'notinjection', 1: 'injection'},
 'label_legend_inverted': {'injection': 1, 'notinjection': 0},
 'max_seq_length': 300,
 'max_word': 500,
 'y_test': array([[0., 1.],
        [1., 0.],
        [1., 0.],
        ...,
        [1., 0.],
        [0., 1.],
        [1., 0.]], dtype=float32),
 'y_train': array([[1., 0.],
        [1., 0.],
        [1., 0.],
        ...

In [None]:
import json

# Small JSON description of the preprocessing setup: both label maps plus the
# sequence-length and vocabulary-size settings.
metadata = dict(
    label_legend_inverted=label_legend_invertor,
    legend=label_legend,
    max_sequence=MAX_SEQ_LENGTH,
    max_words=MAX_NUM_WORDS,
)
metadata_json = json.dumps(metadata, indent=4)

# write_text returns the number of characters written, which the cell displays.
METADATA_EXPORT_PATH.write_text(metadata_json)

214

In [None]:
from google.colab import files

# Download the pickle from wherever this notebook wrote it. The previous
# hard-coded '/exports/training-data.pkl' only matched TRAINING_DATA_PATH when
# the working directory happened to sit directly under the filesystem root.
files.download(str(TRAINING_DATA_PATH))

In [None]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# only point these paths at files you produced or otherwise trust.

# Trained model: placeholder first, then the unpickled object, then its summary.
model = {}
with open('/content/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
model.summary()

# Feature (column) names used when building the model's input frame.
features = {}
with open('/content/features.pkl', 'rb') as features_file:
    features = pickle.load(features_file)
print(features)

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_45 (Dense)            (None, 8)                 72        
                                                                 
 dense_46 (Dense)            (None, 100)               900       
                                                                 
 dense_47 (Dense)            (None, 100)               10100     
                                                                 
 dense_48 (Dense)            (None, 100)               10100     
                                                                 
 dense_49 (Dense)            (None, 1)                 101       
                                                                 
Total params: 21,273
Trainable params: 21,273
Non-trainable params: 0
_________________________________________________________________
['LP', 'NSPA', 'RSPA', 'NSPE', 'RSPE', 'NK', 'KWS'

In [None]:
def predict(LP, NSPA, RSPA, NSPE, RSPE, NK, KWS, ROC):
  """Classify one feature row with the loaded model and print the result.

  The eight arguments are placed, in the order given, under the column names
  held in the notebook-level `features` sequence (assumed to list the same
  eight features in the same order -- verify against features.pkl).

  Prints the one-row input frame, then every key of `label_legend_invertor`
  whose value equals the rounded model output. Returns None.
  """
  # Local name avoids shadowing the builtin `list`.
  values = [LP, NSPA, RSPA, NSPE, RSPE, NK, KWS, ROC]
  # Build the single-row frame in one step instead of appending to an empty
  # frame; float dtype keeps every column numeric for the model.
  dataframe = pd.DataFrame([values], columns=features, dtype=float)
  print(dataframe)
  X_output = model.predict(dataframe)
  # Reduce the model output to one plain int so the comparison below is a
  # scalar test rather than an element-wise array comparison.
  top_y_index = int(np.round(np.asarray(X_output).ravel()[0]))
  for label_name, label_id in label_legend_invertor.items():
    if label_id == top_y_index:
      print(label_name)

In [None]:
# Example: score one hand-entered feature row.
predict(
    LP=33,
    NSPA=6,
    RSPA=0.181818,
    NSPE=3,
    RSPE=0.090909,
    NK=1,
    KWS=1,
    ROC=0.727273,
)

     LP  NSPA      RSPA  NSPE      RSPE   NK  KWS       ROC
0  33.0   6.0  0.181818   3.0  0.090909  1.0  1.0  0.727273
injection
