<a href="https://colab.research.google.com/github/IlgarNasirov/MaliciousScriptDetectorWithML/blob/main/MaliciousScriptDetectorWithML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [103]:
import pandas as pd
data = pd.read_json('maliciousScriptDataset.json')
data.columns = ['Value', 'Is malicious']
data

Unnamed: 0,Value,Is malicious
0,Hello world,0
1,<script>alert('1')</script>,1
2,Welcome to our website,0
3,<script>document.location='http://attacker.com...,1
4,Thank you for your submission,0
...,...,...
899,"<script>setTimeout('alert(""XSS"")',0)</script>",1
900,Bugatti,0
901,<script>['alert'].forEach(alert=>{window[alert...,1
902,Rolls-Royce,0


In [104]:
data.drop_duplicates(inplace = True)
data.reset_index(inplace = True, drop = True)
data

Unnamed: 0,Value,Is malicious
0,Hello world,0
1,<script>alert('1')</script>,1
2,Welcome to our website,0
3,<script>document.location='http://attacker.com...,1
4,Thank you for your submission,0
...,...,...
890,"<script>setTimeout('alert(""XSS"")',0)</script>",1
891,Bugatti,0
892,<script>['alert'].forEach(alert=>{window[alert...,1
893,Rolls-Royce,0


In [105]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

x = data['Value']
y = data['Is malicious']
test_data = []

In [106]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 100)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer = 'char', ngram_range = (2, 6))),
    ('clf', LogisticRegression())
])

pipeline.fit(x_train, y_train)
y_prediction = pipeline.predict(x_test)

print(accuracy_score(y_test, y_prediction) * 100)
print(classification_report(y_test, y_prediction))

for row in test_data:
  result = pipeline.predict([row])[0]
  if result:
    print(row + ' - is malicious!')
  else:
    print(row + ' - is not malicious!')

95.53072625698324
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        97
           1       1.00      0.90      0.95        82

    accuracy                           0.96       179
   macro avg       0.96      0.95      0.95       179
weighted avg       0.96      0.96      0.96       179



In [107]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

def custom_tokenizer(text):
  return re.findall(
        r"(?ix)[a-z_]\w+|\d+|<\s*script[^>]*>.*?<\s*/\s*script\s*>|on\w+\s*=\s*['\"]?.*?['\"]?|['\"][\s]*or[\s]+['\"]?1['\"]?=['\"]?1|--|/\*|\*/|#|\b(?:select|union|insert|delete|update|drop|from|where|script|alert|prompt|confirm|src|eval|iframe|cookie)\b|[<>{}()[\];'\"=.,:+\-*/\\!?&|%]|%[0-9a-fA-F]{2}",
        text
  )

count_vectorizer = CountVectorizer(tokenizer = custom_tokenizer, token_pattern = None)
x_vectorizer = count_vectorizer.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_vectorizer, y, test_size = 0.2, random_state = 100)

random_forest_classifier = RandomForestClassifier(n_estimators = 100)
random_forest_classifier.fit(x_train, y_train)

y_prediction = random_forest_classifier.predict(x_test)

print(accuracy_score(y_test, y_prediction) * 100)
print(classification_report(y_test, y_prediction))

for row in test_data:
  test_vectorizer = count_vectorizer.transform([row])
  result = random_forest_classifier.predict(test_vectorizer)[0]
  if result:
    print(row + ' - is malicious!')
  else:
    print(row + ' - is not malicious!')

87.15083798882681
              precision    recall  f1-score   support

           0       0.93      0.82      0.87        97
           1       0.82      0.93      0.87        82

    accuracy                           0.87       179
   macro avg       0.87      0.88      0.87       179
weighted avg       0.88      0.87      0.87       179



In [108]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 100)

tokenizer = Tokenizer(char_level = True)
tokenizer.fit_on_texts(x_train)

x_train_sequences = tokenizer.texts_to_sequences(x_train)
x_test_sequences = tokenizer.texts_to_sequences(x_test)

x_train_pad_sequences = pad_sequences(x_train_sequences, maxlen = 100, padding = 'post')
x_test_pad_sequences = pad_sequences(x_test_sequences, maxlen = 100, padding = 'post')

sequential = Sequential()
sequential.add(Embedding(input_dim = len(tokenizer.word_index) + 1, output_dim = 32))
sequential.add(LSTM(64))
sequential.add(Dense(1, activation = 'sigmoid'))

sequential.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

early_stopping = EarlyStopping(patience = 2, restore_best_weights = True)

sequential.fit(x_train_pad_sequences, y_train, epochs = 5, validation_split = 0.2, callbacks = [early_stopping])

y_prediction = sequential.predict(x_test_pad_sequences)
y_prediction = (y_prediction > 0.5).astype('int32')

print(accuracy_score(y_test, y_prediction) * 100)
print(classification_report(y_test, y_prediction))

for row in test_data:
  test_sequence = tokenizer.texts_to_sequences([row])
  test_pad_sequence = pad_sequences(test_sequence, maxlen = 100, padding = 'post')
  result = sequential.predict(test_pad_sequence)[0]
  if result > 0.5:
    print(row + ' - is malicious!')
  else:
    print(row + ' - is not malicious!')

Epoch 1/5
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 82ms/step - accuracy: 0.5742 - loss: 0.6859 - val_accuracy: 0.7153 - val_loss: 0.5907
Epoch 2/5
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 82ms/step - accuracy: 0.7398 - loss: 0.5255 - val_accuracy: 0.7917 - val_loss: 0.3914
Epoch 3/5
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 95ms/step - accuracy: 0.7794 - loss: 0.4324 - val_accuracy: 0.9236 - val_loss: 0.2670
Epoch 4/5
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - accuracy: 0.9497 - loss: 0.2007 - val_accuracy: 0.9653 - val_loss: 0.1549
Epoch 5/5
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.9807 - loss: 0.0872 - val_accuracy: 0.9653 - val_loss: 0.1550
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
97.20670391061452
              precision    recall  f1-score   support

           0       0.96      0.99      0.97        97
 

In [109]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import joblib

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 100)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer = 'char', ngram_range = (2, 6))),
    ('clf', SVC(kernel = 'linear', C = 1))
])

pipeline.fit(x_train, y_train)
y_prediction = pipeline.predict(x_test)

print(accuracy_score(y_test, y_prediction) * 100)
print(classification_report(y_test, y_prediction))

for row in test_data:
  result = pipeline.predict([row])[0]
  if result:
    print(row + ' - is malicious!')
  else:
    print(row + ' - is not malicious!')

joblib.dump(pipeline, 'malicious_script_model.pkl')

96.64804469273743
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        97
           1       1.00      0.93      0.96        82

    accuracy                           0.97       179
   macro avg       0.97      0.96      0.97       179
weighted avg       0.97      0.97      0.97       179



['malicious_script_model.pkl']

In [110]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 100)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer = 'char', ngram_range = (2, 6))),
    ('clf', xgboost.XGBClassifier())
])

pipeline.fit(x_train, y_train)
y_prediction = pipeline.predict(x_test)

print(accuracy_score(y_test, y_prediction) * 100)
print(classification_report(y_test, y_prediction))

for row in test_data:
  result = pipeline.predict([row])[0]
  if result:
    print(row + ' - is malicious!')
  else:
    print(row + ' - is not malicious!')

96.08938547486034
              precision    recall  f1-score   support

           0       0.93      1.00      0.97        97
           1       1.00      0.91      0.96        82

    accuracy                           0.96       179
   macro avg       0.97      0.96      0.96       179
weighted avg       0.96      0.96      0.96       179



In [111]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 100)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer = 'char', ngram_range = (2, 6))),
    ('clf', MultinomialNB())
])

pipeline.fit(x_train, y_train)
y_prediction = pipeline.predict(x_test)

print(accuracy_score(y_test, y_prediction) * 100)
print(classification_report(y_test, y_prediction))

for row in test_data:
  result = pipeline.predict([row])[0]
  print(result)

94.41340782122904
              precision    recall  f1-score   support

           0       1.00      0.90      0.95        97
           1       0.89      1.00      0.94        82

    accuracy                           0.94       179
   macro avg       0.95      0.95      0.94       179
weighted avg       0.95      0.94      0.94       179

