In [1]:
import pandas as pd
import numpy as np
import random
import re
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

In [2]:
# Load the labelled URL dataset; later cells read its "url" and "Class" columns.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# a row index that was written into the CSV — confirm, then consider index_col=0.
url_df = pd.read_csv("dataset.csv")
url_df.head()

Unnamed: 0,url,Class
0,http://m.dappsconnectify.com/sync,bad
1,https://www.esepso.caeord.com/,bad
2,https://ciei.in/HOST/tnts/jpMixedFixed?uid=aaa...,bad
3,https://ciei.in/HOST/tnts/jpMixedFixed/?uid=aa...,bad
4,https://www.epees.caeord.com/,bad


In [3]:
# Pick the URL stored under row label 5 as a worked example for later cells.
test_url = url_df.loc[5, "url"]

In [4]:
# Show the example URL selected in the previous cell.
print(test_url)

https://mhora.com/login.html


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train_df, test_df = train_test_split(
    url_df,
    test_size=0.2,
    random_state=42,
)

# Target column ("Class") for each split.
labels, test_labels = train_df["Class"], test_df["Class"]

In [6]:
# Report how many rows landed in each split.
print("Training samples", train_df.shape[0])
print("Testing samples", test_df.shape[0])

Training samples 8760
Testing samples 2191


In [7]:
def tokenizer(url):
    """Split a URL string into a flat list of tokens.

    The URL is first split on "/" and "-". Every resulting piece that
    contains a "." is additionally split on ".", the first "com" and the
    first "www" among those sub-pieces are dropped, and the remaining
    sub-pieces are appended after the original pieces.

    The dotted pieces themselves are kept, and empty strings produced by
    consecutive separators (e.g. the "//" after the scheme) are kept too.

    Example: "https://mhora.com/login.html" ->
    ['https:', '', 'mhora.com', 'login.html', 'mhora', 'login', 'html']
    """
    base_parts = re.split("[/-]", url)
    dotted_extras = []
    for part in base_parts:
        if "." not in part:
            continue
        pieces = part.split(".")
        # list.remove drops only the first occurrence of each noise token.
        for noise in ("com", "www"):
            if noise in pieces:
                pieces.remove(noise)
        dotted_extras.extend(pieces)
    return base_parts + dotted_extras
print("\n### Tokenizer defined ###\n")


### Tokenizer defined ###



In [8]:
# Sanity-check the tokenizer on the example URL: print the raw URL, then its tokens.
print(test_url)

tokenized_url = tokenizer(test_url)
print(tokenized_url)

https://mhora.com/login.html
['https:', '', 'mhora.com', 'login.html', 'mhora', 'login', 'html']


In [9]:
# Build two feature matrices from the training URLs, both using the custom
# URL tokenizer defined above. The vectorizers are fitted on the training
# split only; the test split is transformed with them in a later cell.

# Token-count features.
cVec = CountVectorizer(tokenizer=tokenizer)
count_X = cVec.fit_transform(train_df["url"])

# TF-IDF features.
tVec = TfidfVectorizer(tokenizer=tokenizer)
tfidf_X = tVec.fit_transform(train_df["url"])

print("Complete")

Complete


In [10]:
# Illustration only: fit separate vectorizers on the single example URL and
# print the resulting sparse rows. These do not touch cVec / tVec.
exvec = CountVectorizer(tokenizer=tokenizer)
ext = exvec.fit_transform([test_url])
print(ext)

# Visual separator between the two printed matrices.
print()
print("=" * 50)
print()

# Note: `ext` is reassigned here, so the count matrix above is no longer reachable by name.
extec = TfidfVectorizer(tokenizer=tokenizer)
ext = extec.fit_transform([test_url])

print(ext)

  (0, 2)	1
  (0, 0)	1
  (0, 6)	1
  (0, 4)	1
  (0, 5)	1
  (0, 3)	1
  (0, 1)	1


  (0, 1)	0.3779644730092272
  (0, 3)	0.3779644730092272
  (0, 5)	0.3779644730092272
  (0, 4)	0.3779644730092272
  (0, 6)	0.3779644730092272
  (0, 0)	0.3779644730092272
  (0, 2)	0.3779644730092272


In [11]:
# Encode the held-out URLs with the vectorizers already fitted on the training
# split (transform only, no refit).
test_count_x = cVec.transform(test_df["url"])

test_tfidf_x = tVec.transform(test_df["url"])

In [12]:
def generate_report(cmatrix, score, creport, class_names=("bad", "good")):
    """Print a classification report and plot the confusion matrix as a heatmap.

    Parameters
    ----------
    cmatrix : numpy.ndarray of int
        Confusion matrix. It is transposed before plotting, so the rows of
        the input end up on the x axis ("Actual Label") and the columns on
        the y axis ("Predicted Label"). Values must be integers because the
        cell annotations use the "d" format.
    score : float
        Accuracy value shown in the figure title with four decimals.
    creport : str
        Text report printed above the figure.
    class_names : sequence of str, optional
        Tick labels used on both axes, in the same order as the matrix
        rows/columns. Defaults to ("bad", "good").

    Returns
    -------
    None
        The function prints the report and shows the figure.
    """
    # Put predictions on rows and actual labels on columns to match the axis titles below.
    cmatrix = cmatrix.T
    tick_labels = list(class_names)

    plt.figure(figsize=(5, 5))
    sns.heatmap(cmatrix,
                annot=True,
                fmt="d",
                linewidths=.5,
                # Was misspelled "sqaure"; unknown keywords are forwarded to
                # matplotlib's pcolormesh, which rejects them.
                square=True,
                cmap="Blues",
                annot_kws={"size": 16},
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.xticks(rotation="horizontal", fontsize=16)
    plt.yticks(rotation="horizontal", fontsize=16)
    plt.xlabel("Actual Label", size=20)
    plt.ylabel("Predicted Label", size=20)

    title = "Accuracy Score: {0: .4f}".format(score)
    plt.title(title, size=20)

    print(creport)
    plt.show()

In [14]:
# Fit a logistic-regression classifier on the token-count features.
lgs = LogisticRegression()
lgs.fit(count_X, labels)

# Evaluate on the held-out split: accuracy, predicted labels, confusion
# matrix and the per-class text report.
score = lgs.score(test_count_x, test_labels)
predictions = lgs.predict(test_count_x)
cmatrix = confusion_matrix(test_labels, predictions)
creport = classification_report(test_labels, predictions)

# Print the report so it renders as a table; a bare `creport` as the last
# expression shows the string repr with literal "\n" escapes instead.
print(creport)

'              precision    recall  f1-score   support\n\n         bad       1.00      1.00      1.00       588\n        good       1.00      1.00      1.00      1603\n\n    accuracy                           1.00      2191\n   macro avg       1.00      1.00      1.00      2191\nweighted avg       1.00      1.00      1.00      2191\n'

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and repeats the
# expression already stored in `score` — candidate for removal.
lgs.score(test_count_x, test_labels)

In [17]:
# Accuracy of the count-feature logistic regression on the held-out split.
score

0.9995435874030123

In [18]:
# Per-class precision / recall / F1 for the held-out predictions, printed as a table.
print(classification_report(test_labels, predictions))

              precision    recall  f1-score   support

         bad       1.00      1.00      1.00       588
        good       1.00      1.00      1.00      1603

    accuracy                           1.00      2191
   macro avg       1.00      1.00      1.00      2191
weighted avg       1.00      1.00      1.00      2191



In [21]:
# Peek at the predicted labels for the held-out URLs (NumPy truncates the display).
predictions

array(['good', 'bad', 'good', ..., 'good', 'bad', 'good'], dtype=object)