In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

import os

In [2]:
print(os.listdir("../data"))

['urldata.csv']


In [3]:
urldata = pd.read_csv("../data/urldata.csv")

In [4]:
urldata.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


In [5]:
#removing unnamed column
urldata = urldata.drop('Unnamed: 0',axis=1)

In [6]:
urldata.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [7]:
urldata.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   label   450176 non-null  object
 2   result  450176 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.3+ MB


In [8]:
#checking for missing values
urldata.isnull().sum()

url       0
label     0
result    0
dtype: int64

In [9]:
from urllib.parse import urlparse

In [10]:
from tld import get_tld

In [11]:
#length of url
urldata['url_length'] = urldata['url'].apply(lambda i: len(str(i)))

In [12]:
urldata['hostname_length'] = urldata['url'].apply(lambda i: len(urlparse(i).netloc))

In [13]:
#Length of Top Level Domain
urldata['tld'] = urldata['url'].apply(lambda i: get_tld(i,fail_silently=True))
def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1

urldata['tld_length'] = urldata['tld'].apply(lambda i: tld_length(i))

In [14]:
urldata.head()

Unnamed: 0,url,label,result,url_length,hostname_length,tld,tld_length
0,https://www.google.com,benign,0,22,14,com,3
1,https://www.youtube.com,benign,0,23,15,com,3
2,https://www.facebook.com,benign,0,24,16,com,3
3,https://www.baidu.com,benign,0,21,13,com,3
4,https://www.wikipedia.org,benign,0,25,17,org,3


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [16]:
x = urldata[['url_length', 'hostname_length', 'tld_length']]

#Target Variable
y = urldata['result']

In [17]:
x.shape

(450176, 3)

In [18]:
y.shape

(450176,)

In [19]:
#Splitting the data into Training and Testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.3, random_state=42)

In [21]:
#Logistic Regression
log_model=LogisticRegression()
log_model.fit(x_train,y_train)

log_predictions = log_model.predict(x_test)
accuracy_score(y_test,log_predictions)

0.7720516368159835

In [22]:
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

rfc_predictions = rfc.predict(x_test)
accuracy_score(y_test, rfc_predictions)

0.8494846473134385