**Import the Necessary Libraries**

In [2]:
!pip install imblearn

Collecting imblearn
  Using cached imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
     -------------------------------------- 226.0/226.0 kB 1.5 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     -------------------------------------- 298.0/298.0 kB 1.7 MB/s eta 0:00:00
Installing collected packages: joblib, imbalanced-learn, imblearn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.10.1 imblearn-0.0 joblib-1.2.0


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle
from sklearn.preprocessing import StandardScaler

**Load the Dataset**

In [19]:
df=pd.read_csv('urldata.csv')
df.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


**Exploring the Data**

In [None]:
df=pd.read_csv('urldata.csv')
df.head()


In [12]:
print('Shape of the data',df.shape)

Shape of the data (450176, 3)


In [13]:
print('Summary of the data:',df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450176 entries, 0 to 450175
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     450176 non-null  object
 1   label   450176 non-null  object
 2   result  450176 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.3+ MB
Summary of the data: None


In [14]:
print('Null Values in the data:\n',df.isnull().sum())

Null Values in the data:
 url       0
label     0
result    0
dtype: int64


**Removing the label column**

In [20]:
df=df.drop('label',axis=1)
df.head()

Unnamed: 0,url,result
0,https://www.google.com,0
1,https://www.youtube.com,0
2,https://www.facebook.com,0
3,https://www.baidu.com,0
4,https://www.wikipedia.org,0


In [21]:
df['result'].value_counts()

0    345738
1    104438
Name: result, dtype: int64

*Data is Imbalanced*

**Tokenizing the URL**

In [23]:
def tokenizer(i):
    x=re.split('[/-]',i)
    if '' in x:
        x.remove('')
    for j in x:
        if j.find('.')>=0:
            y=j.split('.')
            if 'com' in y:
                y.remove('com')
            x=x+y
    return x

**Using Vectorizer**

In [None]:
cf=CountVectorizer(tokenizer=tokenizer)
tf=TfidfVectorizer(tokenizer=tokenizer)
with open('vector','wb') as f:
    pickle.dump(tf,f)

In [24]:
cf=CountVectorizer(tokenizer=tokenizer)

In [25]:
tf=TfidfVectorizer(tokenizer=tokenizer)

In [26]:
with open('vector','wb') as f:
    pickle.dump(tf,f)

In [27]:
x1=cf.fit_transform(df['url'])

In [28]:
x2=tf.fit_transform(df['url'])

In [30]:
y=df['result']
y

0         0
1         0
2         0
3         0
4         0
         ..
450171    1
450172    1
450173    1
450174    1
450175    1
Name: result, Length: 450176, dtype: int64

**Since our data is imbalanced we can try oversampling**

In [31]:
sm=SMOTE(random_state=42)

In [32]:
x1_new,y1_new=sm.fit_resample(x1,y)

In [33]:
x2_new,y2_new=sm.fit_resample(x2,y)

In [34]:
cv=ShuffleSplit(n_splits=10,test_size=0.3,random_state=42)

## Trying different ML Algorithms to check for accuracy

 **Using Counter Vectorizer without oversampling**

In [44]:
print(cross_val_score(LogisticRegression(max_iter=500),x1,y,cv=cv,scoring='accuracy'))

[0.99755651 0.99754911 0.99763796 0.99783048 0.99763796 0.99780086
 0.99778605 0.99771201 0.99760835 0.99787491]


In [36]:
print(cross_val_score(MultinomialNB(),x1,y,cv=cv,scoring='accuracy'))

[0.99677164 0.99662355 0.99649767 0.99656431 0.99678645 0.99659393
 0.99649027 0.99658653 0.99653469 0.99676423]


**Using Counter Vectorizer with oversampling**

In [37]:
print(cross_val_score(LogisticRegression(max_iter=500),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.99690999 0.99703533 0.99685697 0.99689071 0.99678948 0.99726672
 0.9968184  0.99701605 0.99683286 0.99699194]


In [38]:
print(cross_val_score(MultinomialNB(),x1_new,y1_new,cv=cv,scoring='accuracy'))

[0.99017561 0.9905227  0.98994423 0.99026721 0.99004546 0.99059019
 0.99055162 0.99034434 0.99017561 0.99037326]


**Using Tfidf Vectorizer without oversampling**

In [39]:
print(cross_val_score(MultinomialNB(),x2,y,cv=cv,scoring='accuracy'))

[0.98836753 0.98850822 0.98808616 0.98856745 0.98822684 0.9881602
 0.98852302 0.98853043 0.98827127 0.98822684]


In [40]:
print(cross_val_score(LogisticRegression(max_iter=500),x2,y,cv=cv,scoring='accuracy'))

[0.99446884 0.99428373 0.99449846 0.99443182 0.994728   0.99451327
 0.99462433 0.99437258 0.99422449 0.99485387]


**Using Tfidf Vectorizer with oversampling**

In [41]:
print(cross_val_score(MultinomialNB(),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.99091799 0.99137594 0.99095173 0.99072998 0.99100476 0.99098548
 0.99135666 0.99087942 0.99081675 0.99105296]


In [42]:
print(cross_val_score(LogisticRegression(max_iter=500),x2_new,y2_new,cv=cv,scoring='accuracy'))

[0.99733903 0.99736313 0.9972378  0.99730046 0.99719923 0.99744026
 0.99738241 0.99735349 0.9971462  0.99727154]


**Creating and Saving the model**

In [88]:
nb=MultinomialNB()

In [85]:
x_train,x_test,y_train,y_test=train_test_split(x2_new,y2_new,test_size=0.3,random_state=42)

In [89]:
nb.fit(x_train,y_train)

MultinomialNB()

In [75]:
prediction=nb.predict(x_test)

In [76]:
nb.score(x_train,y_train)

0.9971799443426378

In [77]:
confusion_matrix(prediction,y_test)

array([[101922,    162],
       [  1722, 103637]], dtype=int64)

In [78]:
print(classification_report(prediction,y_test))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    102084
           1       1.00      0.98      0.99    105359

    accuracy                           0.99    207443
   macro avg       0.99      0.99      0.99    207443
weighted avg       0.99      0.99      0.99    207443



In [79]:
with open('url','wb') as f:
    pickle.dump(nb,f)

In [53]:
with open('vector','rb') as f:
    vectorizer=pickle.load(f)

In [80]:
def predict(a):
    # Initialize a new vectorizer object
    vectorizer = TfidfVectorizer()

    # Fit the vectorizer object on some training data
    vectorizer.fit(x_train)

    # Transform the new data using the fitted vectorizer object
    v = vectorizer.transform([a])

    # Make a prediction using the trained model
    p = model.predict(v)

    return p


In [81]:
predict('wikipedia.com')

AttributeError: lower not found