In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pickle
from sklearn.pipeline import make_pipeline

In [2]:
data=pd.read_csv("../artifacts/dataset.csv")

In [3]:
data.head()

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [4]:
data.tail()

Unnamed: 0,URL,Label
549341,23.227.196.215/,bad
549342,apple-checker.org/,bad
549343,apple-iclods.org/,bad
549344,apple-uptoday.org/,bad
549345,apple-search.info,bad


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [6]:
data.isnull().sum()

URL      0
Label    0
dtype: int64

In [7]:
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
data.URL[0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [8]:
# this will be pull letter which matches to expression
tokenizer.tokenize(data.URL[0]) 

['nobell',
 'it',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'login',
 'SkyPe',
 'com',
 'en',
 'cgi',
 'bin',
 'verification',
 'login',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'index',
 'php',
 'cmd',
 'profile',
 'ach',
 'outdated',
 'page',
 'tmpl',
 'p',
 'gen',
 'failed',
 'to',
 'load',
 'nav',
 'login',
 'access']

In [9]:
data['text_tokenized'] = data.URL.map(lambda t: tokenizer.tokenize(t)) # doing with all rows

In [10]:
data.sample(5)

Unnamed: 0,URL,Label,text_tokenized
281706,apartments.com/Pennsylvania,good,"[apartments, com, Pennsylvania]"
445436,theinnkeeper.com/bnb/search?region=Southern+In...,good,"[theinnkeeper, com, bnb, search, region, South..."
325615,f1planet.co.uk/f1-tracks/british-grand-prix/,good,"[f, planet, co, uk, f, tracks, british, grand,..."
256030,voicesofthegulag.blogspot.com/,good,"[voicesofthegulag, blogspot, com]"
464436,wickedlocal.com/medfield/news,good,"[wickedlocal, com, medfield, news]"


In [11]:
stemmer = SnowballStemmer("english") # choose language
data['text_stemmed'] =data['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])
data.sample(5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed
313010,dcnonl.com/nw/20296,good,"[dcnonl, com, nw]","[dcnonl, com, nw]"
188956,footbal.wikia.com/wiki/Kansas_City_Chiefs,good,"[footbal, wikia, com, wiki, Kansas, City, Chiefs]","[footbal, wikia, com, wiki, kansa, citi, chief]"
59428,www.ntu.edu.sg/sce/,good,"[www, ntu, edu, sg, sce]","[www, ntu, edu, sg, sce]"
257446,web.me.com/hellenicgenealogy/Site/Interview_-_...,good,"[web, me, com, hellenicgenealogy, Site, Interv...","[web, me, com, hellenicgenealog, site, intervi..."
197396,hollywoodscandalsgossip.blogspot.com/,good,"[hollywoodscandalsgossip, blogspot, com]","[hollywoodscandalsgossip, blogspot, com]"


In [12]:
data['text_sent'] = data['text_stemmed'].map(lambda l: ' '.join(l))
data.sample(5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
547598,185.163.127.231/bins/mirai.mpsl,bad,"[bins, mirai, mpsl]","[bin, mirai, mpsl]",bin mirai mpsl
208800,local.yahoo.com/info-16324013-savoy-bar-grill-...,good,"[local, yahoo, com, info, savoy, bar, grill, s...","[local, yahoo, com, info, savoy, bar, grill, s...",local yahoo com info savoy bar grill saginaw
520105,mxp1135.com,bad,"[mxp, com]","[mxp, com]",mxp com
470320,youthink.ca/yt/fun/hey-it%E2%80%99s-may-may-da...,good,"[youthink, ca, yt, fun, hey, it, E, s, may, ma...","[youthink, ca, yt, fun, hey, it, e, s, may, ma...",youthink ca yt fun hey it e s may may day and ...
331707,facebook.com/people/Glenn-Burke/1108032055,good,"[facebook, com, people, Glenn, Burke]","[facebook, com, peopl, glenn, burk]",facebook com peopl glenn burk


In [13]:
bad= data[data.Label == 'bad']
good= data[data.Label == 'good']
bad.head()

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad,"[nobell, it, ffb, d, dca, cce, f, login, SkyPe...","[nobel, it, ffb, d, dca, cce, f, login, skype,...",nobel it ffb d dca cce f login skype com en cg...
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad,"[www, dghjdgf, com, paypal, co, uk, cycgi, bin...","[www, dghjdgf, com, paypal, co, uk, cycgi, bin...",www dghjdgf com paypal co uk cycgi bin webscrc...
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad,"[serviciosbys, com, paypal, cgi, bin, get, int...","[serviciosbi, com, paypal, cgi, bin, get, into...",serviciosbi com paypal cgi bin get into herf s...
3,mail.printakid.com/www.online.americanexpress....,bad,"[mail, printakid, com, www, online, americanex...","[mail, printakid, com, www, onlin, americanexp...",mail printakid com www onlin americanexpress c...
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad,"[thewhiskeydregs, com, wp, content, themes, wi...","[thewhiskeydreg, com, wp, content, theme, wide...",thewhiskeydreg com wp content theme widescreen...


In [14]:
good.head()

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_sent
18231,esxcc.com/js/index.htm?us.battle.net/noghn/en/...,good,"[esxcc, com, js, index, htm, us, battle, net, ...","[esxcc, com, js, index, htm, us, battl, net, n...",esxcc com js index htm us battl net noghn en r...
18232,wwweira¯&nvinip¿ncH¯wVö%ÆåyDaHðû/ÏyEùuË\nÓ6...,good,"[www, eira, nvinip, ncH, wV, yDaH, yE, u, rT, ...","[www, eira, nvinip, nch, wv, ydah, ye, u, rt, ...",www eira nvinip nch wv ydah ye u rt u g m i xz...
18233,'www.institutocgr.coo/web/media/syqvem/dk-óij...,good,"[www, institutocgr, coo, web, media, syqvem, d...","[www, institutocgr, coo, web, media, syqvem, d...",www institutocgr coo web media syqvem dk ij r ...
18234,YìêkoãÕ»Î§DéÎl½ñ¡ââqtò¸/à; Í,good,"[Y, ko, D, l, qt]","[y, ko, d, l, qt]",y ko d l qt
18236,ruta89fm.com/images/AS@Vies/1i75cf7b16vc<Fd16...,good,"[ruta, fm, com, images, AS, Vies, i, cf, b, vc...","[ruta, fm, com, imag, as, vie, i, cf, b, vc, f...",ruta fm com imag as vie i cf b vc f d b g sd v...


In [15]:
cv = CountVectorizer()
feature = cv.fit_transform(data.text_sent) 
feature[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
trainX, testX, trainY, testY = train_test_split(feature,data.Label)

In [17]:
#Logistic Regression model
lr = LogisticRegression()
lr.fit(trainX,trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
print('Training Accuracy :',lr.score(trainX,trainY))
print('Testing Accuracy :',lr.score(testX,testY))

Training Accuracy : 0.9768621559237783
Testing Accuracy : 0.9630616658293104


In [19]:
y_pred = lr.predict(testX)
print("Confusion Matrix:\n", confusion_matrix(testY, y_pred))

Confusion Matrix:
 [[35445  3825]
 [ 1248 96819]]


In [20]:
#Classificaton report
print(classification_report(lr.predict(testX), testY,
                            target_names =['Bad','Good']))

              precision    recall  f1-score   support

         Bad       0.90      0.97      0.93     36693
        Good       0.99      0.96      0.97    100644

    accuracy                           0.96    137337
   macro avg       0.94      0.96      0.95    137337
weighted avg       0.96      0.96      0.96    137337



In [21]:
pipeline_ls = make_pipeline(CountVectorizer(tokenizer = RegexpTokenizer(r'[A-Za-z]+').tokenize,stop_words='english'), LogisticRegression())
trainX, testX, trainY, testY = train_test_split(data.URL, data.Label)
pipeline_ls.fit(trainX,trainY)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
pickle.dump(pipeline_ls,open('phishing.pkl','wb'))
predict_bad = ['yeniik.com.tr/wp-admin/js/login.alibaba.com/login.jsp.php','fazan-pacir.rs/temp/libraries/ipad','tubemoviez.exe','svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt']
predict_good = ['youtube.com/','youtube.com/watch?v=qI0TQJI3vdU','retailhellunderground.com/','restorevisioncenters.com/html/technology.html']
loaded_model = pickle.load(open('phishing.pkl', 'rb'))
result = loaded_model.predict(predict_bad)
result2 = loaded_model.predict(predict_good)
print(result)
print(result2)

['bad' 'bad' 'bad' 'bad']
['good' 'good' 'good' 'good']
