In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data.csv")

In [3]:
data_phish = pd.read_csv('verified_online - verified_online.csv')

In [4]:
clean_phish = data_phish.loc[:,['url','verified']]

In [5]:
clean_phish.columns = ['url', 'label']

In [6]:
clean_phish['label'] = clean_phish['label'].replace('yes','1')
clean_phish['label'] = clean_phish['label'].apply(np.int64)

In [7]:
final_data = pd.concat([data, clean_phish], axis=0)

In [8]:
final_data.head()

Unnamed: 0,url,label
0,http://br-ofertasimperdiveis.epizy.com/produto...,1
1,https://semana-da-oferta.com/produtos.php?id=5...,1
2,https://scrid-apps-creacust-sslhide90766752024...,1
3,http://my-softbank-security.com/wap_login.htm,1
4,http://www.my-softbank-security.com/wap_login.htm,1


In [9]:
final_data['label'].value_counts()

0    1000000
1      73049
Name: label, dtype: int64

In [10]:
# final_data = final_data.sample(frac=1)

In [11]:
final_data.head()

Unnamed: 0,url,label
0,http://br-ofertasimperdiveis.epizy.com/produto...,1
1,https://semana-da-oferta.com/produtos.php?id=5...,1
2,https://scrid-apps-creacust-sslhide90766752024...,1
3,http://my-softbank-security.com/wap_login.htm,1
4,http://www.my-softbank-security.com/wap_login.htm,1


In [12]:
final_data['url'].str.split("://").head()

0    [http, br-ofertasimperdiveis.epizy.com/produto...
1    [https, semana-da-oferta.com/produtos.php?id=5...
2    [https, scrid-apps-creacust-sslhide90766752024...
3       [http, my-softbank-security.com/wap_login.htm]
4    [http, www.my-softbank-security.com/wap_login....
Name: url, dtype: object

In [13]:
seperation_of_protocol = final_data['url'].str.split("://", expand=True)
seperation_of_protocol.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,http,br-ofertasimperdiveis.epizy.com/produto.php?li...,,,,,,,,,,,,,,,,
1,https,semana-da-oferta.com/produtos.php?id=5abad0c01...,,,,,,,,,,,,,,,,
2,https,scrid-apps-creacust-sslhide90766752024.cread-s...,,,,,,,,,,,,,,,,
3,http,my-softbank-security.com/wap_login.htm,,,,,,,,,,,,,,,,
4,http,www.my-softbank-security.com/wap_login.htm,,,,,,,,,,,,,,,,


In [14]:
seperation_domain_name = seperation_of_protocol[1].str.split("/",1,expand=True)

In [15]:
seperation_domain_name.head()

Unnamed: 0,0,1
0,br-ofertasimperdiveis.epizy.com,produto.php?linkcompleto=iphone-6-plus-apple-6...
1,semana-da-oferta.com,produtos.php?id=5abad0c01d149
2,scrid-apps-creacust-sslhide90766752024.cread-s...,hider_reo/
3,my-softbank-security.com,wap_login.htm
4,www.my-softbank-security.com,wap_login.htm


In [16]:
splitted_data = pd.concat([seperation_of_protocol[0], seperation_domain_name], axis=1)

In [17]:
splitted_data.columns = ['protocol', 'domain', 'address']

In [18]:
splitted_data.head()

Unnamed: 0,protocol,domain,address
0,http,br-ofertasimperdiveis.epizy.com,produto.php?linkcompleto=iphone-6-plus-apple-6...
1,https,semana-da-oferta.com,produtos.php?id=5abad0c01d149
2,https,scrid-apps-creacust-sslhide90766752024.cread-s...,hider_reo/
3,http,my-softbank-security.com,wap_login.htm
4,http,www.my-softbank-security.com,wap_login.htm


In [19]:
splitted_data['is_phished']=pd.Series(final_data['label'], index=splitted_data.index)

In [20]:
splitted_data.head()

Unnamed: 0,protocol,domain,address,is_phished
0,http,br-ofertasimperdiveis.epizy.com,produto.php?linkcompleto=iphone-6-plus-apple-6...,1
1,https,semana-da-oferta.com,produtos.php?id=5abad0c01d149,1
2,https,scrid-apps-creacust-sslhide90766752024.cread-s...,hider_reo/,1
3,http,my-softbank-security.com,wap_login.htm,1
4,http,www.my-softbank-security.com,wap_login.htm,1


### Feature Extraction

Feature 1: (long URL)

> - Phishing urls uses long urls to hide suspicious part:
> - If the length of the URL is greater than or equal 54 characters then the URL classified as phishing
    - 0 -> legit
    - 1 -> phishing
    - 2 -> suspicious
    
Feature 2: ( @ symbol )

> - Using “@” symbol in the URL leads the browser to ignore everything preceding the “@” symbol and the real address often follows the “@” symbol.
    - if url contain @
        - phishing (1)
    - else:
        - legit (0)

Feature 3: ( Redirecting using "//" symbol )

> - The existence of “//” within the URL path means that the user will be redirected to another website. An example of such URL’s is: “http://www.legitimate.com//http://www.phishing.com”. We examine the location where the “//” appears. We find that if the URL starts with “HTTP”, that means the “//” should appear in the sixth position. However, if the URL employs “HTTPS” then the “//” should appear in seventh position.
    - if position of the Last Occurrence of "//" in the URL > 7
        - phishing (1)
    - else:
        - legit (0)
        
Fearure 4: (Adding Prefix or Suffix Separated by (-) to the Domain)

> - The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel that they are dealing with a legitimate webpage.
    Ex - http://www.confirme-paypal.com
    - 
    - if url contain (-):
        - phishing (1)
    - else:
        - legit (0)
        
Feature 5: ( Sub-Domain and Multi Sub-Domains )
> - The legitimate URL link has two dots in the URL since we can ignore typing “www.”. If the number of dots is equal to three then the URL is classified as “Suspicious” since it has one sub-domain. However, if the dots are greater than three it is classified as “Phishy” since it will have multiple sub-domains
    - 0 -> legit
    - 1 -> phishing
    - 2 -> suspicious


In [21]:
# Feature 1
def long_url(url):
    url = str(url)
    
    if len(url) < 54:
        return 0
    elif len(url) >= 54 and len(url) <=75:
        return 2
    return 1

# Feature 2
def have_at_symbol(url):
    if '@' in str(url):
        return 1
    return 0

# Feature 3
def redirection(url):
    if "//" in str(url):
        return 1
    return 0

# Feature 4
def prefix_suffix_seperation(url):
    if '-' in str(url):
        return 1
    return 0

# Feature 5
def sub_domain(url):
    url = str(url)
    if url.count('.') < 3:
        return 0
    elif url.count('.') == 3:
        return 2
    return 1

In [22]:
splitted_data['long_url'] = final_data['url'].apply(long_url)
splitted_data['@_symbol'] = final_data['url'].apply(have_at_symbol)
splitted_data['//_symbol'] = seperation_of_protocol[1].apply(redirection)
splitted_data['prefix_suffix_seperation'] = splitted_data['domain'].apply(prefix_suffix_seperation)
splitted_data['sub_domain'] = splitted_data['domain'].apply(sub_domain)

splitted_data.head()

Unnamed: 0,protocol,domain,address,is_phished,long_url,@_symbol,//_symbol,prefix_suffix_seperation,sub_domain
0,http,br-ofertasimperdiveis.epizy.com,produto.php?linkcompleto=iphone-6-plus-apple-6...,1,1,0,0,1,0
1,https,semana-da-oferta.com,produtos.php?id=5abad0c01d149,1,2,0,0,1,0
2,https,scrid-apps-creacust-sslhide90766752024.cread-s...,hider_reo/,1,2,0,0,1,0
3,http,my-softbank-security.com,wap_login.htm,1,0,0,0,1,0
4,http,www.my-softbank-security.com,wap_login.htm,1,0,0,0,1,0


In [23]:
splitted_data['is_phished'].value_counts()

0    1000000
1      73049
Name: is_phished, dtype: int64

In [24]:
phis_data_0 = splitted_data[splitted_data['is_phished'] == 1].iloc[0:73049,:]
phis_data_1 = splitted_data[splitted_data['is_phished'] == 0].iloc[0:73049,:]
phis_data = pd.concat([phis_data_0, phis_data_1], axis=0)

In [25]:
phis_data.head()

Unnamed: 0,protocol,domain,address,is_phished,long_url,@_symbol,//_symbol,prefix_suffix_seperation,sub_domain
0,http,br-ofertasimperdiveis.epizy.com,produto.php?linkcompleto=iphone-6-plus-apple-6...,1,1,0,0,1,0
1,https,semana-da-oferta.com,produtos.php?id=5abad0c01d149,1,2,0,0,1,0
2,https,scrid-apps-creacust-sslhide90766752024.cread-s...,hider_reo/,1,2,0,0,1,0
3,http,my-softbank-security.com,wap_login.htm,1,0,0,0,1,0
4,http,www.my-softbank-security.com,wap_login.htm,1,0,0,0,1,0


In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [33]:
cv = CountVectorizer(ngram_range=(1,1))

In [40]:
phis_data['domain'] = phis_data['domain'].replace('None','empty')

In [41]:
x = phis_data['domain']

In [42]:
x_train = cv.fit_transform(x)

AttributeError: 'NoneType' object has no attribute 'lower'

In [26]:
phis_data['is_phished'].value_counts()

1    73049
0    73049
Name: is_phished, dtype: int64

In [27]:
phis_data = phis_data.sample(frac=1)

In [28]:
phis_data.head()

Unnamed: 0,protocol,domain,address,is_phished,long_url,@_symbol,//_symbol,prefix_suffix_seperation,sub_domain
47480,http,dropbox.com,s/pknitzr27a2k565,1,0,0,0,0,0
31932,http,skaiyacouture.com,wp-admin/main/mail.php,1,0,0,0,0,0
10931,http,www.funerariasinop.com.br,authenticate/T-mobile/Telekom_cloudstorage/T-o...,1,1,0,0,0,2
2206,http,www.vineyard-garden.com,images/12364fef5225b1d42a3e76a2e0443745,1,2,0,0,1,0
104315,un-documents.net,,,0,0,0,0,0,0


In [29]:
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve, fbeta_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [30]:
X = phis_data.columns[4:9]
y = pd.factorize(phis_data['is_phished'])[0]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(phis_data[X], y, test_size=0.3, random_state=42)


In [35]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
print("Logisic regression done!")


knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
print("KNN Done!")

svm = LinearSVC()
svm.fit(X_train,y_train)
print("Support Vector Machine!")

randomforest = RandomForestClassifier(n_estimators=100,n_jobs=2,random_state=42)
randomforest.fit(X_train,y_train)
print("Random Forest Done!")

Logisic regression done!
KNN Done!
Support Vector Machine!
Random Forest Done!


In [36]:
lr_score = f1_score(lr.predict(X_test), y_test)
knn_score = f1_score(knn.predict(X_test), y_test)
svm_score = f1_score(svm.predict(X_test), y_test)
random_score = f1_score(randomforest.predict(X_test), y_test)


print("Lr Score is {}".format(lr_score))
print("knn Score is {}".format(knn_score))
print("svm Score is {}".format(svm_score))
print("randomforestt Score is {}".format(random_score))

Lr Score is 0.7230419315332128
knn Score is 0.7230419315332128
svm Score is 0.7230419315332128
randomforestt Score is 0.7230419315332128


NameError: name 'arr' is not defined