In [1]:
# Basic Libraries to be installed before moving ahead
!pip install pysafebrowsing
!pip install tld
!pip install whois
!pip install geoip2

Collecting pysafebrowsing
  Downloading pysafebrowsing-0.1.1-py3-none-any.whl (5.7 kB)
Installing collected packages: pysafebrowsing
Successfully installed pysafebrowsing-0.1.1
Collecting tld
  Downloading tld-0.12.1-py37-none-any.whl (329 kB)
[K     |████████████████████████████████| 329 kB 4.6 MB/s 
[?25hInstalling collected packages: tld
Successfully installed tld-0.12.1
Collecting whois
  Downloading whois-0.9.7.tar.gz (8.2 kB)
Building wheels for collected packages: whois
  Building wheel for whois (setup.py) ... [?25ldone
[?25h  Created wheel for whois: filename=whois-0.9.7-py3-none-any.whl size=8886 sha256=1beeae2d171ae43103d1f32fd5b83b40457d559d153cd02ee7a626b8bcdbe761
  Stored in directory: /root/.cache/pip/wheels/e9/7f/be/01c1c2954a2dbcc552cd957b9d6ce6bb5337009a7f8c0d4bc7
Successfully built whois
Installing collected packages: whois
Successfully installed whois-0.9.7
Collecting geoip2
  Downloading geoip2-3.0.0-py2.py3-none-any.whl (23 kB)
Collecting maxminddb>=1.5.2
  Do

In [2]:
# Basic Initialisation
import time
import pandas as pd
import numpy as np
pd.set_option('mode.chained_assignment', None) #Switch off warning

In [3]:
#Verifying pathname of dataset before loading
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename));
        print(os.listdir("../input"))

/kaggle/input/preprocessing-sample-data/processed_file_3.csv
['preprocessing-sample-data', 'geolite2countrymmdb']
/kaggle/input/geolite2countrymmdb/GeoLite2-Country.mmdb
['preprocessing-sample-data', 'geolite2countrymmdb']


In [4]:
def loadDataset():
    df = pd.read_csv("/kaggle/input/preprocessing-sample-data/processed_file_3.csv")
    return df

df = loadDataset()
df= df[['url']]
df= df.iloc[0:15,]
df

Unnamed: 0,url
0,diaryofagameaddict.com
1,espdesign.com.au
2,iamagameaddict.com
3,kalantzis.net
4,slightlyoffcenter.net
5,toddscarwash.com
6,tubemoviez.com
7,rupor.info
8,sn-gzzx.com
9,outporn.com


In [5]:
#Adding columns to the dataframe df
df['ip_add']=""
df['geo_loc']=""
df['tld']=""
df['who_is']=""
df['https']=""
df['label']=""
df = df[['url','ip_add','geo_loc','tld','who_is','https','label']]
df

Unnamed: 0,url,ip_add,geo_loc,tld,who_is,https,label
0,diaryofagameaddict.com,,,,,,
1,espdesign.com.au,,,,,,
2,iamagameaddict.com,,,,,,
3,kalantzis.net,,,,,,
4,slightlyoffcenter.net,,,,,,
5,toddscarwash.com,,,,,,
6,tubemoviez.com,,,,,,
7,rupor.info,,,,,,
8,sn-gzzx.com,,,,,,
9,outporn.com,,,,,,


In [6]:
# Filling the ip_add & geo_loc column of dataframe 
import os
import geoip2.database
import socket
import time

reader = geoip2.database.Reader('/kaggle/input/geolite2countrymmdb/GeoLite2-Country.mmdb')

for x in df.index:
    url = df['url'][x]
    
    try:
        ip_add = socket.gethostbyname(url)
        response = reader.country(ip_add)
        df['geo_loc'][x] = response.country.name
        df['ip_add'][x] = ip_add
        #print(x, "Finished")
        
    except Exception as msg:
        df['geo_loc'][x] = ""
        df['ip_add'][x] = ""
        #print(x," Finished with Error Msg:",msg)

reader.close()

df

Unnamed: 0,url,ip_add,geo_loc,tld,who_is,https,label
0,diaryofagameaddict.com,208.76.86.152,United States,,,,
1,espdesign.com.au,,,,,,
2,iamagameaddict.com,208.76.86.152,United States,,,,
3,kalantzis.net,208.88.6.80,Canada,,,,
4,slightlyoffcenter.net,208.76.80.77,United States,,,,
5,toddscarwash.com,23.227.173.218,United States,,,,
6,tubemoviez.com,104.27.131.151,United States,,,,
7,rupor.info,5.9.82.114,Germany,,,,
8,sn-gzzx.com,,,,,,
9,outporn.com,67.227.226.240,United States,,,,


In [7]:
#Filling up TLD column
from tld import get_tld

for x in df.index:
        
    try:
        u = df.url[x]
        s = get_tld(str(u), fix_protocol=True)
        df['tld'][x] = s
    except:
        pass
df

Unnamed: 0,url,ip_add,geo_loc,tld,who_is,https,label
0,diaryofagameaddict.com,208.76.86.152,United States,com,,,
1,espdesign.com.au,,,com.au,,,
2,iamagameaddict.com,208.76.86.152,United States,com,,,
3,kalantzis.net,208.88.6.80,Canada,net,,,
4,slightlyoffcenter.net,208.76.80.77,United States,net,,,
5,toddscarwash.com,23.227.173.218,United States,com,,,
6,tubemoviez.com,104.27.131.151,United States,com,,,
7,rupor.info,5.9.82.114,Germany,info,,,
8,sn-gzzx.com,,,com,,,
9,outporn.com,67.227.226.240,United States,com,,,


In [8]:
#Whois processing
import whois
start_time = time.time()

for x in df.iloc[0:1000].index:
    
    try:    
        domain = whois.query(df['url'][x])
        #print(domain.registrar)
        if len(str(domain.registrar)) >1 :
            df['who_is'][x]= True
        else:
            df['who_is'][x]= False
    except Exception as msg:
        #print(x,", Error: ",msg)
        df['who_is'][x]= False
    print(x,df['who_is'][x])

print("***Total Time taken --- %s seconds ---***" % (time.time() - start_time))
df

0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
***Total Time taken --- 0.06841444969177246 seconds ---***


Unnamed: 0,url,ip_add,geo_loc,tld,who_is,https,label
0,diaryofagameaddict.com,208.76.86.152,United States,com,False,,
1,espdesign.com.au,,,com.au,False,,
2,iamagameaddict.com,208.76.86.152,United States,com,False,,
3,kalantzis.net,208.88.6.80,Canada,net,False,,
4,slightlyoffcenter.net,208.76.80.77,United States,net,False,,
5,toddscarwash.com,23.227.173.218,United States,com,False,,
6,tubemoviez.com,104.27.131.151,United States,com,False,,
7,rupor.info,5.9.82.114,Germany,info,False,,
8,sn-gzzx.com,,,com,False,,
9,outporn.com,67.227.226.240,United States,com,False,,


In [9]:
# Filling the column https_status
import http.client

start_time = time.time()

for x in df.index:
    https_status= False
    try:
        conn = http.client.HTTPSConnection(df['url'][x])
        conn.request("HEAD", "/")
        res = conn.getresponse()
        if res.status == 200 or res.status==301 or res.status==302:
            https_status= True   
        #print(x,res.status,res.reason,https_status)
    except Exception as msg:
        print(x,"Error: ",msg)
    finally:
        df['https'][x]= https_status
        conn.close

print("***Total Time taken --- %s seconds ---***" % (time.time() - start_time))

df

0 Error:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'diaryofagameaddict.com'. (_ssl.c:1076)
1 Error:  [Errno -2] Name or service not known
2 Error:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'iamagameaddict.com'. (_ssl.c:1076)
4 Error:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'slightlyoffcenter.net'. (_ssl.c:1076)
8 Error:  [Errno -3] Temporary failure in name resolution
9 Error:  [Errno 111] Connection refused
11 Error:  [Errno 104] Connection reset by peer
12 Error:  [Errno 110] Connection timed out
13 Error:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1076)
***Total Time taken --- 47.288312673568726 seconds ---***


Unnamed: 0,url,ip_add,geo_loc,tld,who_is,https,label
0,diaryofagameaddict.com,208.76.86.152,United States,com,False,False,
1,espdesign.com.au,,,com.au,False,False,
2,iamagameaddict.com,208.76.86.152,United States,com,False,False,
3,kalantzis.net,208.88.6.80,Canada,net,False,True,
4,slightlyoffcenter.net,208.76.80.77,United States,net,False,False,
5,toddscarwash.com,23.227.173.218,United States,com,False,True,
6,tubemoviez.com,104.27.131.151,United States,com,False,True,
7,rupor.info,5.9.82.114,Germany,info,False,True,
8,sn-gzzx.com,,,com,False,False,
9,outporn.com,67.227.226.240,United States,com,False,False,


In [10]:
# Filling the label of training set from Google Safe Browising API
from pysafebrowsing import SafeBrowsing
KEY= "AIzaSyABO6DPGmHpCs8U5ii1Efkp1dUPJHQfGpo"

start_time = time.time()
s = SafeBrowsing(KEY)

for x in df.index:
    
    try:
        url = df['url'][x]
        r = s.lookup_urls([url])
        label=r[url]['malicious']    
        df['label']=label
        #print(x, label)
    except Exception as msg:
        df['label']=""
        #print(x,"Error: ",msg)

print("***Total Time taken --- %s seconds ---***" % (time.time() - start_time))

df

***Total Time taken --- 0.4455718994140625 seconds ---***


Unnamed: 0,url,ip_add,geo_loc,tld,who_is,https,label
0,diaryofagameaddict.com,208.76.86.152,United States,com,False,False,False
1,espdesign.com.au,,,com.au,False,False,False
2,iamagameaddict.com,208.76.86.152,United States,com,False,False,False
3,kalantzis.net,208.88.6.80,Canada,net,False,True,False
4,slightlyoffcenter.net,208.76.80.77,United States,net,False,False,False
5,toddscarwash.com,23.227.173.218,United States,com,False,True,False
6,tubemoviez.com,104.27.131.151,United States,com,False,True,False
7,rupor.info,5.9.82.114,Germany,info,False,True,False
8,sn-gzzx.com,,,com,False,False,False
9,outporn.com,67.227.226.240,United States,com,False,False,False


In [11]:
# Saving the file
#df.to_csv("Datasets/processed_1_14Jan20.csv")