In [1]:
import os
import glob
import pandas as pd

## Read Data

In [2]:
os.chdir('./data/')
extension = 'parquet'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

In [3]:
df = pd.concat([pd.read_parquet(f) for f in all_filenames ])
df.shape

(67595, 3)

In [4]:
df.iloc[20]

url       https://www.cdiscountpro.com/bricolage/sanitai...
target                         [951, 1731, 1721, 1730, 830]
day                                                       4
Name: 20, dtype: object

In [5]:
df.columns

Index(['url', 'target', 'day'], dtype='object')

In [6]:
df.isna().any()

url       False
target    False
day       False
dtype: bool

In [7]:
df = df.explode('target')
df

Unnamed: 0,url,target,day
0,https://www.societe.com/societe/madame-karine-...,329,4
0,https://www.societe.com/societe/madame-karine-...,1234,4
0,https://www.societe.com/societe/madame-karine-...,5183,4
0,https://www.societe.com/societe/madame-karine-...,96,4
0,https://www.societe.com/societe/madame-karine-...,377,4
...,...,...,...
13514,https://www.justwatch.com/fr/film/les-tuche,1310,15
13514,https://www.justwatch.com/fr/film/les-tuche,1095,15
13514,https://www.justwatch.com/fr/film/les-tuche,211,15
13514,https://www.justwatch.com/fr/film/les-tuche,1094,15


## Features extraction

In [8]:
import numpy as np
import whois
from pyquery import PyQuery
from requests import get
import math
import ipaddress as ip

In [9]:
class UrlFeaturesExtractor(object):
    
    def __init__(self, url):
        
        self.domain = url.split('//')[-1].split('/')[0]
        self.url = url

        #try:
        #    self.whois = whois.query(self.domain).__dict__
        #except:
        #    self.whois = None

        #try:
        #    self.response = get(self.url)
        #    self.pq = PyQuery(self.response.text)
        #except:
        #    self.response = None
        #    self.pq = None
            
            
        
        string = self.url.strip()
        prob = [float(string.count(c)) / len(string) for c in dict.fromkeys(list(string))]
        self.entropy = sum([(p * math.log(p) / math.log(2.0)) for p in prob])
        
        
        #try:
        #    if ip.ip_address(self.url):
        #        self.ip = 1
        #except:
        #    self.ip = 0
            
            
        
        digits = [i for i in self.url if i.isdigit()]
        self.numDigits = len(digits)
        
        self.urlLength =  len(self.url)
        
        
        self.numParameters = len(self.url.split('&')) - 1
        
        
        self.numFragments = len(self.url.split('#')) - 1
        
        self.numSubDomains = len(self.url.split('http')[-1].split('//')[-1].split('/')) -1
        
        self.domainExtension = self.url.split('.')[-1].split('/')[0]
        
        self.hasHttp = 'http:' in self.url
         
        self.hasHttps = 'https:' in self.url
        
        self.countDots = self.url.count('.')
        
        
        
        #Scrapping
        '''
        if self.pq is not None:
            self.bodyLength =  len(self.pq('html').text()) 
            titles = ['h{}'.format(i) for i in range(7)]
            titles = [self.pq(i).items() for i in titles]
            self.numTitles =  len([item for s in titles for item in s])
            self.numImages =  len([i for i in self.pq('img').items()])
            self.scriptLength = len(self.pq('script').text())
            self.numLinks =  len([i for i in self.pq('a').items()])
        else:
            self.numTitles = 0
            self.bodyLength = 0
            self.numLinks = 0
            self.numImages = 0
            self.scriptLength = 0
            
        
        '''

In [10]:
def urlToCsventry(url):
    url_extractor = UrlFeaturesExtractor(url)
    return url_extractor.url, url_extractor.domain, url_extractor.entropy, url_extractor.numDigits, url_extractor.numParameters, url_extractor.urlLength, url_extractor.numFragments, url_extractor.numSubDomains, url_extractor.domainExtension, url_extractor.hasHttp, url_extractor.hasHttps, url_extractor.countDots 


In [11]:
#target_occurence = dict(df['target'].value_counts())

In [12]:
column_names = ["url", "domain", "entropy", "numDigits", "numParameters", "urlLength","numFragments", "subDomains", "domainExtension", "hasHttp", "hasHttps", "countDots "]

features = pd.DataFrame(columns = column_names)

In [13]:
df.reset_index(drop = True, inplace=True)

In [None]:
for index, row in df.iterrows():
    features.loc[index] = urlToCsventry(row['url'])


In [13]:
features

Unnamed: 0,url,target,day
13500,https://www.express.co.uk/travel/articles/1087...,"[433, 702, 5140, 683, 1288]",16
13501,https://chefsimon.com/recettes/tag/g%C3%A9latine,"[1513, 907, 906, 1573, 1526]",16
13502,https://www.cdiscount.com/chaussures/nouveau-m...,"[997, 1614, 697, 1488, 1490]",1
13503,https://www.cdiscount.com/search/10/console+sw...,"[925, 1046, 1146, 1043, 1311]",16
13504,http://www.allocine.fr/video/player_gen_cmedia...,"[1107, 1096, 615, 211, 1097]",1
...,...,...,...
13595,https://www.lemonde.fr/m-perso/article/2014/12...,"[1260, 1133, 1135, 1132]",1
13596,https://www.lemonde.fr/archives-du-monde/19-01...,[16],1
13597,https://jardinage.ooreka.fr/astuce/voir/634575...,"[981, 269, 751, 908, 1752]",16
13598,https://www.mynet.com/liseli-didem-den-4-gundu...,"[301, 16]",16


In [19]:
df.apply(lambda row: print(row))

0        https://www.societe.com/societe/madame-karine-...
1        https://www.ebay-kleinanzeigen.de/s-nu%C3%9Fba...
2        https://psychologie.aufeminin.com/forum/mon-co...
3         https://fr.shopping.rakuten.com/s/powerone+pr+70
4        https://www.cdiscount.com/search/10/coque+Sams...
                               ...                        
67590    http://www.allocine.fr/film/fichefilm-44892/cr...
67591    https://www.jpost.com/J-Spot/The-top-10-photog...
67592       http://fr.viadeo.com/fr/profile/laetitia.ricol
67593    https://qualite.ooreka.fr/astuce/voir/635469/a...
67594          https://www.justwatch.com/fr/film/les-tuche
Name: url, Length: 67595, dtype: object
0            [329, 1234, 5183, 96, 377]
1            [158, 650, 1175, 831, 953]
2            [325, 253, 1775, 640, 543]
3              [1143, 210, 531, 18, 41]
4        [1171, 1071, 1192, 1533, 1277]
                      ...              
67590          [1107, 1101, 1094, 1095]
67591                          

url       None
target    None
day       None
dtype: object

In [15]:
features.shape

(67595, 12)

In [20]:
features['target'] = df['target']

In [23]:
features

Unnamed: 0,url,domain,entropy,numDigits,numParameters,urlLength,numFragments,subDomains,domainExtension,hasHttp,hasHttps,countDots,target
0,https://www.societe.com/societe/madame-karine-...,www.societe.com,-4.482191,9,0,68,0,2,html,False,True,3,"[329, 1234, 5183, 96, 377]"
1,https://www.ebay-kleinanzeigen.de/s-nu%C3%9Fba...,www.ebay-kleinanzeigen.de,-4.581834,3,0,56,0,2,de,False,True,2,"[158, 650, 1175, 831, 953]"
2,https://psychologie.aufeminin.com/forum/mon-co...,psychologie.aufeminin.com,-4.516403,7,0,96,0,2,com,False,True,2,"[325, 253, 1775, 640, 543]"
3,https://fr.shopping.rakuten.com/s/powerone+pr+70,fr.shopping.rakuten.com,-4.264461,2,0,48,0,2,com,False,True,3,"[1143, 210, 531, 18, 41]"
4,https://www.cdiscount.com/search/10/coque+Sams...,www.cdiscount.com,-4.660776,4,0,65,0,3,html,False,True,3,"[1171, 1071, 1192, 1533, 1277]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67590,http://www.allocine.fr/film/fichefilm-44892/cr...,www.allocine.fr,-4.383853,5,0,65,0,4,fr,True,False,2,"[1107, 1101, 1094, 1095]"
67591,https://www.jpost.com/J-Spot/The-top-10-photog...,www.jpost.com,-4.328641,16,0,82,0,2,com,False,True,2,[408]
67592,http://fr.viadeo.com/fr/profile/laetitia.ricol,fr.viadeo.com,-3.893264,0,0,46,0,3,ricol,True,False,3,"[1481, 372, 799, 5182, 327]"
67593,https://qualite.ooreka.fr/astuce/voir/635469/a...,qualite.ooreka.fr,-4.438132,6,0,66,0,4,fr,False,True,2,"[720, 338, 1162, 331, 157]"


In [None]:
import pickle

with open('lexicalFeatures', 'wb') as f:
    # Step 3
    pickle.dump(features, f)