In [208]:
import math
import re
import os
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
from urllib.parse import urlparse, urljoin
import pandas as pd
from sklearn import metrics
# import tensorflow as tf
from tensorflow.python.data import Dataset
from sklearn.preprocessing import StandardScaler

# tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

In [209]:
links_df = pd.read_csv("../../crawler/extract_links/dataset_product_pages.csv", delimiter=",", index_col=0)

In [210]:
# add new column domain
links_df['domain'] = links_df.apply(lambda row: urlparse(row['url']).netloc, axis=1)

# change order
links_df = links_df[['domain','url', 'label']]

#  map true/false to 0,1
links_df["label"] = links_df["label"].astype(int)

In [211]:
from time import time
from random import randint
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
import pickle


In [212]:
# remove duplicates
links_df.drop_duplicates(['url'], keep='first')

Unnamed: 0,domain,url,label
0,www.alfa-leky.cz,https://www.alfa-leky.cz/2869577-humer-hygiena...,1
1,www.rustak.cz,https://www.rustak.cz/pevne-zdravi,0
2,www.5xl.cz,https://www.5xl.cz/boxerky,0
3,www.stample.cz,https://www.stample.cz/Charles-Chaplin-Ondrej-...,1
4,www.marines-shop.com,https://www.marines-shop.com/obal-na-karimatku...,1
...,...,...,...
702,promo-pneumatiky.cz,https://promo-pneumatiky.cz/riken-b1414,0
705,promo-pneumatiky.cz,https://promo-pneumatiky.cz/gripmax-b1626,0
706,shop.merret.cz,http://shop.merret.cz/34-om-602uqc,0
707,eulift.cz,https://eulift.cz/paletove-voziky/paletove-voz...,0


In [213]:
are_pp = len(links_df[links_df['label'] == 1])
are_not_pp = len(links_df[links_df['label'] == 0])
print(f'Are product pages: {are_pp}')
print(f'Are not product pages: {are_not_pp}')

Are product pages: 377
Are not product pages: 334


In [214]:
# drop duplicates in url column
links_df = links_df.drop_duplicates(subset=['url'])
len(links_df)

630

In [215]:
links_df.head()

Unnamed: 0,domain,url,label
0,www.alfa-leky.cz,https://www.alfa-leky.cz/2869577-humer-hygiena...,1
1,www.rustak.cz,https://www.rustak.cz/pevne-zdravi,0
2,www.5xl.cz,https://www.5xl.cz/boxerky,0
3,www.stample.cz,https://www.stample.cz/Charles-Chaplin-Ondrej-...,1
4,www.marines-shop.com,https://www.marines-shop.com/obal-na-karimatku...,1


In [268]:
# added by Petr Hanzl from ml_utils.py
PATH_LEN_CAP=100
MIN_PROD_ID_LEN = 3
MAX_PROD_ID_LEN = 32

def has_product_id(path):
    if not re.search(r'\d', path):
        return 0
    for num in re.findall(r'[0-9]+', path):
        if len(num) > MIN_PROD_ID_LEN and len(num) < MAX_PROD_ID_LEN:
            return 1
    return 0

def preprocess_features(df, load_scaler_from_file=False):
    processed_features = df[["url"]].copy()
    processed_features["path"] = processed_features["url"].map(lambda x: urlparse(x).path + urlparse(x).params + urlparse(x).query + urlparse(x).fragment)
    processed_features["path_len"] = processed_features["path"].map(lambda x: min(len(x), PATH_LEN_CAP))
    processed_features["num_hyphen"] = processed_features["path"].map(lambda x: x.count("-") + x.rstrip("/").count("/"))
    processed_features["num_slash"] = processed_features["path"].map(lambda x: x.rstrip("/").count("/"))
    processed_features["contains_product"] = processed_features["path"].map(lambda x: 1 if "product" in x else 0)
    processed_features["contains_produkt"] = processed_features["path"].map(lambda x: 1 if "produkt" in x else 0)
    processed_features["contains_detail"] = processed_features["path"].map(lambda x: 1 if "detail" in x else 0)
    processed_features["contains_category"] = processed_features["path"].map(lambda x: 1 if "category" in x else 0)
    processed_features["contains_kategorie"] = processed_features["path"].map(lambda x: 1 if "kategorie" in x else 0)
    processed_features["longest_num"] = processed_features["path"].map(lambda x: len(max(re.findall(r'[0-9]+', x), key=len)) if re.search(r'\d', x) else 0)
    processed_features["contains_pid"] = processed_features["path"].map(lambda x: has_product_id(x))
    cols_to_drop = ['url', 'path']
    processed_features.drop(cols_to_drop, axis=1, inplace=True)
    scaled_features = processed_features.copy()
    col_names = [col for col in processed_features if col not in cols_to_drop and not "contains" in col]
    features = scaled_features[col_names]
    scaler_filename = 'StandardScaler.est'
    if load_scaler_from_file and os.path.isfile(scaler_filename):
        scaler = pickle.load(open(scaler_filename, 'rb'))
    else:
        scaler = StandardScaler()
        scaler = StandardScaler().fit(features.values)
        pickle.dump(scaler, open(scaler_filename, 'wb'))

    features = scaler.transform(features.values)
    scaled_features[col_names] = features
    return scaled_features

def preprocess_targets(df):
  """Prepares target features (i.e., labels) from California housing data set.

  Args:
    california_housing_dataframe: A Pandas DataFrame expected to contain data
      from the California housing data set.
  Returns:
    A DataFrame that contains the target feature.
  """
  output_targets = pd.DataFrame()
  # Create a boolean categorical feature representing whether the
  # median_house_value is above a set threshold.
  output_targets["label"] = df["label"].astype(int)
  #output_targets["median_house_value_is_high"] = (
  #  california_housing_dataframe["median_house_value"] > 265000).astype(float)
  return output_targets

def get_groups(df):
  return df["domain"].values

In [269]:
# Choose the first 90% of the examples for training.
n_links = len(links_df)
train_len = int(math.floor(0.9*n_links))
validation_len = int(n_links - train_len)
print("train_len", train_len, "validation_len", validation_len)

training_input = links_df.head(train_len)
validation_input = links_df.tail(validation_len)

training_input = training_input.reindex(
    np.random.permutation(training_input.index))

validation_input = validation_input.reindex(
    np.random.permutation(validation_input.index))

training_examples = preprocess_features(training_input)
training_targets = preprocess_targets(training_input)
training_groups = get_groups(training_input)

# Choose the last 30% of the examples for validation.
validation_examples = preprocess_features(validation_input)
validation_targets = preprocess_targets(validation_input)

print("Training examples summary:")
display.display(training_examples.describe())
print("Validation examples summary:")
display.display(validation_examples.describe())

print("Training targets summary:")
display.display(training_targets.describe())
print("Validation targets summary:")
display.display(validation_targets.describe())

train_len 567 validation_len 63
Training examples summary:


Unnamed: 0,path_len,num_hyphen,num_slash,contains_product,contains_produkt,contains_detail,contains_category,contains_kategorie,longest_num
count,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0,567.0
mean,-0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
std,1.0,1.0,1.0,0.2,0.2,0.2,0.1,0.2,1.0
min,-1.6,-1.4,-1.6,0.0,0.0,0.0,0.0,0.0,-0.8
25%,-0.8,-0.9,-0.6,0.0,0.0,0.0,0.0,0.0,-0.8
50%,-0.2,-0.4,-0.6,0.0,0.0,0.0,0.0,0.0,-0.1
75%,0.7,0.7,0.5,0.0,0.0,0.0,0.0,0.0,0.7
max,2.7,3.9,6.9,1.0,1.0,1.0,1.0,1.0,4.2


Validation examples summary:


Unnamed: 0,path_len,num_hyphen,num_slash,contains_product,contains_produkt,contains_detail,contains_category,contains_kategorie,longest_num
count,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0,63.0
mean,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.1,-0.0
std,1.0,1.0,1.0,0.1,0.3,0.0,0.0,0.3,1.0
min,-1.3,-1.3,-1.9,0.0,0.0,0.0,0.0,0.0,-0.9
25%,-0.6,-0.5,-0.5,0.0,0.0,0.0,0.0,0.0,-0.9
50%,-0.3,-0.5,-0.5,0.0,0.0,0.0,0.0,0.0,-0.3
75%,0.2,0.4,0.9,0.0,0.0,0.0,0.0,0.0,0.9
max,4.1,4.0,3.7,1.0,1.0,0.0,0.0,1.0,3.1


Training targets summary:


Unnamed: 0,label
count,567.0
mean,0.6
std,0.5
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Validation targets summary:


Unnamed: 0,label
count,63.0
mean,0.1
std,0.4
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [270]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier


In [271]:
training_input.head()

Unnamed: 0,domain,url,label
436,www.happyrobin.cz,http://www.happyrobin.cz/survival-food-pack-2/,0
504,www.spectoys.cz,https://www.spectoys.cz/#customerLogin,0
74,www.limosky.cz,https://www.limosky.cz/napoje-bad-brambacher/6...,1
333,www.beemy.cz,https://www.beemy.cz/vyprodej/cepice-light-blu...,1
142,editsimon.cz,https://editsimon.cz/vitaminy-a-doplnky/171-ka...,1


In [272]:
gkf = GroupKFold(n_splits=5)

clf = LogisticRegression(solver="lbfgs", C=0.05, penalty="l2").fit(training_examples, training_targets.values.ravel())
sgd_clf = SGDClassifier(loss="log", max_iter=1000, eta0=0.0002, learning_rate="adaptive").fit(training_examples, training_targets.values.ravel())
sgd_clf = SGDClassifier(loss="log", max_iter=10000, alpha=0.01,
                        learning_rate="optimal").\
                        fit(training_examples, training_targets.values.ravel())
logit_scores = cross_val_score(clf, training_examples,
                               training_targets.values.ravel(),
                               cv=gkf, groups=training_groups)
sgd_scores = cross_val_score(sgd_clf, training_examples,
                             training_targets.values.ravel(),
                             cv=gkf, groups=training_groups)

print("Logit Accuracy: %0.2f (+/- %0.2f)" % (logit_scores.mean(), logit_scores.std() * 2))
print("SGD Accuracy: %0.2f (+/- %0.2f)" % (sgd_scores.mean(), sgd_scores.std() * 2))

print("Logit", "%0.2f" % clf.score(validation_examples, validation_targets))
print("SGD", "%0.2f" % sgd_clf.score(validation_examples, validation_targets))


Logit Accuracy: 0.78 (+/- 0.07)
SGD Accuracy: 0.78 (+/- 0.07)
Logit 0.65
SGD 0.62


In [273]:
model_filename = 'SGDClassifier.est'
pickle.dump(sgd_clf, open(model_filename, 'wb'))


In [274]:
sgd_est = pickle.load(open(model_filename, 'rb'))
sgd_est.score(validation_examples, validation_targets)

0.6190476190476191

In [275]:
urls= ["https://www.alza.cz/darkovy-poukaz-alza-cz-na-nakup-zbozi-v-hodnote-500-kc-d5258076.htm",
"https://www.czc.cz/the-last-of-us-remastered-ps4/151440/produkt",
"https://www.astir.cz/eshop/pokladni-kotoucky/papirove-kotoucky-s-kopii-11/kotoucek-ncr-114-70-1211",
"https://www.kupsiboty.cz/produkt/cerne-elasticke-kozacky-se-zlatym-podpatkem-claudia-ghizzani-36",
"https://www.dogsdream.cz/koberecky/trixie-junior-dog-activity-cmuchaci-koberec-38-cm/",
"https://www.srncuvkram.cz/mibiv-oem-navigace-semi-original-2/dvd-opticka-mechanika-sim-microsd-slot-pro-mibiv-autoradio/",
"https://www.medpharma.cz/vitaminy-mineraly-specialni-pripravky/probio-imun-komplex-laktobacilu-a-bifidobakterii",
"https://www.betulin.cz/produkt/betulin-pure/",
"https://www.rcmodelari.cz/jamara-cocoon-monstertruck-4wd/",
"https://www.mojeboccia.cz/panske/boccia-titanium-hodinky--3641-02/",
"https://www.beason.cz/balicky/extrifit-cesta-na-vrchol/",
"https://www.ecstore.cz/detail/lezecke-vybaveni/ocun-via-ferrata-set-bodyguard/",
"https://www.obchod-cikom.cz/sada-vrtaku-a-bitu-26ks-darek-termohrnek-dewalt",
"http://www.jeep4x4shop.cz/wk-doplnky-tuning-vychytavky-c108/pruzina-s-tlumicem-a-uchycenim-predni-leva-jeep-wk-wh-xk-xh-i3181/",
"https://www.onai.cz/bosch-tassimo-style-tas1102/",
"https://www.kusove-koberce.cz/kusovy-koberec-star-19112-53-blue/pro4420.html",
"https://www.mimibertik.cz/prebalovaci-podlozka-mekka-50x70-denim-style-dream-catcher-blue-ceba-baby/",
"https://www.svodice-prepeti.cz/Kombinovany-svodic-prepeti-CITEL-DUT250VG-300-TT-d10.htm",
"https://www.celomed.cz/Sweet-orange-aromaterapeuticka-esence-d908.htm",
"https://www.vlastni-znamky.cz/stojanek-na-ubrousky-litinovy-k1479-1-ks.html",
"https://www.astir.cz/eshop/pokladni-kotoucky/papirove-kotoucky-s-kopii-11/kotoucek-ncr-114-70-1211",
"https://www.mojetvoreni.cz/latky/pradlo.php?m=Bavlněná%20látka%20Bagatelle%20circles%20coral%20digital%20print",
"http://www.nerezsanita.cz/Pitko-Franke-DF240WM-d1336.htm",
"https://www.obraznastenu.cz/obraz-na-platne-white-poppy-1-dilny-xobverart014e1",
"https://trnkovjanka-eshop.webnode.cz/products/produkt-1/",
"http://www.pohodadomova.sk/kotol-plynovy-zavesny/intergas-kotol-kondenzacny-hre-18-kw-046068/",
"http://eshop.virutekk.cz/sklopne-kotle-s-michadlem/virutekk-s-michadlem-praktik-350-multifunkcni-kotel/",
"https://www.skisport-shop.cz/bezky-na-klasiku/728-bezecke-lyze-sporten-perun-198-cm-supiny.html",
"http://viacell.cz/produkty/446",
"https://www.floraldesign.cz/Eucalyptus-Spiral-listy-150gr-bordo-d6441.htm",
"https://www.bemondi.sk/sk/karta/koen-kreslo-manager-,694.html",
"https://www.topventilatory.cz/ventilatory-silentis-nerez/cata-silentis-10-inox-186",
"https://www.essimo.cz/p/gelatina-2000-fair-power",
"https://www.sharp-partner.cz/regalove-reproduktory/519-bluetooth-reproduktory-eltax-monitor-iii-bt-cerna.html",
"https://www.identcore.cz/rfid-cipove-klicenky/potisknutelna-klicenka-epoxy/potisknutelna-klicenka-epoxy-151.html"]

df = pd.DataFrame.from_records([(url,) for url in urls], columns=["url"])
X = preprocess_features(df, load_scaler_from_file=True)
sgd_est = pickle.load(open(model_filename, 'rb'))
probs = sgd_est.predict_proba(X.values)
for url, probs in zip(urls, probs):
    if probs[1] < 0.55:
        print(url, probs[1])


https://www.betulin.cz/produkt/betulin-pure/ 0.28131774925153075
https://trnkovjanka-eshop.webnode.cz/products/produkt-1/ 0.3133547644033269
http://viacell.cz/produkty/446 0.2190479494117698
