In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
#!pip install node2vec

Collecting node2vec
  Downloading https://files.pythonhosted.org/packages/4a/99/62a34f4c2e76a100eca742d2f909c8c18afdb68c47148a3835f94bba76a3/node2vec-0.3.0.tar.gz
Collecting gensim (from node2vec)
[?25l  Downloading https://files.pythonhosted.org/packages/82/bb/56f295a604dfafdef746cc81081ff4c6e825690de95963000300a1cd3d80/gensim-3.7.3-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.7MB)
[K     |████████████████████████████████| 24.7MB 697kB/s eta 0:00:01
Collecting tqdm (from node2vec)
  Using cached https://files.pythonhosted.org/packages/45/af/685bf3ce889ea191f3b916557f5677cc95a5e87b2fa120d74b5dd6d049d0/tqdm-4.32.1-py2.py3-none-any.whl
Collecting smart-open>=1.7.0 (from gensim->node2vec)
[?25l  Downloading https://files.pythonhosted.org/packages/bf/ba/7eaf3c0dbe601c43d88e449dcd7b61d385fe07c0167163f63f58ece7c1b5/smart_open-1.8.3.tar.gz (60kB)
[K     |████████████████████████████████| 61kB 744kB/s eta 0:00:01
[?25hCol

In [10]:
import numpy as np
import os
import csv
import zipfile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import networkx as nx

from scipy import sparse

import lightgbm
from lightgbm import LGBMClassifier

import pickle 

import logging
logging.disable(logging.INFO)

# Read Data

In [5]:
TRAIN_FILE = "train.csv"
TEST_FILE = "test.csv"
GRAPH_FILE = 'edgelist.txt'


def get_hosts_and_target():
    train_hosts = list()
    y_train = list()
    with open(TRAIN_FILE, 'r') as f:
        for line in f:
            l = line.split(',')
            train_hosts.append(l[0])
            y_train.append(l[1][:-1])

    # Read test data
    test_hosts = list()
    with open(TEST_FILE, 'r') as f:
        for line in f:
            l = line.split(',')
            test_hosts.append(l[0])

    return train_hosts, test_hosts, y_train

def get_graph_edges(train_hosts, test_hosts):
    # Create a directed graph
    G = nx.read_edgelist('edgelist.txt', delimiter=' ', 
                         create_using=nx.DiGraph())

    print(G.number_of_nodes())
    print(G.number_of_edges())

    # Create the training matrix. Each row corresponds to a web host.
    # Use the following 3 features for each web host:
    # (1) out-degree of node
    # (2) in-degree of node
    # (3) average degree of neighborhood of node
    X_train_graph = np.zeros((len(train_hosts), 3))
    avg_neig_deg = nx.average_neighbor_degree(G, 
                                              nodes=train_hosts)
    for i in range(len(train_hosts)):
        X_train_graph[i,0] = G.in_degree(train_hosts[i])
        X_train_graph[i,1] = G.out_degree(train_hosts[i])
        X_train_graph[i,2] = avg_neig_deg[train_hosts[i]]

    # Create the test matrix. Use the same 3 features as above
    X_test_graph = np.zeros((len(test_hosts), 3))
    avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_hosts)
    for i in range(len(test_hosts)):
        X_test_graph[i,0] = G.in_degree(test_hosts[i])
        X_test_graph[i,1] = G.out_degree(test_hosts[i])
        X_test_graph[i,2] = avg_neig_deg[test_hosts[i]]

    print("Train matrix dimensionality: ", X_train_graph.shape)
    print("Test matrix dimensionality: ", X_test_graph.shape)
    
    return X_train_graph, X_test_graph, G

def get_text_tfidf(train_hosts, test_hosts):
    text = dict()
    filenames = os.listdir('domains/')
    for filename in filenames:
        if filename[-4:] == '.zip':
            z = zipfile.ZipFile('domains/'+filename)
            contents = z.namelist()
            text[filename[:-4]] = ''
            for c in contents:
                f = z.open(c)
                data = f.read()
                text[filename[:-4]] += data.decode('utf16') #for windows: latin1
                f.close()

    train_data = list()
    for host in train_hosts:
        if host in text:
            train_data.append(text[host])
        else:
            train_data.append('')


    # Create the training matrix. Each row corresponds to a webpage and each column 
    # to a word present in at least 10 webpages 
    # and at most 50 webpages. The value of each entry in a row is equal to 
    # the frequency of that word in the corresponding
    # webpage       
    vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', min_df=10, max_df=50)
    X_train_text = vec.fit_transform(train_data)

    # Read webpages of the test set
    test_data = list()
    for host in test_hosts:
        if host in text:
            test_data.append(text[host])
        else:
            test_data.append('')


    # Create the test matrix following the same approach as in the case of the training matrix
    X_test_text = vec.transform(test_data)

    print("Train matrix dimensionality: ", X_train_text.shape)
    print("Test matrix dimensionality: ", X_test_text.shape)

    return X_train_text, X_test_text

In [6]:
train_hosts, test_hosts, y_train = get_hosts_and_target()
X_train_graph, X_test_graph, G = get_graph_edges(train_hosts, test_hosts)
X_train_text, X_test_text = get_text_tfidf(train_hosts, test_hosts)

X_train = sparse.hstack((X_train_text,X_train_graph))

X_test = sparse.hstack((X_test_text,X_test_graph))

print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(len(y_train)))
print('X_test shape: {}'.format(X_test.shape))

65208
1642073
Train matrix dimensionality:  (800, 3)
Test matrix dimensionality:  (200, 3)
Train matrix dimensionality:  (800, 24328)
Test matrix dimensionality:  (200, 24328)
X_train shape: (800, 24331)
y_train shape: 800
X_test shape: (200, 24331)


# Study Graph and node2vec

In [8]:
i = 0
for edge in G.edges:
    print(edge)
    i+=1
    if i==10:
        break

('blog.com.gr', 'fmvoice.gr')
('blog.com.gr', 'papakishop.gr')
('blog.com.gr', 'rizospastis.gr')
('blog.com.gr', 'taxheaven.gr')
('blog.com.gr', 'karagilanis.gr')
('blog.com.gr', 'korinthiannews.gr')
('blog.com.gr', '902.gr')
('blog.com.gr', 'dscor.gr')
('blog.com.gr', 'diogenis-press.gr')
('blog.com.gr', 'parakato.gr')


In [9]:
from node2vec import Node2Vec

# Generate walks
node2vec = Node2Vec(G, dimensions=20, walk_length=16, num_walks=100)

# Learn embeddings 
model = node2vec.fit(window=10, min_count=1)

# save the model to disk
filename = 'node2vec_model.sav'
pickle.dump(model, open(filename, 'wb'))

Computing transition probabilities: 100%|██████████| 65208/65208 [17:35<00:00, 61.81it/s]  
Generating walks (CPU: 1): 100%|██████████| 100/100 [20:20<00:00, 12.26s/it]


In [53]:
# Generate walks
node2vec = Node2Vec(G, dimensions=20, walk_length=150, num_walks=150)

# Learn embeddings 
model = node2vec.fit(window=100, min_count=1)

# save the model to disk
filename = 'node2vec_model_100.sav'
pickle.dump(model, open(filename, 'wb'))

Computing transition probabilities: 100%|██████████| 65208/65208 [19:53<00:00, 54.66it/s]   
Generating walks (CPU: 1): 100%|██████████| 150/150 [6:10:46<00:00, 17.06s/it]    


In [54]:
node2vec_model = pickle.load(open(filename, 'rb'))

In [28]:
for i in range(0, 10):
    print('{host} - {target}'.format(host=train_hosts[i], target=y_train[i]))

goalpost.gr - athlitismos
sentragoal.gr - athlitismos
orangespotters.blogspot.gr - pliroforiki-diadiktyo
xanthinea.gr - eidiseis-mme
alfahost.gr - pliroforiki-diadiktyo
tro-ma-ktiko.blogspot.gr - pliroforiki-diadiktyo
ti-einai.gr - pliroforiki-diadiktyo
perizitito.gr - katastimata-agores
bambam.gr - pliroforiki-diadiktyo
deal-deal.gr - katastimata-agores


In [34]:
for node, _ in node2vec_model.most_similar('sentragoal.gr'):
    if node in train_hosts:
        print('{host} - {target}'.format(host=node, target=y_train[train_hosts.index(node)]))
    elif node in test_hosts:
        print('{host} - In test'.format(host=node))
    else:
        print('{host} - Neither in test nor in train'.format(host=node))

newsonly.gr - Neither in test nor in train
caranddriver.gr - eidiseis-mme
sportdog.gr - In test
feed.gr - Neither in test nor in train
tff.gr - diaskedasi-psyxagogia
cinemag.gr - diaskedasi-psyxagogia
gazzetta.gr - athlitismos
sport-fm.gr - athlitismos
pmbet.blogspot.gr - Neither in test nor in train
petsonly.gr - Neither in test nor in train


  """Entry point for launching an IPython kernel.


In [36]:
for node in G.neighbors('sentragoal.gr'):
    if node in train_hosts:
        print('{host} - {target}'.format(host=node, target=y_train[train_hosts.index(node)]))
    elif node in test_hosts:
        print('{host} - In test'.format(host=node))
    else:
        print('{host} - Neither in test nor in train'.format(host=node))

aoxalkisfc.gr - Neither in test nor in train
feed.gr - Neither in test nor in train
life.gr - eidiseis-mme
epo.gr - athlitismos
sport24.gr - athlitismos
hommemagazine.gr - Neither in test nor in train
esake.gr - Neither in test nor in train
ethnos.gr - eidiseis-mme
petsonly.gr - Neither in test nor in train
novasports.gr - Neither in test nor in train
novamedia.gr - Neither in test nor in train
thecookbook.gr - Neither in test nor in train
idanikospiti.gr - Neither in test nor in train
cinemag.gr - diaskedasi-psyxagogia
flynews.gr - Neither in test nor in train
gazzetta.gr - athlitismos
fenerbahceulkerbasket.blogspot.gr - Neither in test nor in train
tff.gr - diaskedasi-psyxagogia
newsonly.gr - Neither in test nor in train
protothema.gr - eidiseis-mme
imerisia.gr - Neither in test nor in train
e-go.gr - Neither in test nor in train
caranddriver.gr - eidiseis-mme
womenonly.gr - Neither in test nor in train
sportarena.gr - athlitismos
viva.gr - Neither in test nor in train
enwsi.gr - ath

### Notes:
* It seems that not all nodes in graph exist in either train or test.
* Similarity seems to work but not 100% from neighbors we get that a sports page can be connected to news and that same we see in similarity

In [55]:
similars = {}
for train_node in train_hosts:
    for node, _ in node2vec_model.most_similar(train_node):
        try:
            if node in train_hosts:
                similars[train_node].extend([y_train[train_hosts.index(node)]])
        except:
            if node in train_hosts:
                similars[train_node] = [y_train[train_hosts.index(node)]]
        '''
        try:
            if node in train_hosts:
                similars[train_node] = similars[train_node].extend({node:y_train[train_hosts.index(node)]})
        except:
            if node in train_hosts:
                similars[train_node] = [{node:y_train[train_hosts.index(node)]}]
        '''
# Check similar categories
for key, values in similars.items():
    print('{}-{}: {}'.format(y_train[train_hosts.index(key)], key, values))

  This is separate from the ipykernel package so we can avoid doing imports until


athlitismos-goalpost.gr: ['athlitismos']
athlitismos-sentragoal.gr: ['eidiseis-mme', 'athlitismos', 'diaskedasi-psyxagogia']
pliroforiki-diadiktyo-tro-ma-ktiko.blogspot.gr: ['pliroforiki-diadiktyo', 'eidiseis-mme']
pliroforiki-diadiktyo-ti-einai.gr: ['katastimata-agores']
katastimata-agores-deal-deal.gr: ['diaskedasi-psyxagogia']
eidiseis-mme-theinsider.gr: ['pliroforiki-diadiktyo']
eidiseis-mme-voreini.gr: ['eidiseis-mme']
athlitismos-overlap.gr: ['athlitismos']
athlitismos-ioniansports.gr: ['athlitismos']
pliroforiki-diadiktyo-sibilla-gr-sibilla.blogspot.gr: ['pliroforiki-diadiktyo', 'eidiseis-mme', 'pliroforiki-diadiktyo']
eidiseis-mme-allnewz.gr: ['eidiseis-mme']
eidiseis-mme-prevezatoday.gr: ['pliroforiki-diadiktyo']
athlitismos-prismanews.gr: ['eidiseis-mme', 'eidiseis-mme']
eidiseis-mme-bankwars.gr: ['eidiseis-mme', 'eidiseis-mme', 'pliroforiki-diadiktyo', 'pliroforiki-diadiktyo']
eidiseis-mme-liberal.gr: ['eidiseis-mme', 'eidiseis-mme', 'eidiseis-mme']
katastimata-agores-homete

# Models

## Logistic Regression

In [4]:
def run_logistic(X_train, y_train, X_test):
    # Use logistic regression to classify the webpages of the test set
    clf = LogisticRegression(solver='lbfgs', multi_class='auto')
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    
    return y_pred

def run_lightgbm(X_train, y_train, X_test):
    train_set = lightgbm.Dataset(X_train, label=y_train, params={'verbose': -1})
    train_params = {
        'objective': 'binary',
        'num_leaves': 32,
        'random_seed': 3333,
        'verbose': -1,
    }

    clf = LGBMClassifier(**train_params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_test)

    return y_pred

# Write predictions to a file
def write_to_csv(file_name):
    with open(file_name+'.csv', 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        lst = clf.classes_.tolist()
        lst.insert(0, "Host")
        writer.writerow(lst)
        for i,test_host in enumerate(test_hosts):
            lst = y_pred_light[i,:].tolist()
            lst.insert(0, test_host)
            writer.writerow(lst)

In [6]:
print('Logistic: {}'.format(run_logistic(X_train, y_train, X_test).mean()))
print('LightGBM: {}'.format(run_lightgbm(X_train, y_train, X_test).mean()))



Logistic: 0.2
LightGBM: 0.2




### Graph

In [2]:
train_hosts = list()
y_train_graph = list()
with open("train.csv", 'r') as f:
    for line in f:
        l = line.split(',')
        train_hosts.append(l[0])
        y_train_graph.append(l[1][:-1])

# Read test data
test_hosts = list()
with open("test.csv", 'r') as f:
    for line in f:
        l = line.split(',')
        test_hosts.append(l[0])

# Create a directed graph
G = nx.read_edgelist('edgelist.txt', delimiter=' ', create_using=nx.DiGraph())

print(G.number_of_nodes())
print(G.number_of_edges())

# Create the training matrix. Each row corresponds to a web host.
# Use the following 3 features for each web host:
# (1) out-degree of node
# (2) in-degree of node
# (3) average degree of neighborhood of node
X_train_graph = np.zeros((len(train_hosts), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=train_hosts)
for i in range(len(train_hosts)):
    X_train_graph[i,0] = G.in_degree(train_hosts[i])
    X_train_graph[i,1] = G.out_degree(train_hosts[i])
    X_train_graph[i,2] = avg_neig_deg[train_hosts[i]]

# Create the test matrix. Use the same 3 features as above
X_test_graph = np.zeros((len(test_hosts), 3))
avg_neig_deg = nx.average_neighbor_degree(G, nodes=test_hosts)
for i in range(len(test_hosts)):
    X_test_graph[i,0] = G.in_degree(test_hosts[i])
    X_test_graph[i,1] = G.out_degree(test_hosts[i])
    X_test_graph[i,2] = avg_neig_deg[test_hosts[i]]

print("Train matrix dimensionality: ", X_train_graph.shape)
print("Test matrix dimensionality: ", X_test_graph.shape)

# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X_train_graph, y_train_graph)
y_pred = clf.predict_proba(X_test_graph)

# Write predictions to a file
with open('sample_submission_graph.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i,test_host in enumerate(test_hosts):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)

65208
1642073
Train matrix dimensionality:  (800, 3)
Test matrix dimensionality:  (200, 3)


## Text

In [8]:
train_hosts = list()
y_train_text = list()
with open("train.csv", 'r') as f:
    for line in f:
        l = line.split(',')
        train_hosts.append(l[0])
        y_train_text.append(l[1][:-1])

test_hosts = list()
with open("test.csv", 'r') as f:
    for line in f:
        l = line.split(',')
        test_hosts.append(l[0])


text = dict()
filenames = os.listdir('domains/')
for filename in filenames:
    if filename[-4:] == '.zip':
        z = zipfile.ZipFile('domains/'+filename)
        contents = z.namelist()
        text[filename[:-4]] = ''
        for c in contents:
            f = z.open(c)
            data = f.read()
            text[filename[:-4]] += data.decode('utf16') #for windows: latin1
            f.close()

train_data = list()
for host in train_hosts:
    if host in text:
        train_data.append(text[host])
    else:
        train_data.append('')

    
# Create the training matrix. Each row corresponds to a webpage and each column to a word present in at least 10 webpages 
# and at most 50 webpages. The value of each entry in a row is equal to the frequency of that word in the corresponding
# webpage       
vec = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', min_df=10, max_df=50)
X_train_text = vec.fit_transform(train_data)

# Read webpages of the test set
test_data = list()
for host in test_hosts:
    if host in text:
        test_data.append(text[host])
    else:
        test_data.append('')


# Create the test matrix following the same approach as in the case of the training matrix
X_test_text = vec.transform(test_data)

print("Train matrix dimensionality: ", X_train_text.shape)
print("Test matrix dimensionality: ", X_test_text.shape)

# Use logistic regression to classify the webpages of the test set
clf = LogisticRegression(solver='lbfgs', multi_class='auto')
clf.fit(X_train_text, y_train_text)
y_pred = clf.predict_proba(X_test_text)

# Write predictions to a file
with open('sample_submission_text.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = clf.classes_.tolist()
    lst.insert(0, "Host")
    writer.writerow(lst)
    for i,test_host in enumerate(test_hosts):
        lst = y_pred[i,:].tolist()
        lst.insert(0, test_host)
        writer.writerow(lst)

Train matrix dimensionality:  (800, 24328)
Test matrix dimensionality:  (200, 24328)


In [17]:
X_train = sparse.hstack((X_train_text,X_train_graph))
y_train = y_train_text

X_test = sparse.hstack((X_test_text,X_test_graph))

print('X_train shape: {}'.format(X_train.shape))
print('y_train shape: {}'.format(len(y_train)))
print('X_test shape: {}'.format(X_test.shape))

X_train shape: (800, 24331)
y_train shape: 800
X_test shape: (200, 24331)


# Playground

In [38]:
set(y_train)

{'athlitismos',
 'diaskedasi-psyxagogia',
 'eidiseis-mme',
 'katastimata-agores',
 'pliroforiki-diadiktyo'}

In [16]:
hosts = train_hosts + test_hosts
names = [h.split('.')[0] for h in hosts]

In [18]:
from collections import Counter


c = Counter()
for name in names:
    n = len(name)
    c.update(name[i:j] for i in range(n) for j in range(i + 1, n) if j - i > 4)

In [34]:
c.most_common()

[('sport', 37),
 ('aspor', 10),
 ('greek', 10),
 ('baske', 9),
 ('press', 7),
 ('pharm', 6),
 ('pharma', 6),
 ('harma', 6),
 ('asport', 6),
 ('porta', 6),
 ('basket', 6),
 ('asket', 6),
 ('laris', 6),
 ('trika', 6),
 ('trikal', 6),
 ('rikal', 6),
 ('plane', 6),
 ('sports', 6),
 ('ports', 6),
 ('cinem', 5),
 ('marke', 5),
 ('repor', 5),
 ('trikala', 5),
 ('rikala', 5),
 ('ikala', 5),
 ('lariss', 5),
 ('ariss', 5),
 ('-spor', 5),
 ('magaz', 5),
 ('volle', 5),
 ('volley', 5),
 ('olley', 5),
 ('music', 5),
 ('fitne', 4),
 ('fitnes', 4),
 ('itnes', 4),
 ('beaut', 4),
 ('perie', 4),
 ('ellin', 4),
 ('onlin', 4),
 ('athin', 4),
 ('pharmac', 4),
 ('harmac', 4),
 ('armac', 4),
 ('seaso', 3),
 ('dikai', 3),
 ('groti', 3),
 ('onian', 3),
 ('radio', 3),
 ('iaspo', 3),
 ('iaspor', 3),
 ('fashi', 3),
 ('fashio', 3),
 ('ashio', 3),
 ('famil', 3),
 ('sporta', 3),
 ('perier', 3),
 ('perierg', 3),
 ('erier', 3),
 ('erierg', 3),
 ('rierg', 3),
 ('anekd', 3),
 ('anekdo', 3),
 ('anekdot', 3),
 ('nekdo', 3)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [None]:
from sklearn.model_selection import GridSearchCV

