In [1]:
import pyspark
import sys
from pyspark.sql import SparkSession
from pyspark.context import SparkContext , SparkConf
import random
import time

In [2]:
# Spark configuration object; it is handed to the SparkContext constructor in a later cell.
conf = SparkConf().setAppName("PySpark App")
# Executor heartbeat interval, given with an explicit unit suffix ("s").
conf.set("spark.executor.heartbeatInterval","1000s")
# NOTE(review): unlike the heartbeat setting above, this value carries no unit
# suffix — presumably it is meant as seconds; confirm against the Spark
# configuration docs and consider writing "2000s" explicitly.
conf.set("spark.network.timeout","2000")

<pyspark.conf.SparkConf at 0x1ecdad2dda0>

In [3]:
# File names of the input files. They are used as default arguments of
# load_data / load_data_spark and are opened as given (no directory prefix).
data_train = "lyrl2004_vectors_train.dat"      # training vectors
topic_files = "rcv1-v2.topics.qrels"           # category <-> document id assignments
data_test_0 = "lyrl2004_vectors_test_pt0.dat"  # test vectors, part 0
data_test_1 = "lyrl2004_vectors_test_pt1.dat"  # test vectors, part 1

In [4]:
# Reuse the running SparkContext when this cell is executed again instead of
# constructing a second one (which raises an error); on a fresh kernel this
# creates the context with the 'local[8]' master and the settings from `conf`.
sc = SparkContext.getOrCreate(conf.setMaster('local[8]'))

In [5]:
def generate_dictionary(datapoint):
    '''
    Build a sparse feature dictionary from one tokenized datapoint.
    Adapted from the Hogwild python implementation.

    datapoint: iterable of "index:value" strings
    returns: dict mapping int feature index -> float value; key 0 is always
             present with value 1.0 and acts as the bias term

    Blank tokens (as produced by splitting on single spaces when the line
    contains trailing or repeated spaces) are ignored instead of crashing.
    '''
    features = {0: 1.0}  # Adding the bias
    for token in datapoint:
        if not token.strip():
            # Nothing to parse: skip empty / whitespace-only tokens.
            continue
        parts = token.split(':')
        features[int(parts[0])] = float(parts[1])
    return features

In [6]:
def load_data_spark(sc,data_ = data_train, topics_path=topic_files, selected_cat='CCAT'):
    '''
    Load a dataset file through Spark and label it for one category.

    sc : spark context used to read the file
    data_ : path of the vectors file
    topics_path : path of the topics file passed to get_category_dict
    selected_cat : category treated as the positive class

    returns: (data, labels) where
        data   is an RDD of sparse feature dictionaries (not collected)
        labels is a plain list, collected on the driver, holding 1 when the
               document belongs to selected_cat and -1 otherwise
    '''
    # str.strip() with no argument removes surrounding whitespace; the former
    # strip("") passed an empty character set and therefore removed nothing.
    lines = sc.textFile(data_) \
        .map(lambda line: line.strip()) \
        .filter(lambda line: line)  # drop blank lines, they carry no document
    fields = lines.map(lambda line: line.split(' '))

    # First token is the document id; the features start at index 2.
    doc_ids = fields.map(lambda tokens: int(tokens[0])).collect()
    data = fields.map(lambda tokens: generate_dictionary(tokens[2:]))

    cat = get_category_dict(topics_path)
    labels = [1 if selected_cat in cat[doc_id] else -1 for doc_id in doc_ids]

    return data, labels

In [None]:
# Leftover cell that was never executed (no execution count). The triple-quoted
# string below only wraps disabled code; none of the statements inside it run.
'''
data_train = data_train.collect()
data_test_0 = data_test_0.collect()
data_test_1 = data_test_1.collect()
'''

In [7]:
def load_data(data_ = data_train, topics_path=topic_files, selected_cat='CCAT'):
    '''
    Load a dataset file in plain Python (no Spark) and label it for one category.

    data_ : path of the vectors file
    topics_path : path of the topics file passed to get_category_dict
    selected_cat : category treated as the positive class

    returns: (data, labels) where
        data   is a list of sparse feature dictionaries, one per document
        labels is a list holding 1 when the document belongs to selected_cat
               and -1 otherwise, in the same order as data
    '''
    doc_ids = []
    data = []
    with open(data_) as f:
        # Stream the file once instead of materialising several full copies.
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines carry no document; skip them rather than fail on int('').
                continue
            tokens = stripped.split(' ')
            doc_ids.append(int(tokens[0]))
            # Features start at index 2 — presumably index 1 is the empty token
            # left by a double space after the id; verify against the data files.
            data.append(generate_dictionary(tokens[2:]))

    cat = get_category_dict(topics_path)
    labels = [1 if selected_cat in cat[doc_id] else -1 for doc_id in doc_ids]

    return data, labels

In [8]:
def get_category_dict(topics_path):
    ''' Generates the category dictionary using the topics file from:
    http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm
    From Hogwild python implementation

    topics_path: path of a text file whose lines start with
                 "<category> <document id>" separated by single spaces
    returns: dict mapping int document id -> list of category names, in the
             order they appear in the file

    Blank lines are ignored.
    '''
    categories = {}
    with open(topics_path) as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # An empty line has no id column; skip it instead of raising IndexError.
                continue
            fields = stripped.split(' ')
            doc_id = int(fields[1])
            categories.setdefault(doc_id, []).append(fields[0])
    return categories

In [9]:
data_train, labels_train = load_data()
data_test_0, labels_test_0 = load_data(data_= data_test_0)
data_test_1, labels_test_1 = load_data(data_= data_test_1)

In [10]:
data_train += data_test_0
labels_train += labels_test_0

In [11]:
data_test = data_test_1
labels_test = labels_test_1

In [35]:
# Model weights as a sparse dict (feature index -> weight); updated in place by the training loop.
w = {}
# NOTE(review): not referenced by any of the visible cells — confirm whether it is still needed.
num_examples = 1
# Number of slices the mini-batch is split into when it is parallelized.
n_workers = 5
# Samples handled by each slice per iteration.
batch_per_worker = 1000
# Total mini-batch size drawn per iteration.
batch_size = batch_per_worker * n_workers
# Multiplies the hinge-loss part of the gradient in calculate_stochastic_gradient
# and the squared-weight term in calculate_primal_objective.
lambda_ = 0.03/batch_size
# Regularisation coefficient, read as a global by calculate_stochastic_gradient.
reg = 0.00001
# Number of iterations executed by the training loop.
n_iterations = 100

In [36]:
def dotproduct(x, w):
    ''' Dot product of two sparse vectors stored as dicts.

    Only the keys of x are visited; a key missing from w counts as 0.
    '''
    total = 0
    for key, value in x.items():
        total += value * w.get(key, 0)
    return total

def sign(x):
    ''' Return 1 for positive x, -1 for negative x and 0 otherwise. '''
    if x > 0:
        return 1
    if x < 0:
        return -1
    return 0

In [37]:
def accuracy(iterator):
    """
    Count matching pairs in an iterator of pair collections.

    iterator: iterable whose items are iterables of (predicted, expected) pairs
    yields: a single integer, the number of pairs whose two entries are equal
    """
    yield sum(
        1
        for chunk in iterator
        for pair in chunk
        if pair[0] == pair[1]
    )

In [38]:
def hinge_loss(y,x,w):
    '''
    Hinge loss of a single sample: max(1 - y * <x, w>, 0)
    y: label
    x: sparse feature vector (dict)
    w: sparse weights vector (dict)
    '''
    margin = y * dotproduct(x, w)
    return max(1 - margin, 0)

In [39]:
def calculate_primal_objective(y,x,w,lambda_,batchsize):
    """
    Per-sample contribution to the primal objective: the hinge loss of the
    sample plus the squared-weight regularizer, both divided by batchsize.
    """
    data_term = hinge_loss(y, x, w) / batchsize

    squared_weights = 0
    for weight in w.values():
        squared_weights += (weight**2)/batchsize

    return data_term + lambda_ / 2 * squared_weights

In [40]:
def calculate_stochastic_gradient(x_n,y_n, w, lambda_):
    """compute the stochastic gradient of loss plus regularizer for one sample.

    x_n: sparse feature vector (dict: feature index -> value)
    y_n: label of the sample
    w: sparse weights vector (dict: feature index -> weight)
    lambda_: factor applied to the hinge-loss part of the gradient

    returns: dict with one entry per key of x_n. Each entry is
        lambda_ * (hinge subgradient) + reg * w[k]
    where the hinge subgradient is -x_n[k] * y_n when the sample violates the
    margin and 0 otherwise, and `reg` is the module-level regularisation
    coefficient.

    The regularizer term is reg * w[k], i.e. the derivative of
    (reg / 2) * w[k]**2. The former w[k]**2 * reg was not the gradient of the
    squared-weight penalty and was never negative, so it could not shrink
    negative weights towards zero.
    """
    # a datapoint is a support vector if max{} in the hinge loss is not 0
    is_support = y_n * dotproduct(x_n, w) < 1

    grad = {}
    for k, v in x_n.items():
        loss_part = -v * y_n if is_support else 0
        # Only coordinates present in the sample are updated (sparse update).
        grad[k] = lambda_ * loss_part + reg * w.get(k, 0)

    return grad

In [41]:
def prediction(iterator,w):
    """
    Predict a label for every (features, label) item of the iterator.

    yields: a single list of (predicted, expected) tuples, where predicted is
    1 when the dot product of the features with w is positive and -1 otherwise
    """
    yield [
        (1 if dotproduct(sample[0], w) > 0 else -1, sample[1])
        for sample in iterator
    ]

In [42]:
def train(iterator,w):
    """
    Sum the per-sample stochastic gradients of every (features, label) item.

    The gradients are evaluated at the weights w with the module-level lambda_.
    yields: a single dict (feature index -> summed gradient)
    """
    accumulated = {}
    for sample in iterator:
        sample_grad = calculate_stochastic_gradient(sample[0], sample[1], w, lambda_)
        for key, value in sample_grad.items():
            accumulated[key] = value + accumulated.get(key, 0)
    yield accumulated

In [43]:
def batch_iter(x,y,batch_size):
    """
    Draw batch_size distinct positions at random and return the matching
    entries of x and y as two lists in the same order.
    """
    picked = random.sample(range(len(x)),batch_size)
    x_batch = [x[position] for position in picked]
    y_batch = [y[position] for position in picked]
    return x_batch, y_batch

In [44]:
def compute_loss(iterator,w,lambda_,batchsize):
    """
    Add up the primal objective of every (features, label) item of the iterator.

    yields: a single number, the summed objective
    """
    yield sum(
        calculate_primal_objective(sample[1], sample[0], w, lambda_, batchsize)
        for sample in iterator
    )

In [45]:
def _batch_accuracy(features, targets, weights):
    """Fraction of (features, targets) pairs that `weights` classifies correctly.

    The pairs are split into n_workers slices; every slice predicts and counts
    its own matches, and the per-slice counts are summed on the driver.
    """
    n_correct = sc.parallelize(zip(features, targets), numSlices= n_workers) \
        .mapPartitions(lambda it: prediction(it, weights)) \
        .mapPartitions(lambda it: accuracy(it)) \
        .reduce(lambda x,y: x+y)
    return n_correct/len(targets)


# Accumulated wall-clock time of the gradient steps only (evaluation is not timed).
total_time = 0
for i in range(n_iterations):

    print('we are at the iteration : {}'.format(i))

    # computing the gradient
    start = time.time()
    x_batch, y_batch = batch_iter(data_train,labels_train,batch_size= batch_size)

    # Every slice yields one dict of summed gradients; reduce merges them key-wise.
    sgd = sc.parallelize(zip(x_batch,y_batch),numSlices= n_workers) \
        .mapPartitions(lambda it: train(it,w)) \
        .reduce(lambda x,y:{k: x.get(k, 0) + y.get(k, 0) for k in set(x) | set(y)})

    # Gradient step applied on the driver, in place on the shared weights dict.
    for k,v in sgd.items():
        w[k] = w.get(k, 0) - v

    end = time.time()
    print(end - start)
    total_time += end - start

    print("compute loss")
    loss = sc.parallelize(zip(x_batch,y_batch),numSlices= n_workers) \
        .mapPartitions(lambda it: compute_loss(it,w,lambda_,batch_size)) \
        .reduce(lambda x,y: x+y)
    print('the model as a loss of:')
    print(loss)

    print("compute train accuracy")
    acc = _batch_accuracy(x_batch, y_batch, w)
    print('the model as achieved an accuracy of:')
    print(acc)

    print('computing test accuracy')
    # computing the accuracy on a freshly sampled test batch
    test_x_batch, test_y_batch = batch_iter(data_test,labels_test,batch_size= batch_size)
    acc = _batch_accuracy(test_x_batch, test_y_batch, w)
    print('the model as achieved an accuracy of:')
    print(acc)

we are at the iteration : 0
2.659883737564087
compute loss
the model as a loss of:
0.9998894386245216
compute train accuracy
the model as achieved an accuracy of:
0.714
computing test accuracy
the model as achieved an accuracy of:
0.6548
we are at the iteration : 1
2.6409389972686768
compute loss
the model as a loss of:
0.9997860067019285
compute train accuracy
the model as achieved an accuracy of:
0.8214
computing test accuracy
the model as achieved an accuracy of:
0.8056
we are at the iteration : 2


KeyboardInterrupt: 