In [1]:
# pyspark --conf “spark.ui.port=10101”

!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 41 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 45.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=a9d26e2fa5fa69e5487f0c6b2edb56908b80f6592a3ee81abd5cee8a291d9e5a
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0
The following additional packages will be installed:
  openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-m

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
# Sign the current Colab user in.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced by the cells visible below - confirm
# whether this authentication step is still needed.
drive = GoogleDrive(gauth)

In [3]:
# Let's import the libraries we will need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

In [4]:
# Fetch the SparkContext that is already running (or create one) and stop it,
# so that the session below is built after any previous context has been shut down.
sc = SparkContext.getOrCreate()
sc.stop()

# create the Spark Session
spark = SparkSession.builder.getOrCreate()

# create the Spark Context
# (`sc` is rebound to the context owned by the session created above)
sc = spark.sparkContext

# DATA
### Upload the data and remove header

In [5]:
# Input CSV file. Swap in the commented-out name to process the other input file.
filename = "small_sample_data.csv"
#filename = "transaction_data.csv"

# RDD of strings, one element per line of the file.
lines = sc.textFile(filename)

In [6]:
# Always start from a fresh read of the file so this cell can be re-run safely:
# filtering `lines` in place would, on a second run, take the first data row
# as the "header" and drop it as well.
raw_lines = sc.textFile(filename)
header = raw_lines.first()
# `header` is bound as a default argument so the filter keeps this exact value
# even if the global name is reassigned before the RDD is evaluated.
lines = raw_lines.filter(lambda line, header=header: line != header)

# FILE with all pairs with normalized weights `(PRODUCT PAIRS ---> WEIGHTS)`


> `product_pairs_normalized.txt`: one line represents one tuple `((prod1_ID, prod2_ID), normalized_weight)` (situation (C) in Figure 3 of the original paper).






### Helper functions
applied record by record to the RDDs (the data is held in RDDs of lines and tuples, not in data frames)

In [7]:
def make_list(line, with_quantity=False):
  """
    Parse one raw CSV line of the transaction data.

    Input: one line of df of format as in original data
           (field 1 is read as basket ID, field 3 as product ID, field 4 as quantity)
    with_quantity: when True, the quantity field is parsed and returned as well
    Output: a pair (basket_ID, product_ID), or
            (basket_ID, (product_ID, quantity)) when with_quantity is True
    Raises: IndexError if the line is empty or has too few fields,
            ValueError if a parsed field is not an integer
  """
  # Only the first whitespace-delimited token of the line is parsed.
  fields = line.split()[0].split(",")

  BASKET_ID = int(fields[1])
  PRODUCT_ID = int(fields[3])

  if with_quantity:
    # The quantity is parsed only on request, so the default path never fails
    # on a quantity value it does not use.
    return BASKET_ID, (PRODUCT_ID, int(fields[4]))
  return BASKET_ID, PRODUCT_ID


# Use this function only if the quantity of products in baskets needs to be taken into account.
# It expects the records produced by make_list(line, with_quantity=True).
def make_multiple_products(line):
  """
    Input: (basket_ID, (product_ID, quantity))
    Output: (basket_ID, [product_ID, ...]) with product_ID repeated `quantity` times
            (an empty list when quantity is zero or negative)
  """
  BASKET_ID = line[0]
  PRODUCT_ID = line[1][0]
  QUANTITY = line[1][1]

  return BASKET_ID, QUANTITY * [PRODUCT_ID]


def generate_pairs_of_elements(line):
  """
    Input: one line of format: (basket_id, [prod1_id, prod2_id, ... prodn_id])
    Output: list of ((id_a, id_b), weight) entries. For every product, in order,
            its self-loop ((id, id), 0) comes first, followed by ((id, other_id), 1)
            for each product of the basket whose ID differs from it.
            An empty product list gives an empty result.
  """
  product_ids = line[1]

  weighted_pairs = []
  for first in product_ids:
    # self-loop entry, weight 0
    weighted_pairs.append(((first, first), 0))
    # entries towards every differing product, weight 1
    weighted_pairs.extend(
      ((first, second), 1) for second in product_ids if first != second
    )
  return weighted_pairs

In [8]:
# One (basket_ID, product_ID) pair per data line.
basket_list = lines.map(make_list)

In [9]:
# Gather the product IDs of each basket into one list: (basket_ID, [product_ID, ...]).
baskets_content = basket_list.groupByKey().mapValues(list)

In [10]:
# Flatten every basket into its ((prod_a, prod_b), weight) entries.
generated_pairs = baskets_content.flatMap(generate_pairs_of_elements)

In [11]:
# For every product pair, the list of weights emitted for it across all baskets.
unique_pairs = generated_pairs.groupByKey().mapValues(list)

In [12]:
# Total weight per product pair: (pair, total).
# reduceByKey adds the weights up while combining, so the full per-pair weight
# lists do not have to be built by a groupByKey first. The totals are plain
# Python ints.
sum_unique_pairs = generated_pairs.reduceByKey(lambda left, right: left + right)

In [13]:
# Replace a total weight of 0 by 1 and keep every other total unchanged.
# (generate_pairs_of_elements gives weight 0 only to self-loop pairs.)
sum_unique_pairs_correction = sum_unique_pairs.map(lambda pair: (pair[0], 1 if pair[1] == 0 else pair[1]))

In [14]:
# The weights alone, without their pair keys.
W = sum_unique_pairs_correction.map(lambda line: line[1])

In [15]:
# Largest weight; used as the normalization constant below.
norm = W.max()

In [16]:
# Divide every weight by the largest one: (pair, weight / norm).
normalized_unique_pairs = sum_unique_pairs_correction.map(lambda pair: (pair[0], pair[1] / norm))

In [17]:
#normalized_unique_pairs.saveAsTextFile("product_pairs_normalized.txt")

# FILE with content of every basket `(BASKET --- time stamp ---> PRODUCTS)`


> `baskets_content.txt`: lines formatted as `((Basket_ID, (week_no, day, trans_time)), [pairs between all products from that basket])`, bottom situation in Figure 3 of the original paper.



### Helper functions
applied record by record to the RDDs (the data is held in RDDs of lines and tuples, not in data frames)

In [18]:
def make_line_basket_product(line):
  """
    Parse one raw CSV line into a (basket, time stamp) key and its product.

    Input: one line of df of format as in original data
           (fields read: 1 = basket ID, 2 = day, 3 = product ID,
            8 = transaction time, 9 = week number)
    Output: (basket_ID, (week_no, day, trans_time)), product_ID
    Raises: IndexError if the line is empty or has fewer than 10 fields,
            ValueError if one of the parsed fields is not an integer
  """
  # Only the first whitespace-delimited token of the line is parsed.
  fields = line.split()[0].split(",")

  BASKET_ID = int(fields[1])
  PRODUCT_ID = int(fields[3])

  WEEK_NO = int(fields[9])
  DAY = int(fields[2])
  TRANS_TIME = int(fields[8])

  # The household key (field 0) is not part of the output and is not parsed.
  return (BASKET_ID, (WEEK_NO, DAY, TRANS_TIME)), PRODUCT_ID



def KEY_generate_pairs_of_elements(line):
  """
    Input: one line of format: (key, [prod1_id, prod2_id, ... prodn_id]),
           where key is (basket_ID, (week_no, day, trans_time))
    Output: (key, pairs). For every product, in order, pairs holds its self-pair
            (id, id) followed by (id, other_id) for each product of the basket
            whose ID differs from it. No weights are attached here.
            An empty product list gives (key, []).
  """
  KEY = line[0]
  PRODUCTS_IDS = line[1]

  pairs = []

  # The loop also covers the single-product case, which yields just the self-pair.
  for ID1 in PRODUCTS_IDS:
    pairs.append((ID1, ID1))
    for ID2 in PRODUCTS_IDS:
      if ID1 != ID2:
        pairs.append((ID1, ID2))

  return KEY, pairs


In [113]:
# One ((basket_ID, (week_no, day, trans_time)), product_ID) record per data line.
house_time_products_line = lines.map(make_line_basket_product)

[((26984851472, (1, 1, 1631)), 1004906),
 ((26984851472, (1, 1, 1631)), 1033142),
 ((26984851472, (1, 1, 1631)), 1036325),
 ((26984851472, (1, 1, 1631)), 1082185),
 ((26984851472, (1, 1, 1631)), 8160430)]

In [20]:
# Gather the product IDs that share the same (basket_ID, time stamp) key into one list.
grouped_basket = house_time_products_line.groupByKey().mapValues(list)

In [21]:
# (key, [product pairs of that basket]) - one record per basket key.
grouped_pairs = grouped_basket.map(KEY_generate_pairs_of_elements)

In [22]:
#grouped_pairs.saveAsTextFile("basket_content.txt")

# FILE with baskets of household `(HOUSEHOLD ---> BASKETS)`


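> `household_basket.txt`: lines formatted as `(HOUSEHOLD_KEY, [(BASKET_ID, (week_no, day, trans_time)), ...])`, i.e. all distinct baskets of the household together with their time stamps.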

### Helper functions
applied record by record to the RDDs (the data is held in RDDs of lines and tuples, not in data frames)

In [23]:
def make_line_household_basket(line):
  """
    Turn one raw CSV line into a household-keyed basket record.

    Input: one line of df of format as in original data
    Output: household_key, (basket_id, (week_no, day, trans_time))
  """
  # Only the first whitespace-delimited token of the line is parsed.
  columns = line.split()[0].split(",")

  household = int(columns[0])
  basket = int(columns[1])
  # time stamp = (week number, day, transaction time)
  time_stamp = (int(columns[9]), int(columns[2]), int(columns[8]))

  return household, (basket, time_stamp)

def make_unique_list_of_vals(line):
  """
    Input: (key, values)
    Output: (key, list of the distinct values); the order of the list is
            whatever iterating the intermediate set yields
  """
  distinct_vals = set(line[1])
  return line[0], list(distinct_vals)

In [24]:
# One (household_key, (basket_ID, (week_no, day, trans_time))) record per data line.
house_basket_line = lines.map(make_line_household_basket)

In [25]:
# All basket records of a household in one list (one entry per data line, so
# a basket appears once for every product line it has).
grouped_by_house = house_basket_line.groupByKey().mapValues(list)
# Drop the repeated basket records of each household.
grouped_by_house_unique = grouped_by_house.map(make_unique_list_of_vals)

In [26]:
#grouped_by_house_unique.saveAsTextFile("household_basket.txt")

--------
-----
-----

In [148]:
# Bring every (pair, normalized_weight) record to the driver as a Python list.
LIST1 = normalized_unique_pairs.collect()

In [151]:
  # Lookup table: product pair -> (pair, normalized weight).
  dct1 = {element[0]: (element[0], element[1]) for element in LIST1}

In [152]:
def change_pairs(dictionary):
  """
    Build a mapper for (key, pairs) records.

    Input: dictionary mapping each pair to its replacement value
    Output: a function that takes (key, [pair, ...]) and returns
            (key, [dictionary[pair], ...]); a pair missing from the
            dictionary raises KeyError
  """
  def lookup_pairs(record):
    replaced = list(map(dictionary.__getitem__, record[1]))
    return record[0], replaced
  return lookup_pairs

In [153]:
# JOIN NORMALIZED_UNIQUE_PAIRS and GROUPED_PAIRS !
# Every pair of a basket is replaced by its (pair, normalized weight) entry from dct1.
# NOTE(review): dct1 is captured by the mapper's closure, so it travels with
# the function that is sent to the workers.
basket_normalized_pairs = grouped_pairs.map(change_pairs(dct1))

In [166]:
# Peek at one (household_key, [basket records]) entry.
grouped_by_house_unique.take(1)

[(1364, [(26984896261, (1, 1, 1520))])]

In [156]:
# Bring every (basket key, weighted pairs) record to the driver as a Python list.
LIST2 = basket_normalized_pairs.collect()

In [158]:
  # Lookup table: basket key -> (basket key, weighted pairs of that basket).
  dct2 = {element[0]: (element[0], element[1]) for element in LIST2}

In [161]:
def join_all(dictionary):
  """
    Build a mapper for (key, baskets) records.

    Input: dictionary mapping each basket key to its replacement value
    Output: a function that takes (key, [basket, ...]) and returns
            (key, [dictionary[basket], ...]); a basket missing from the
            dictionary raises KeyError
  """
  def lookup_baskets(record):
    expanded = list(map(dictionary.__getitem__, record[1]))
    return record[0], expanded
  return lookup_baskets

In [164]:
# JOIN ALL
# Every basket record of a household is replaced by its dct2 entry, giving
# (household_key, [(basket key, weighted pairs), ...]).
joined = grouped_by_house_unique.map(join_all(dct2))

In [167]:
#joined.saveAsTextFile("JOINED.txt")