In [1]:
# Shell equivalent for pinning the Spark UI port:
#   pyspark --conf "spark.ui.port=10101"

# Install PySpark and PyDrive with pip, and a headless OpenJDK 8 with apt.
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
# Presumably the install location of the JDK from the apt command above.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 43.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=7e83e659c02fda909e8171eb25cdfbc7afa85158ed50c62bd794fcf92fc42e84
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0
The following additional packages will be installed:
  openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-m

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
# Hand the application-default credentials to PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not used anywhere in the visible part of the notebook.
drive = GoogleDrive(gauth)

In [3]:
# Let's import the libraries we will need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

In [4]:
# Stop the currently active SparkContext (one is created if none exists),
# presumably so that the session below starts from a fresh context.
sc = SparkContext.getOrCreate()
sc.stop()

# create the Spark Session
spark = SparkSession.builder.getOrCreate()

# create the Spark Context
sc = spark.sparkContext

# DATA
### Upload the data and remove header

In [74]:
# Input file: exactly one of the candidates below should be left uncommented.
#filename = "small_sample_data.csv"
#filename = "sample.csv"
#filename = "transaction_data_365.csv"
filename = "transaction_data_150.csv"
#filename = "transaction_data_smaller.csv"
# RDD with one element per text line of the file
lines = sc.textFile(filename)

In [75]:
# Drop the header line. The file is re-read under its own name so that
# re-running this cell never strips another row from an already filtered RDD.
raw_lines = sc.textFile(filename)
header = raw_lines.first()
# Every line equal to the header is removed.
lines = raw_lines.filter(lambda line: line != header)

# FILE with all pairs with normalized weights `(PRODUCT PAIRS ---> WEIGHTS)`


> `product_pairs_normalized.txt`: each line holds one tuple `(prod1_ID, prod2_ID, normalized_weight)` (situation (C) in Figure 3 of the original paper).






### Helper functions
for transforming the input lines (RDD records) one at a time

In [76]:
def make_list(line, with_quantity=False):
  """
    Parse one raw line of the transaction data into a basket/product record.

    Only the first whitespace-delimited token of the line is used; it is split
    on commas and columns 1 (basket), 3 (product) and, on request, 4 (quantity)
    are converted to int.

    Input:
      line: one line of the input file, in the format of the original data
      with_quantity: when True, the quantity column is parsed as well and
        returned next to the product (the record shape that
        make_multiple_products consumes)
    Output:
      (basket_ID, product_ID) by default,
      (basket_ID, (product_ID, quantity)) when with_quantity is True
  """
  fields = line.split()[0].split(",")

  BASKET_ID = int(fields[1])
  PRODUCT_ID = int(fields[3])

  if with_quantity:
    return BASKET_ID, (PRODUCT_ID, int(fields[4]))
  return BASKET_ID, PRODUCT_ID


def make_multiple_products(line):
  """
    Repeat the product of one record once per unit bought.

    Use this function only if the quantity of products in baskets needs to be
    taken into account. make_list must then be producing records that carry the
    quantity next to the product ID.

    Input: (basket_ID, (product_ID, quantity))
    Output: (basket_ID, [product_ID, product_ID, ...]) with `quantity` entries
  """
  basket = line[0]
  product, amount = line[1][0], line[1][1]
  return (basket, amount * [product])


def generate_pairs_of_elements(line):
  """
    Emit a weighted record for every ordered pair of products in one basket.

    Input: (basket_id, products), where products is any iterable of product IDs
    Output: list of ((prod_a, prod_b), weight) records. Each entry p of the
      basket first yields its self-pair ((p, p), 0), followed by ((p, q), 1)
      for every entry q of the basket whose ID differs from p.
      An empty basket yields an empty list.
  """
  # Materialise once so that one-shot iterables can be traversed repeatedly.
  products = list(line[1])

  pairs = []
  for first in products:
    pairs.append(((first, first), 0))  # self-loops get weight 0
    pairs.extend(((first, second), 1) for second in products if second != first)
  return pairs

In [77]:
# (basket_ID, product_ID) for every data line
basket_list = lines.map(make_list)

In [78]:
# (basket_ID, [product_ID, ...]): all products recorded for the same basket
baskets_content = basket_list.groupByKey().mapValues(list)

In [79]:
# ((prod1_ID, prod2_ID), weight) records of all baskets, flattened into one RDD
generated_pairs = baskets_content.flatMap(generate_pairs_of_elements)

In [80]:
# ((prod1_ID, prod2_ID), [weight, ...]): all weights emitted for the same pair
unique_pairs = generated_pairs.groupByKey().mapValues(list)

In [81]:
# ((prod1_ID, prod2_ID), total weight). reduceByKey adds the weights up while
# combining, so the per-pair weight lists never have to be materialised.
# The totals are plain Python ints.
sum_unique_pairs = generated_pairs.reduceByKey(lambda left, right: left + right)

In [82]:
# Replace totals of 0 by 1 so that no pair keeps a zero weight
# (self-pairs are only ever emitted with weight 0, so their total is always 0).
sum_unique_pairs_correction = sum_unique_pairs.map(lambda pair: (pair[0], 1 if pair[1] == 0 else pair[1]))

In [83]:
# Only the weights, without the pair keys
W = sum_unique_pairs_correction.map(lambda line: line[1])

In [86]:
# Normalisation constant: the largest weight in W.
# W.max() is TIME-CONSUMING, so maxima that were already measured are looked
# up by input file name and only unknown files are scanned.
KNOWN_NORMS = {
  # one year of data (up to and including day 365) -- file name presumed, TODO confirm
  "transaction_data_365.csv": 1613,
  # the 150 data set
  "transaction_data_150.csv": 471,
}
# Further recorded maxima whose file names are not stated here:
#   entire data set: 3447
#   sample (Sara):   171

norm = KNOWN_NORMS[filename] if filename in KNOWN_NORMS else W.max()

In [87]:
# Divide every pair weight by the normalisation constant
normalized_unique_pairs = sum_unique_pairs_correction.map(lambda pair: (pair[0], pair[1] / norm))

# FILE with content of every basket `(BASKET --- time stamp ---> PRODUCTS)`


> `baskets_content.txt`: each line is formatted as `(Basket_ID, (time_stamp), [pairs between all products from that basket])` (bottom situation in Figure 3 of the original paper).



### Helper functions
for transforming the input lines (RDD records) one at a time

In [88]:
def make_line_basket_product(line):
  """
    Parse one raw line of the transaction data into a time-stamped basket key
    and the product bought.

    Only the first whitespace-delimited token of the line is used; it is split
    on commas and columns 1 (basket), 2 (day), 3 (product), 8 (transaction
    time) and 9 (week number) are converted to int.

    Input: one line of the input file, in the format of the original data
    Output: (basket_ID, (week_no, day, trans_time)), product_ID
  """
  fields = line.split()[0].split(",")

  BASKET_ID = int(fields[1])
  DAY = int(fields[2])
  PRODUCT_ID = int(fields[3])
  TRANS_TIME = int(fields[8])
  WEEK_NO = int(fields[9])

  return (BASKET_ID, (WEEK_NO, DAY, TRANS_TIME)), PRODUCT_ID



def KEY_generate_pairs_of_elements(line):
  """
    Emit every ordered pair of products of one basket, tagged with the basket key.

    Input: (key, products), where key is (basket_ID, (week_no, day, trans_time))
      and products is any iterable of product IDs
    Output: an iterator over (key, (prod_a, prod_b)) records. Each entry p of
      the basket first yields its self-pair (p, p), followed by (p, q) for
      every entry q of the basket whose ID differs from p. No weights are
      attached here. An empty basket yields nothing.
  """
  KEY = line[0]
  # Materialise once so that one-shot iterables can be traversed repeatedly.
  products = list(line[1])

  pairs = []
  for first in products:
    pairs.append((first, first))
    pairs.extend((first, second) for second in products if second != first)

  return ((KEY, pair) for pair in pairs)


In [89]:
# ((basket_ID, (week_no, day, trans_time)), product_ID) for every data line
house_time_products_line = lines.map(make_line_basket_product)

In [90]:
# (basket key, [product_ID, ...]): products grouped per basket and time stamp
grouped_basket = house_time_products_line.groupByKey().mapValues(list)

In [91]:
# (basket key, (prod1_ID, prod2_ID)) for every product pair within a basket
grouped_pairs = grouped_basket.flatMap(KEY_generate_pairs_of_elements)

# FILE with baskets of household `(HOUSEHOLD ---> BASKETS)`


> `household_basket.txt`: lines formatted as `(HOUSEHOLD_KEY, [all BASKET_IDs])`

### Helper functions
for transforming the input lines (RDD records) one at a time

In [92]:
def make_line_household_basket(line):
  """
    Parse one raw line of the transaction data into a household record.

    Input: one line of the input file, in the format of the original data
    Output: household_key, (basket_id, (week_no, day, trans_time))
  """
  # Only the first whitespace-delimited token holds the comma-separated columns.
  columns = line.split()[0].split(",")

  household = int(columns[0])
  basket = int(columns[1])

  week = int(columns[9])
  day = int(columns[2])
  time_of_day = int(columns[8])

  time_stamp = (week, day, time_of_day)
  return household, (basket, time_stamp)

def make_unique_list_of_vals(line):
  """
    Remove duplicate values from one (key, values) record and order the rest.

    Input: (key, values), every value being indexable with a second component
    Output: (key, distinct values sorted by their second component)
  """
  record_key = line[0]
  distinct_vals = set(line[1])
  ordered_vals = sorted(distinct_vals, key = lambda val: val[1])
  return record_key, ordered_vals

In [93]:
# (household_key, (basket_ID, (week_no, day, trans_time))) for every data line
house_basket_line = lines.map(make_line_household_basket)

In [94]:
# Distinct (household_key, (basket_ID, time stamp)) records. Grouping by key and
# immediately flattening the groups again only reproduces the input records, so
# just the de-duplication is kept and the extra shuffle is avoided.
grouped_by_house = house_basket_line.distinct()

--------
-----
-----

In [95]:
# Re-key as (pair, (pair, normalized_weight)) so that the joined value still
# carries the pair itself
normalized = normalized_unique_pairs.map(lambda x: (x[0], x))

In [96]:
# Attach the normalized weight to every (basket key, pair) record: key both
# sides by the product pair, join, then keep only the joined value, i.e.
# (basket key, (pair, normalized_weight)).
BASKET_NORMALIZED_PROD_JOINED = grouped_pairs.map(lambda line: (line[1], line[0])).join(normalized).map(lambda line: line[1])


In [97]:
# (basket key, [(pair, normalized_weight), ...])
BASKET_PROD_GROUPED = BASKET_NORMALIZED_PROD_JOINED.groupByKey().mapValues(list)

In [98]:
def sort_baskets_time(line):
  """
    Order the baskets of one household chronologically.

    Input: (household_key, baskets), each basket being
      ((basket_ID, (week_no, day, trans_time)), content)
    Output: (household_key, baskets sorted by time stamp); baskets with equal
      time stamps are ordered by basket_ID
  """
  key = line[0]
  # x[0] is the basket key (basket_ID, time stamp). Sort on the time stamp and
  # use the basket ID only as a tie-breaker; sorting on x[0][0] alone would
  # order by basket ID rather than by time.
  return key, sorted(line[1], key = lambda x: (x[0][1], x[0][0]))

In [99]:
# Attach the household to every basket: join on the basket key and re-key the
# result as (household_key, (basket key, [(pair, normalized_weight), ...])).
A = grouped_by_house.map(lambda line: (line[1], line[0])).join(BASKET_PROD_GROUPED).map(lambda line: (line[1][0], (line[0], line[1][1])))

In [100]:
# For every household, all of its baskets with their content are listed;
# the baskets are put in order by sort_baskets_time.
JOINED = A.groupByKey().mapValues(list).map(sort_baskets_time)

In [101]:
def make_prod_list(line):
  """
    Collect the products that occur in any basket of one household.

    Input: (household_key, baskets), each basket being
      (basket key, [((prod1_id, prod2_id), weight), ...])
    Output: (household_key, products), where products lists every product ID
      exactly once, in order of first appearance. Listing each ID once keeps
      the collected result small; a product would otherwise be repeated for
      every pair it takes part in.
  """
  seen = {}  # dict used as an insertion-ordered set
  for basket in line[1]:
    for pair_entry in basket[1]:
      prod1, prod2 = pair_entry[0]
      seen[prod1] = None
      seen[prod2] = None
  return line[0], list(seen)


In [102]:
# TIME-CONSUMING !!! 1 h 10 min for one year of data
# collect() returns the (household_key, products) records as a local list.
products_for_house = JOINED.map(make_prod_list).collect()

In [103]:
# Save the list above
import pickle

# NOTE(review): despite the ".gz" suffix the file is written with plain open(),
# i.e. it is an uncompressed pickle.
with open('products_for_house.pkl.gz', 'wb') as f:
    pickle.dump(products_for_house, f)

In [104]:
# Look-up table: household key -> product list, built from the collected
# (household_key, products) records.
prod_for_house = {entry[0]: entry[1] for entry in products_for_house}

In [105]:
import networkx as nx
from google.colab import files

def make_graphs(prod_for_house):
  """
    Build the mapper that turns one household record into its basket graphs.

    Input: prod_for_house, a mapping household_key -> product IDs that are
      used as the node set of every graph of that household
    Output: a function that takes (household_key, baskets), each basket being
      (basket key, [((prod1_id, prod2_id), weight), ...]), and returns
      (household_key, [one networkx Graph per basket, in input order])
  """
  def make_graphs_(line):
    house_ID = line[0]
    # These nodes are present in all graphs of this household.
    nodes = prod_for_house[house_ID]

    graphs = []
    # One graph per basket.
    for basket in line[1]:
      G = nx.Graph()
      G.add_nodes_from(nodes)

      # One weighted edge per (pair, weight) entry of the basket.
      for pair_entry in basket[1]:
        n1, n2 = pair_entry[0]
        G.add_edge(n1, n2, weight = pair_entry[1])

      # The graphs are only gathered here, not written to disk.
      graphs.append(G)

    return house_ID, graphs
  return make_graphs_

In [106]:
# Map every household record to (household_key, [graph per basket])
GRAPHS_RDD = JOINED.map(make_graphs(prod_for_house))

In [107]:
# TIME-CONSUMING!!!!
# collect() returns all (household_key, graphs) records as a local list.
GRAPHS = GRAPHS_RDD.collect()

In [108]:
# Save the list of graphs above
import pickle

# NOTE(review): despite the ".gz" suffix the file is written with plain open(),
# i.e. it is an uncompressed pickle.
with open('graphs_list.pkl.gz', 'wb') as f:
    pickle.dump(GRAPHS, f)

In [109]:
# Write every basket graph as a Pajek file Graphs/<house_ID>_<index>.net.
# The target directory is created first: nothing else in the notebook creates
# it, and writing into a missing directory fails.
os.makedirs("Graphs", exist_ok=True)
for house_ID, list_of_graphs in GRAPHS:
  for i, G in enumerate(list_of_graphs):
    nx.write_pajek(G, f"Graphs/{house_ID}_{i}.net")

In [110]:
# Pack the graph files into one archive. Assumes the notebook's working
# directory is /content, so that the relative Graphs/ folder lives there.
!zip -r /content/Graphs.zip /content/Graphs

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/Graphs/513_33.net (deflated 76%)
  adding: content/Graphs/633_10.net (deflated 75%)
  adding: content/Graphs/768_22.net (deflated 75%)
  adding: content/Graphs/1350_30.net (deflated 76%)
  adding: content/Graphs/1652_8.net (deflated 82%)
  adding: content/Graphs/2204_6.net (deflated 75%)
  adding: content/Graphs/216_5.net (deflated 74%)
  adding: content/Graphs/1941_11.net (deflated 72%)
  adding: content/Graphs/906_34.net (deflated 79%)
  adding: content/Graphs/1901_20.net (deflated 76%)
  adding: content/Graphs/724_10.net (deflated 77%)
  adding: content/Graphs/2223_158.net (deflated 76%)
  adding: content/Graphs/2057_1.net (deflated 69%)
  adding: content/Graphs/2488_4.net (deflated 81%)
  adding: content/Graphs/2419_0.net (deflated 75%)
  adding: content/Graphs/2319_85.net (deflated 75%)
  adding: content/Graphs/2478_15.net (deflated 76%)
  adding: content/Graphs/928_28.net (deflated 76%)
  adding: c

In [111]:
from google.colab import files
# Hand the archive to the Colab download helper (presumably a browser download).
files.download("/content/Graphs.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>