In [1]:
!pip install pyspark
!pip install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 28 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 36.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=0c9f303e907b1a11bca73ed885b00737073073eddad402178856b067ddad6223
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark

In [50]:
# Current file = queries.csv

# Transform the file to tsv format
import csv

with open('queries.csv','r') as csvin, open('queries.tsv', 'w') as tsvout:
    csvin = csv.reader(csvin)
    tsvout = csv.writer(tsvout, delimiter='\t')
    for row in csvin:
        tsvout.writerow(row)

from pyspark.sql import SparkSession,Row, Column
import pyspark.sql.functions as F
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
queries_file = 'queries.tsv'
df = spark.read.csv(queries_file, header='True', inferSchema='True', sep='\t')


column_names = ["genres", "lang", "actors", "director", "cities", "country", 
                "from_realese_date", "production_company"]
# For all the above columns
for name in column_names:
  temp_name_1 = name + "1"
  temp_name_2 = name + "2"
  # Removing irrelevant chars
  df = df.select("*", F.translate(F.col(name), "'[]", "")\
                .alias(temp_name_1))\
  .drop(name)

  # Converting arrays strings to arrays of strings
  df = df.select("*", F.split(F.col(temp_name_1),",").alias(temp_name_2)) \
      .drop(temp_name_1)
  df = df.withColumnRenamed(temp_name_2,name)

print("Queries table:")
df.show()
queries_df = df

# ------------------------------------------------------------------------------

# Current file: credits.csv
import re
credits_file = 'credits.csv'
df = spark.read.csv(credits_file, header='True', inferSchema='True')

# Load the data (given code)
credits = spark.read.format("csv")\
.option("delimiter", "\t")\
.option("header","true")\
.option("inferSchema", "true")\
.load("credits.csv")
prog = re.compile('\\[(.*?)\\]')
second_match = F.udf(lambda x: prog.findall(x)[1])
id_extract = F.udf(lambda x: x.split(",")[-1])
credits = credits\
.withColumn("id", id_extract("cast,crew,id"))\
.withColumn("cast", F.regexp_extract(F.col("cast,crew,id"), '\\[(.*?)\\]', 0
))\
.withColumn("crew", F.concat(F.lit("["),second_match("cast,crew,id"), F.lit(
"]")))\
.select("cast", "crew", "id")

df = credits
column_names = ["cast", "crew"]
# For all the above columns
for name in column_names:
  temp_name_1 = name + "1"
  temp_name_2 = name + "2"
  # Removing irrelevant chars
  df = df.select("*", F.translate(F.col(name), "\\{\\[\\]'\\}", "")\
                .alias(temp_name_1))\
  .drop(name)

  # Converting arrays strings to arrays of strings
  df = df.select("*", F.split(F.col(temp_name_1),",").alias(temp_name_2)) \
      .drop(temp_name_1)
  df = df.withColumnRenamed(temp_name_2,name)

# For cast column - udf for extracting actors' names only from cast json string
actors_udf = F.udf(lambda arr: [arr[i][7:] for i in range(len(arr)) if i % 8 == 5])
df = df.withColumn('actors', actors_udf(F.col("cast")))\
  .drop("cast")

# For crew column - udf for extracting directors' names only from crew json string
directors_udf = F.udf(lambda arr: [arr[i+1][7:] for i in range(len(arr))
 if arr[i] == " job: Director"])
df = df.withColumn('directors', directors_udf(F.col("crew")))\
  .drop("crew")

# Converting arrays strings to arrays of strings
column_names = ["actors", "directors"]
for name in column_names:
  temp_name_1 = name + "1"
  temp_name_2 = name + "2"
  # Removing irrelevant chars
  df = df.select("*", F.translate(F.col(name), "\\{\\[\\]'\\}", "")\
                .alias(temp_name_1))\
  .drop(name)

  # Converting arrays strings to arrays of strings
  df = df.select("*", F.split(F.col(temp_name_1),",").alias(temp_name_2)) \
      .drop(temp_name_1)
  df = df.withColumnRenamed(temp_name_2,name)

print("Credits table:")
df.show()
credits_df = df

# ------------------------------------------------------------------------------

# Current file: movies.csv
movies_file = 'movies.csv'
df = spark.read.csv(movies_file, header='True', inferSchema='True')

# Doing the same process for all columns
column_names = ["genres", "production_companies", "production_countries", 
                "spoken_languages", "cities"]
for name in column_names:
  temp_name_1 = name + "1"
  temp_name_2 = name + "2"
  # Removing irrelevant chars
  df = df.select("*", F.translate(F.col(name), "\\{\\[\\]'\\}", "")\
                .alias(temp_name_1))\
  .drop(name)

  # Converting arrays strings to arrays of strings
  df = df.select("*", F.split(F.col(temp_name_1),",").alias(temp_name_2)) \
      .drop(temp_name_1)
  df = df.withColumnRenamed(temp_name_2,name)

# Finished working on cities column, and seperating production_companies
# because it has different structure
column_names = ["genres", "production_countries", "spoken_languages"]

# For each column - udf for extracting names only from json string
name_udf = F.udf(lambda arr: [arr[i][7:] for i in range(len(arr)) if i % 2 == 1])
for c in column_names:
  c_1 = c + "1"
  df = df.withColumn(c_1, name_udf(F.col(c)))\
    .drop(c)
prod_udf = F.udf(lambda arr: [arr[i][6:] for i in range(len(arr)) if i % 2 == 0])
df = df.withColumn("production_companies1", prod_udf(F.col("production_companies")))\
  .drop("production_companies")

# Renameing columns names
column_names = ["genres", "production_companies", "production_countries", 
                "spoken_languages"]
for name in column_names:
  current_name = name + "1"
  df = df.withColumnRenamed(current_name,name)

print("Movies table:")
df.show()
movies_df = df


Queries table:
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|user_id|              genres|                lang|              actors|            director|              cities|             country|from_realese_date|  production_company|
+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    981| [Western,  Mystery]|  [English,  Srpski]|                  []|      [Nae Caranfil]|  [Haifa,  Tiberias]|                  []|           [2012]|[Katakuri-ke no K...|
|   3775|  [Action,  Western]|           [English]|                  []|                  []|          [Tel Aviv]|                  []|           [2013]|[Clavius Base,  T...|
|   4095|             [Crime]|[English,  עִבְרִית]|[Kenneth Alton,  ...|                  []|         [Jerusal

In [None]:
# Section 2 - Data Analysis

# Insight 1: Tel Aviv is the most queried city with 44721 lookups, Jerusalem is right after with 34561
# and then Haifa, Tiberias and Eilat with 22712, 9594 and 17348 lookups respectively
queries_itr = queries_df.rdd.toLocalIterator()
counters = {"Tel Aviv": 0, "Jerusalem": 0, "Haifa": 0, "Tiberias": 0, "Eilat": 0,
          " Tel Aviv": 0, " Jerusalem": 0, " Haifa": 0, " Tiberias": 0, " Eilat": 0}
for row in queries_itr:
  for city in row.cities:
    counters[city] += 1

counters_final = {"Tel Aviv": counters["Tel Aviv"] + counters[" Tel Aviv"],
                "Jerusalem": counters["Jerusalem"] + counters[" Jerusalem"],
                "Haifa": counters["Haifa"] + counters[" Haifa"],
                "Tiberias": counters["Tiberias"] + counters[" Tiberias"],
                "Eilat": counters["Eilat"] + counters[" Eilat"]}

print("Insight 1: Tel Aviv is the most queried city with 44721 lookups, Jerusalem is right after with 34561, and then Haifa, Tiberias and Eilat with 22712, 9594 and 17348 lookups respectively")
print(counters_final)
print()

# Insight 2: Jerusalemites and Tel-Avivians are almost the only ones to search for movies in Hebrew
queries_itr = queries_df.rdd.toLocalIterator()
counters = {"Tel Aviv": 0, "Jerusalem": 0, "Haifa": 0, "Tiberias": 0, "Eilat": 0,
            " Tel Aviv": 0, " Jerusalem": 0, " Haifa": 0, " Tiberias": 0, " Eilat": 0}

for row in queries_itr:
  if " עִבְרִית" in row.lang:
    for city in row.cities:
      counters[city] += 1

counters_final = {"Tel Aviv": counters["Tel Aviv"] + counters[" Tel Aviv"],
                  "Jerusalem": counters["Jerusalem"] + counters[" Jerusalem"],
                  "Haifa": counters["Haifa"] + counters[" Haifa"],
                  "Tiberias": counters["Tiberias"] + counters[" Tiberias"],
                  "Eilat": counters["Eilat"] + counters[" Eilat"]}

print("Insight 2: Jerusalemites and Tel-Avivians are almost the only ones to search for movies in Hebrew:")
print(counters_final)
print()


# Insight 3: Jerusalemites care about the playing actors, Haifa people care 
# about directors, and Eilaties alaways specify countries in their queries.
print("Insight 3: Jerusalemites care about the playing actors, Haifa people care about directors, and Eilaties alaways specify countries in their queries.")
queries_itr = queries_df.rdd.toLocalIterator()
counters = {"Tel Aviv": 0, "Jerusalem": 0, "Haifa": 0, "Tiberias": 0, "Eilat": 0,
          " Tel Aviv": 0, " Jerusalem": 0, " Haifa": 0, " Tiberias": 0, " Eilat": 0}
for row in queries_itr:
  if row.actors[0] == "":
    for city in row.cities:
      counters[city] += 1

counters_final = {"Tel Aviv": counters["Tel Aviv"] + counters[" Tel Aviv"],
                "Jerusalem": counters["Jerusalem"] + counters[" Jerusalem"],
                "Haifa": counters["Haifa"] + counters[" Haifa"],
                "Tiberias": counters["Tiberias"] + counters[" Tiberias"],
                "Eilat": counters["Eilat"] + counters[" Eilat"]}

print()
print("actors empty by city:")
print(counters_final)
print()

queries_itr = queries_df.rdd.toLocalIterator()
counters = {"Tel Aviv": 0, "Jerusalem": 0, "Haifa": 0, "Tiberias": 0, "Eilat": 0,
          " Tel Aviv": 0, " Jerusalem": 0, " Haifa": 0, " Tiberias": 0, " Eilat": 0}
for row in queries_itr:
  if row.director[0] == "":
    for city in row.cities:
      counters[city] += 1

counters_final = {"Tel Aviv": counters["Tel Aviv"] + counters[" Tel Aviv"],
                "Jerusalem": counters["Jerusalem"] + counters[" Jerusalem"],
                "Haifa": counters["Haifa"] + counters[" Haifa"],
                "Tiberias": counters["Tiberias"] + counters[" Tiberias"],
                "Eilat": counters["Eilat"] + counters[" Eilat"]}

print("director empty by city:")
print(counters_final)
print()

queries_itr = queries_df.rdd.toLocalIterator()
counters = {"Tel Aviv": 0, "Jerusalem": 0, "Haifa": 0, "Tiberias": 0, "Eilat": 0,
          " Tel Aviv": 0, " Jerusalem": 0, " Haifa": 0, " Tiberias": 0, " Eilat": 0}
for row in queries_itr:
  if row.country[0] == "":
    for city in row.cities:
      counters[city] += 1

counters_final = {"Tel Aviv": counters["Tel Aviv"] + counters[" Tel Aviv"],
                "Jerusalem": counters["Jerusalem"] + counters[" Jerusalem"],
                "Haifa": counters["Haifa"] + counters[" Haifa"],
                "Tiberias": counters["Tiberias"] + counters[" Tiberias"],
                "Eilat": counters["Eilat"] + counters[" Eilat"]}

print("country empty by city:")
print(counters_final)
print()

# Insight 4: people who look for movies in Eilat, never look for movies in over places in the same query
queries_itr = queries_df.rdd.toLocalIterator()
city_lookup_dict = {}
for row in queries_itr:
  key = " ".join(str(x) for x in row.cities)
  if key not in city_lookup_dict.keys():
    city_lookup_dict[key] = 1
  else:
    city_lookup_dict[key] += 1
print("Insight 4: people who look for movies in Eilat, never look for movies in over places in the same query")
print("Lookups for cinemas in different cities counts")
print(city_lookup_dict)
print()

# Insight 5: the number of tickets in reservations are more or less in the same amount and from the same distribution
print("Insight 5: the number of tickets in reservations are more or less in the same amount and from the same distribution")
import pandas as pd
tickets_by_city = [["Tel Aviv", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ["Jerusalem", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                   ["Haifa", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ["Tiberias", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                   ["Eilat", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
tickets_by_city = pd.DataFrame(tickets_by_city, columns=["City", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"])
tickets_by_city.set_index("City", inplace=True)

tickets_df = spark.read.csv('tickets.csv', header='True', inferSchema='True')
tickets_itr = tickets_df.rdd.toLocalIterator()
for row in tickets_itr:
  tickets_by_city.at[row.city, str(row.number_of_tickets)] += 1

print("Number of tickets per reservation count by city")
print(tickets_by_city)
print()
ax = tickets_by_city.plot.bar(rot=0)

## Explanations for the data analysis section

In this part of the project we have chosen to focus on finding insights mainly from the "queries" table as well as from the "tickets" table, and in particular those that will help us in the horizontal fregmentation, since the vertical fregmenataion is more easy to implement by code and the horizontal is more dependent on our previous knowladge about the database.

Since the general task in this part of the project is to divide the database between sites that are geographically close to the cities of Haifa, Tiberias, Tel Aviv, Jerusalem and Eilat, we found it appropriate to focus on the differences embodied in the database between the cities, hoping that if we find significant differences between cities, we would be able to generalize them for certain rows / columns in the database.

Indeed, we have found some interesting insights, as one can see from the above code output:

1. Jerusalemites and Tel-Avivians are almost the only ones to search for movies in Hebrew.
2. Jerusalemites care much about the playing actors, Haifa people care much about directors, and Eilaties alaways specify countries in their queries.
3. People who look for movies in Eilat, never look for movies in over places in the same query.
4. The numbers of tickets in reservations are more or less in the same amount and from the same distribution.


In [32]:
import sys
import numpy as np
from collections import Counter

def row_counter(my_array):
    # count identical rows in matrix
    list_of_tups = [tuple(ele) for ele in my_array]
    return Counter(list_of_tups)

# build use matrix for queries and shrink it
shape = (queries_df.count(), len(queries_df.columns))
queries_use = np.zeros(shape)
queries_itr = queries_df.rdd.toLocalIterator()
for i, row in enumerate(queries_itr):
  for j, col in enumerate(queries_df.columns):
    if type(row[col]) != list:
      queries_use[i, j] = 1
    else:
      queries_use[i, j] = 0 if row[col][0] == "" else 1

access = row_counter(queries_use)

for key, value in access.items():
    print(f"{key} : {value}")


(1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0) : 10046
(1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0) : 12416
(1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0) : 31305
(1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0) : 965
(1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0) : 21205
(1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0) : 12620
(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0) : 9983
(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0) : 1308
(1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0) : 64
(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0) : 3
(1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0) : 3
(1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0) : 70
(1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0) : 5
(1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0) : 4
(1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0) : 1
(1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0) : 2


In [47]:
n_queries = 8
n_attributes = 9
#attribute usage matrix
aum = [[1, 1, 1, 0, 1, 1, 0, 1, 1],  # 10046
       [1, 1, 1, 0, 0, 1, 0, 1, 1],  # 12416
       [1, 1, 1, 1, 0, 1, 1, 1, 1],  # 31305
       [1, 1, 1, 0, 1, 1, 1, 1, 1],  # 965
       [1, 1, 1, 0, 0, 1, 1, 1, 1],  # 21205
       [1, 1, 1, 1, 0, 1, 0, 1, 1],  # 12620
       [1, 1, 1, 1, 1, 1, 0, 1, 1],  # 9983
       [1, 1, 1, 1, 1, 1, 1, 1, 1]]  # 1308

#number of sites
n_sites = 5

#access matrix
acc = [[10046/100000, 10046/100000, 10046/100000, 10046/100000, 10046/100000],
       [12416/100000, 12416/100000, 12416/100000, 12416/100000, 12416/100000],
       [31305/100000, 31305/100000, 31305/100000, 31305/100000, 31305/100000],
       [965/100000, 965/100000, 965/100000, 965/100000, 965/100000],
       [21205/100000, 21205/100000, 21205/100000, 21205/100000, 21205/100000],
       [12620/100000, 12620/100000, 12620/100000, 12620/100000, 12620/100000],
       [9983/100000, 9983/100000, 9983/100000, 9983/100000, 9983/100000],
       [1308/100000, 1308/100000, 1308/100000, 1308/100000, 1308/100000]]

#prefix sum for each query
pre = [0 for i in range(n_queries)]
for i in range(n_queries):
    for j in range(n_sites):
        pre[i] = pre[i]+acc[i][j]

#attribute affinity matrix
aam = [[0 for i in range(n_attributes)] for j in range(n_attributes)]

#calculation of the aam
for i in range(n_attributes):
    for j in range(n_attributes):
         if(i==j):
             aam[i][j]=0
             continue
         for q in range(n_queries):
            if aum[q][i]==1 and aum[q][j]==1:
                aam[i][j] = aam[i][j]+pre[q]
          
print("Attribute affinity matrix")
for i in range(n_attributes):
    print(aam[i])
print("Access Site Sums")
print(pre)

Attribute affinity matrix
[0, 4.9924, 4.9924, 2.7608, 1.1151, 4.9924, 2.73915, 4.9924, 4.9924]
[4.9924, 0, 4.9924, 2.7608, 1.1151, 4.9924, 2.73915, 4.9924, 4.9924]
[4.9924, 4.9924, 0, 2.7608, 1.1151, 4.9924, 2.73915, 4.9924, 4.9924]
[2.7608, 2.7608, 2.7608, 0, 0.56455, 2.7608, 1.63065, 2.7608, 2.7608]
[1.1151, 1.1151, 1.1151, 0.56455, 0, 1.1151, 0.11365, 1.1151, 1.1151]
[4.9924, 4.9924, 4.9924, 2.7608, 1.1151, 0, 2.73915, 4.9924, 4.9924]
[2.73915, 2.73915, 2.73915, 1.63065, 0.11365, 2.73915, 0, 2.73915, 2.73915]
[4.9924, 4.9924, 4.9924, 2.7608, 1.1151, 4.9924, 2.73915, 0, 4.9924]
[4.9924, 4.9924, 4.9924, 2.7608, 1.1151, 4.9924, 2.73915, 4.9924, 0]
Access Site Sums
[0.5023, 0.6208, 1.56525, 0.04825, 1.06025, 0.631, 0.49915, 0.0654]


In [48]:
def bond(Ax,Ay):
    if Ax==-1 or Ay==-1:
        return 0
    ans = 0
    for i in range(n_attributes):
        ans = ans + (aam[i][Ax]*aam[i][Ay])
    return ans

def cont(Ai,Ak,Aj):
    print("bond ",Ai, "bond", Ak, " = ", bond(Ai,Ak))
    print("bond ",Ak, "bond", Aj, " = ", bond(Ak,Aj))
    print("bond ",Ai, "bond", Aj, " = ", bond(Ai,Aj))
    return 2*bond(Ai,Ak) + 2*bond(Ak,Aj) - 2*bond(Ai,Aj)

In [49]:
#Bond energy algorithm
def BEA():
    ca = []
    ca.append(0)
    ca.append(1)
    index  = 2
    while index < n_attributes:
        maxi = -1 
        maxc = -100000
        for i in range(1,index):
                con = cont(ca[i-1],index,ca[i])
                print("Index ", i+1, " ", "cont ", ca[i],index+1,ca[i]+1, con)
                if con > maxc:
                    maxi = i
                    maxc = con
        #boundary left
        con = cont(-1,index,ca[0])
        print("Index ", i+1, " ", "cont ", 1,index+1,ca[0]+1, con)
        if con > maxc:
            maxi = 0
            maxc = con
        #boundary right
        con = cont(ca[index-1],index,-1)
        print("Index ", i+1, " ", "cont ", ca[index-1]+1,index+1,index+2, con)
        if con > maxc:
            maxi = index
        if maxi==index:
            ca.append(index)    
        else:
            ca.append(0)
            for j in range(index,maxi,-1):
                ca[j]=ca[j-1]
            ca[maxi] = index
        print(ca)
        index = index + 1
    print("FINAL Clustered Affinity Matrix")
    print(ca)
    return ca

In [36]:
CA = BEA()
ca = [[0 for i in range(n_attributes)] for j in range(n_attributes)]
for i in range(n_attributes):
    for j in range(n_attributes):
        ca[i][j] = aam[CA[i]][CA[j]]

print(ca)

bond  0 bond 2  =  165.9127539325
bond  2 bond 1  =  165.9127539325
bond  0 bond 1  =  165.9127539325
Index  2   cont  1 3 2 331.825507865
bond  -1 bond 2  =  0
bond  2 bond 0  =  165.9127539325
bond  -1 bond 0  =  0
Index  2   cont  1 3 1 331.825507865
bond  1 bond 2  =  165.9127539325
bond  2 bond -1  =  0
bond  1 bond -1  =  0
Index  2   cont  2 3 4 331.825507865
[0, 2, 1]
bond  0 bond 3  =  95.41624881250002
bond  3 bond 2  =  95.41624881250002
bond  0 bond 2  =  165.9127539325
Index  2   cont  2 4 3 49.83948738500004
bond  2 bond 3  =  95.41624881250002
bond  3 bond 1  =  95.41624881250002
bond  2 bond 1  =  165.9127539325
Index  3   cont  1 4 2 49.83948738500004
bond  -1 bond 3  =  0
bond  3 bond 0  =  95.41624881250002
bond  -1 bond 0  =  0
Index  3   cont  1 4 1 190.83249762500003
bond  1 bond 3  =  95.41624881250002
bond  3 bond -1  =  0
bond  1 bond -1  =  0
Index  3   cont  2 4 5 190.83249762500003
[3, 0, 2, 1]
bond  3 bond 4  =  20.8448711975
bond  4 bond 0  =  36.515513487

In [37]:

def shift_row_aum(mat):
    row_first=[]
    for i in range(n_attributes):
        row_first.append(mat[0][i])
    for i in range(1,n_queries):
        for j in range(n_attributes):
            mat[i-1][j]=mat[i][j]
    for i in range(n_attributes):
        mat[n_queries-1][i]=row_first[i]
   # print(row_first)
    return mat
   
def shift_column_aum(mat):
    col_first=[]
    for i in range(n_queries):
        col_first.append(mat[i][0])
    for i in range(n_queries):
        for j in range(1,n_attributes):
            mat[i][j-1]=mat[i][j]
    for i in range(n_queries):
        mat[i][n_attributes-1]=col_first[i]
    return mat

In [38]:
def shift_row_ca(mat):
    row_first=[]
    for i in range(n_attributes):
        row_first.append(mat[0][i])
    for i in range(1,n_attributes):
        for j in range(n_attributes):
            mat[i-1][j]=mat[i][j]
    for i in range(n_attributes):
        mat[n_attributes-1][i]=row_first[i]
   # print(row_first)
    return mat
   
def shift_column_ca(mat):
    col_first=[]
    for i in range(n_attributes):
        col_first.append(mat[i][0])
    for i in range(n_attributes):
        for j in range(1,n_attributes):
            mat[i][j-1]=mat[i][j]
    for i in range(n_attributes):
        mat[i][n_attributes-1]=col_first[i]
    return mat

In [43]:
#Partioning
start=n_attributes-2
aum = [[1, 1, 1, 0, 1, 1, 0, 1, 1],  # 10046
       [1, 1, 1, 0, 0, 1, 0, 1, 1],  # 12416
       [1, 1, 1, 1, 0, 1, 1, 1, 1],  # 31305
       [1, 1, 1, 0, 1, 1, 1, 1, 1],  # 965
       [1, 1, 1, 0, 0, 1, 1, 1, 1],  # 21205
       [1, 1, 1, 1, 0, 1, 0, 1, 1],  # 12620
       [1, 1, 1, 1, 1, 1, 0, 1, 1],  # 9983
       [1, 1, 1, 1, 1, 1, 1, 1, 1]]  # 1308
AQ=[]
for i in range(n_queries):
    row=[]
    for j in range(n_attributes):
        if aum[i][j]==1:
            row.append(j)
    AQ.append(row)

print(AQ)

[[0, 1, 2, 4, 5, 7, 8], [0, 1, 2, 5, 7, 8], [0, 1, 2, 3, 5, 6, 7, 8], [0, 1, 2, 4, 5, 6, 7, 8], [0, 1, 2, 5, 6, 7, 8], [0, 1, 2, 3, 5, 7, 8], [0, 1, 2, 3, 4, 5, 7, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8]]


In [44]:
TQ=[]
BQ=[]
OQ=[]

for i in range(n_queries):
    if AQ[i][1] <= start:
        TQ.append(i)
    elif AQ[i][0] > start:
        BQ.append(i)
    else:
        OQ.append(i)

    
print(TQ)
print(BQ)
print(OQ)

[0, 1, 2, 3, 4, 5, 6, 7]
[]
[]


In [45]:
CTQ=0
CBQ=0
COQ=0

for i in range(len(TQ)):
    CTQ=CTQ+pre[TQ[i]]
for i in range(len(BQ)):
    CBQ=CBQ+pre[BQ[i]]
for i in range(len(OQ)):
    COQ=COQ+pre[OQ[i]]
best=CTQ*CBQ-COQ*COQ

In [46]:
shift=0
for i in range(n_sites):
    for j in range(n_attributes-3,0,-1):
        TQ=[]
        BQ=[]
        OQ=[]

        for k in range(n_sites):
            if AQ[k][1] <= j:
                TQ.append(i)
            elif AQ[k][0] > j:
                BQ.append(k)
            else:
                OQ.append(k)
        CTQ=0
        CBQ=0
        COQ=0
        print(TQ)
        for k in range(len(TQ)):
            CTQ=CTQ+pre[TQ[k]]
        for k in range(len(BQ)):
            CBQ=CBQ+pre[BQ[k]]
        for k in range(len(OQ)):
            COQ=COQ+pre[OQ[k]]
        z=CTQ*CBQ-COQ*COQ
        if z>best:
            best=z
            start=j
            shift=i
    shift_row_ca(ca)
    shift_column_ca(ca)
    shift_row_aum(aum)
    shift_column_aum(aum)
    AQ=[]
    for i in range(n_queries):
        row=[]
        for j in range(n_attributes):
            if aum[i][j]==1:
                row.append(j)
        AQ.append(row)
last=n_attributes-1
for i in range(shift):
    ele=CA[last]
    for j in range(last,1,-1):
        CA[j]=CA[j-1]
    CA[0]=ele
F1={1}
F2={1}
print("First Half")
for i in range(0,start):
    F1.add(CA[i]+1)
print(F1)    
print("Second Half")

for i in range(start,n_attributes):
    F2.add(CA[i]+1)
print(F2)  
print("Split is:")
print(start)
print("Shift is")
print(shift)

[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[1, 1, 1, 1, 1]
[2, 2, 2, 2, 2]
[2, 2, 2, 2, 2]
[2, 2, 2, 2, 2]
[2, 2, 2, 2, 2]
[2, 2, 2, 2]
[2, 2, 2]
[3, 3, 3, 3, 3]
[3, 3, 3, 3, 3]
[3, 3, 3, 3, 3]
[3, 3, 3, 3, 3]
[3, 3, 3, 3]
[3, 3]
[4, 4, 4, 4, 4]
[4, 4, 4, 4, 4]
[4, 4, 4, 4, 4]
[4, 4, 4, 4, 4]
[4, 4, 4, 4]
[4, 4, 4]
First Half
{1, 2, 3, 4, 6, 8, 9}
Second Half
{1, 5, 7}
Split is:
7
Shift is
0
