<a href="https://colab.research.google.com/github/KevinYih/BigDataDemo/blob/main/DDB_project_by_Kevin.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Collaborative Filtering

source: https://colab.research.google.com/drive/1UWeDiyXiwDDqe7ksN2kt-myHsuSLObv8?usp=sharing

Run the code, project_false_example

In [1]:
import time

class Transaction:
    """
    Represents a transaction with a unique ID, timestamp, and lists of read and write operations.
    """
    def __init__(self, transaction_id, timestamp):
        self.transaction_id = transaction_id  # Unique identifier for the transaction
        self.timestamp = timestamp  # Timestamp when the transaction is created
        self.read_operations = []  # List to hold read operations
        self.write_operations = []  # List to hold write operations
        self.active = True  # Status of the transaction (active or aborted)

    def add_read(self, data_item):
        """
        Add a read operation to the transaction.
        :param data_item: The data item to be read
        """
        self.read_operations.append(data_item)

    def add_write(self, data_item):
        """
        Add a write operation to the transaction.
        :param data_item: The data item to be written
        """
        self.write_operations.append(data_item)

class DataItem:
    """
    Represents a data item with read and write timestamps and a value.
    """
    def __init__(self, name):
        self.name = name  # Name of the data item
        self.read_ts = 0  # Latest read timestamp
        self.write_ts = 0  # Latest write timestamp
        self.value = None  # Value of the data item

class TransactionManager:
    """
    Manages transactions and data items, providing methods to create and handle them.
    """
    def __init__(self):
        self.transactions = {}  # Dictionary to hold transactions
        self.data_items = {}  # Dictionary to hold data items

    def create_transaction(self, transaction_id):
        """
        Create a new transaction.
        :param transaction_id: Unique identifier for the transaction
        """
        timestamp = time.time()  # Generate a timestamp for the transaction
        self.transactions[transaction_id] = Transaction(transaction_id, timestamp)
        print(f"Transaction {transaction_id} created with timestamp {timestamp}")

    def create_data_item(self, name):
        """
        Create a new data item.
        :param name: Name of the data item
        """
        self.data_items[name] = DataItem(name)
        print(f"Data item {name} created")

    def read(self, transaction_id, data_item_name):
        """
        Perform a read operation on a data item.
        :param transaction_id: ID of the transaction performing the read
        :param data_item_name: Name of the data item to be read
        """
        transaction = self.transactions[transaction_id]
        data_item = self.data_items[data_item_name]

        # Check timestamp ordering protocol rules for read operation
        if transaction.timestamp < data_item.write_ts:
            # Abort the transaction if it violates the timestamp ordering rule
            print(f"Transaction {transaction_id} is aborted. Read timestamp {data_item.write_ts} is newer than transaction timestamp {transaction.timestamp}.")
            transaction.active = False
        else:
            # Update the read timestamp of the data item
            data_item.read_ts = max(data_item.read_ts, transaction.timestamp)
            print(f"Transaction {transaction_id} read {data_item_name} successfully at timestamp {transaction.timestamp}")

    def write(self, transaction_id, data_item_name, value):
        """
        Perform a write operation on a data item.
        :param transaction_id: ID of the transaction performing the write
        :param data_item_name: Name of the data item to be written
        :param value: Value to be written to the data item
        """
        transaction = self.transactions[transaction_id]
        data_item = self.data_items[data_item_name]

        # Check timestamp ordering protocol rules for write operation
        if transaction.timestamp < data_item.read_ts or transaction.timestamp < data_item.write_ts:
            # Abort the transaction if it violates the timestamp ordering rule
            print(f"Transaction {transaction_id} is aborted. Conflict with existing timestamps (read_ts: {data_item.read_ts}, write_ts: {data_item.write_ts}).")
            transaction.active = False
        else:
            # Update the write timestamp and value of the data item
            data_item.write_ts = transaction.timestamp
            data_item.value = value
            print(f"Transaction {transaction_id} wrote {data_item_name} successfully at timestamp {transaction.timestamp}")

# Sample usage
if __name__ == "__main__":
    tm = TransactionManager()

    # Create data items
    tm.create_data_item("A")

    # Create transactions
    tm.create_transaction("T1")
    tm.create_transaction("T2")
    tm.create_transaction("T3")

    # Add operations to transactions
    tm.transactions["T2"].add_write("A")
    tm.transactions["T2"].add_read("A")
    tm.transactions["T1"].add_write("A")

    # Process operations
    tm.write("T1", "A", "value1")
    tm.write("T2", "A", "value1")

    # Simulate some delay to ensure different timestamps
    time.sleep(1)

    # This should cause a conflict and result in an aborted transaction
    tm.read("T1", "A")
    tm.write("T1", "A", "value2")

    # Check if transactions are still active
    print(f"Transaction T1 active: {tm.transactions['T1'].active}")
    print(f"Transaction T2 active: {tm.transactions['T2'].active}")


Data item A created
Transaction T1 created with timestamp 1720722406.687843
Transaction T2 created with timestamp 1720722406.6878858
Transaction T3 created with timestamp 1720722406.6879063
Transaction T1 wrote A successfully at timestamp 1720722406.687843
Transaction T2 wrote A successfully at timestamp 1720722406.6878858
Transaction T1 is aborted. Read timestamp 1720722406.6878858 is newer than transaction timestamp 1720722406.687843.
Transaction T1 is aborted. Conflict with existing timestamps (read_ts: 0, write_ts: 1720722406.6878858).
Transaction T1 active: False
Transaction T2 active: True


Run the code, project_true_example

In [2]:
import time

class Transaction:

    # Represents a transaction with a unique ID, timestamp,
    # and lists of read and write operations
    def __init__(self, transaction_id, timestamp):
        self.transaction_id = transaction_id
        self.timestamp = timestamp
        self.read_operations = []
        self.write_operations = []
        self.active = True

    #Add a read operation to the transaction
    def add_read(self, data_item):
        self.read_operations.append(data_item)

    #Add a write operation to the transaction
    def add_write(self, data_item):
        self.write_operations.append(data_item)

# Represents a data item with read and write timestamps and a value
class DataItem:
    def __init__(self, name):
        self.name = name
        self.read_ts = 0
        self.write_ts = 0
        self.value = None

class TransactionManager:
    def __init__(self):
        self.transactions = {}
        self.data_items = {}

    def create_transaction(self, transaction_id):
        timestamp = time.time()
        self.transactions[transaction_id] = Transaction(transaction_id, timestamp)
        print(f"Transaction {transaction_id} created with timestamp {timestamp}")

    def create_data_item(self, name):
        self.data_items[name] = DataItem(name)
        print(f"Data item {name} created")

    def read(self, transaction_id, data_item_name):
        transaction = self.transactions[transaction_id]
        data_item = self.data_items[data_item_name]

         # Check timestamp ordering protocol rules for read operation
        if transaction.timestamp < data_item.write_ts:
            # Abort the transaction if it violates the timestamp ordering rule
            print(f"Transaction {transaction_id} is aborted. Read timestamp {data_item.write_ts} is newer than transaction timestamp {transaction.timestamp}.")
            transaction.active = False
        else:
            # Update the read timestamp of the data item
            data_item.read_ts = max(data_item.read_ts, transaction.timestamp)
            print(f"Transaction {transaction_id} read {data_item_name} successfully at timestamp {transaction.timestamp}")

    #Perform a write operation on a data item.
    def write(self, transaction_id, data_item_name, value):
        transaction = self.transactions[transaction_id]
        data_item = self.data_items[data_item_name]

        # Check timestamp ordering protocol rules for write operation
        # Abort the transaction if it violates the timestamp ordering rule
        if transaction.timestamp < data_item.read_ts or transaction.timestamp < data_item.write_ts:
            print(f"Transaction {transaction_id} is aborted. Conflict with existing timestamps (read_ts: {data_item.read_ts}, write_ts: {data_item.write_ts}).")
            transaction.active = False
        else:
            # Update the write timestamp and value of the data item
            data_item.write_ts = transaction.timestamp
            data_item.value = value
            print(f"Transaction {transaction_id} wrote {data_item_name} successfully at timestamp {transaction.timestamp}")

# Sample usage
if __name__ == "__main__":
    tm = TransactionManager()

    # Create data items
    tm.create_data_item("A")
    tm.create_data_item("B")

    # Create transactions
    tm.create_transaction("T1")
    tm.create_transaction("T2")

    # Add operations to transactions
    tm.transactions["T1"].add_read("A")
    tm.transactions["T1"].add_write("A")
    tm.transactions["T2"].add_write("B")

    # Process operations
    tm.read("T1", "A")
    tm.write("T1", "A", "value1")
    tm.write("T2", "B", "value2")

    # Check if transactions are still active
    print(f"Transaction T1 active: {tm.transactions['T1'].active}")
    print(f"Transaction T2 active: {tm.transactions['T2'].active}")


Data item A created
Data item B created
Transaction T1 created with timestamp 1720722495.313091
Transaction T2 created with timestamp 1720722495.3131309
Transaction T1 read A successfully at timestamp 1720722495.313091
Transaction T1 wrote A successfully at timestamp 1720722495.313091
Transaction T2 wrote B successfully at timestamp 1720722495.3131309
Transaction T1 active: True
Transaction T2 active: True


Now we authenticate a Google Drive client to download the files we will be processing in our Spark job.

**Make sure to follow the interactive instructions.**