In [88]:
import pandas as pd
import gzip
import json
from Code.UtilityFunctions.get_data_path import get_path
from Code.UtilityFunctions.get_iri import get_iri
from RecommendationSplits.split_yelp_nt_files import get_positive_users, format_values_string, batch_get_reviews, timewise_stratified_split

import os
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])

from Code.UtilityFunctions.run_query import run_query

In [80]:
# Select the users to work with; both thresholds are forwarded to get_positive_users
users_with_pos_reviews = get_positive_users(num_reviews_threshold=10, rating_threshold=3.0)

reviews_list = list()  # One dataframe of query results per batch of users
user_batch_cache = set()  # User URIs collected for the batch that is currently being built

for row in users_with_pos_reviews.itertuples(index=False):
    user = row[0]  # The first column of users_with_pos_reviews holds the user URI
    user_batch_cache.add(user)

    if len(user_batch_cache) != 20:
        continue  # Keep collecting until the batch holds 20 distinct user URIs

    # The batch is full: format it for the VALUES statement, then reset the cache for the next 20 users
    user_batch = format_values_string(_input=user_batch_cache)
    user_batch_cache.clear()

    # Query the graph for the current batch of users and keep the resulting dataframe
    review_batch = batch_get_reviews(user_batch=user_batch)
    reviews_list.append(review_batch)

if user_batch_cache:
    # Users left over after the loop form a final batch of fewer than 20; query them as well
    user_batch = format_values_string(_input=user_batch_cache)
    review_batch = batch_get_reviews(user_batch=user_batch)
    reviews_list.append(review_batch)

# Stack the per-batch dataframes into a single dataframe with a fresh index
reviews_df = pd.concat(reviews_list, ignore_index=True)

In [81]:
# Split the reviews with the user as the class column and the review timestamp as the time column
t = timewise_stratified_split(
    data=reviews_df,
    class_column="user.value",
    time_column="timestamp.value",
    train_size=0.6,
    val_size=0.2,
    test_size=0.2,
)

In [82]:
# Give the three parts of the split their own names
train = t[0]
val = t[1]
test = t[2]

### Size Check

In [83]:
# Size check - do the train-validation-test sets use the entirety of the original data?
split_sizes = [len(part) for part in (t[0], t[1], t[2])]
split_total = sum(split_sizes)
original_total = len(reviews_df)

# Prints: train size, val size, test size, their sum, and the size of the original data
print(*split_sizes, split_total, original_total)
split_total == original_total, original_total - split_total

1733862 555682 658398 2947942 2947942


(True, 0)

### User Check

In [84]:
# User check - are all users still present in each of the splits?
users_per_split = [len(set(part['user.value'])) for part in (t[0], t[1], t[2])]
users_in_original = len(set(reviews_df['user.value']))

# Prints the number of distinct users in train, val, test, and the original data
print(*users_per_split, users_in_original)
all(user_count == users_in_original for user_count in users_per_split)

88060 88060 88060 88060


True

### Leakage Check

In [85]:
def check_duplicate_rows(*dfs):
    """
    Check whether any row occurs in more than one of the given Pandas DataFrames.

    Rows are compared as plain tuples of their values (the index is ignored), so two rows
    are "the same" when all of their column values are equal position by position.
    A row repeated inside a single DataFrame does not count; only rows shared between
    different DataFrames do.

    Args:
        dfs (pd.DataFrame): Any number of DataFrames to compare against each other.

    Returns:
        bool: True if at least one row is shared by any two of the DataFrames, False otherwise
        (including when fewer than two DataFrames are given).
    """
    # Rows of every DataFrame inspected so far. Comparing each new DataFrame against this
    # running set detects a row shared by ANY pair, not only rows present in ALL DataFrames.
    seen_rows = set()

    for df in dfs:
        rows = set(df.itertuples(index=False, name=None))
        if not seen_rows.isdisjoint(rows):
            return True
        seen_rows |= rows

    return False

In [89]:
# Leakage check - evaluates to True when check_duplicate_rows reports no shared rows
# in the train-val-test partitions
found_shared_rows = check_duplicate_rows(t[0], t[1], t[2])
not found_shared_rows

True

### Basic statistics of the splits

In [87]:
# Mean number of rows per distinct user.value, computed the same way for each split
average_reviews_train, average_reviews_val, average_reviews_test = (
    split.groupby('user.value').size().mean() for split in (train, val, test)
)

print(f"Average number of reviews per user in the train set: {average_reviews_train}")
print(f"Average number of reviews per user in the val set: {average_reviews_val}")
print(f"Average number of reviews per user in the test set: {average_reviews_test}")


Average number of reviews per user in the train set: 19.68955257778787
Average number of reviews per user in the val set: 6.310265727912787
Average number of reviews per user in the test set: 7.476697706109471


## Tips TODO

In [None]:
def batch_get_tips(user_batch: str) -> pd.DataFrame:
    """
    Query the graph for the tips written by one batch of users.

    Args:
    user_batch (str): The user URIs of the batch, formatted so that they can be placed
        inside the VALUES statement of the query.

    Returns:
    pd.DataFrame: The query result as returned by run_query with as_dataframe=True;
        the query selects ?user, ?tip, ?business and ?timestamp.
    """
    # SPARQL query selecting every tip of the given users with its business and creation timestamp
    get_tips_query = f"""
    SELECT ?user ?tip ?business ?timestamp
    WHERE {{
        ?tip schema:author ?user .
        ?tip rdfs:Class yelpont:Tip .
        ?tip schema:about ?business .
        ?tip schema:dateCreated ?timestamp .

        VALUES ?user {{ {user_batch} }} .
    }}
    """

    # Execute the query and hand the resulting DataFrame of tips straight back to the caller
    return run_query(query=get_tips_query, as_dataframe=True)

In [28]:
# Select the users to work with; both thresholds are forwarded to get_positive_users
users_with_pos_reviews = get_positive_users(num_reviews_threshold=10, rating_threshold=3.0)

tips_list = list()  # One dataframe of tip query results per batch of users
user_batch_cache = set()  # User URIs collected for the batch that is currently being built

for row in users_with_pos_reviews.itertuples(index=False):
    user = row[0]  # The first column of users_with_pos_reviews holds the user URI
    user_batch_cache.add(user)

    if len(user_batch_cache) != 20:
        continue  # Keep collecting until the batch holds 20 distinct user URIs

    # The batch is full: format it for the VALUES statement, then reset the cache for the next 20 users
    user_batch = format_values_string(_input=user_batch_cache)
    user_batch_cache.clear()

    # Query the graph for the tips of the current batch of users and keep the resulting dataframe
    tip_batch = batch_get_tips(user_batch=user_batch)
    tips_list.append(tip_batch)

if user_batch_cache:
    # Users left over after the loop form a final batch of fewer than 20; query their tips as well
    user_batch = format_values_string(_input=user_batch_cache)
    tip_batch = batch_get_tips(user_batch=user_batch)
    tips_list.append(tip_batch)

# Stack the per-batch dataframes into a single dataframe with a fresh index
tips_df = pd.concat(tips_list, ignore_index=True)
tips_df

Unnamed: 0,user.value,tip.value,business.value,timestamp.value
0,https://purl.archive.org/purl/yelp/yelp_entiti...,N584425f5227f4a5b83bd881a7adfd834,https://purl.archive.org/purl/yelp/yelp_entiti...,2017-07-21T16:34:52
1,https://purl.archive.org/purl/yelp/yelp_entiti...,N0006fd40c07a405dbaa90e5b3b690fdd,https://purl.archive.org/purl/yelp/yelp_entiti...,2016-10-20T00:29:57
2,https://purl.archive.org/purl/yelp/yelp_entiti...,N4992fd97b9704ec5ac116944160d4379,https://purl.archive.org/purl/yelp/yelp_entiti...,2013-02-28T14:16:54
3,https://purl.archive.org/purl/yelp/yelp_entiti...,Nd38b90a18c10453d847cfda996dc2ae5,https://purl.archive.org/purl/yelp/yelp_entiti...,2013-03-09T01:59:44
4,https://purl.archive.org/purl/yelp/yelp_entiti...,Nf8186b70d5184387ba1cb2857709847c,https://purl.archive.org/purl/yelp/yelp_entiti...,2013-03-09T02:26:41
...,...,...,...,...
427805,https://purl.archive.org/purl/yelp/yelp_entiti...,Nb5e9c5697cee4d1a858e282eb1dcea35,https://purl.archive.org/purl/yelp/yelp_entiti...,2013-09-06T19:04:24
427806,https://purl.archive.org/purl/yelp/yelp_entiti...,N0a1d01329d1b4b7b86f4914ebdaab83a,https://purl.archive.org/purl/yelp/yelp_entiti...,2015-03-22T16:58:39
427807,https://purl.archive.org/purl/yelp/yelp_entiti...,N25ba95c4f02a42c5860152bc8b41e6ff,https://purl.archive.org/purl/yelp/yelp_entiti...,2015-03-22T16:55:15
427808,https://purl.archive.org/purl/yelp/yelp_entiti...,N449151ddeb3d4ae885e3642195375b93,https://purl.archive.org/purl/yelp/yelp_entiti...,2015-03-22T23:30:04


In [29]:
# Split the tips with the user as the class column and the tip timestamp as the time column
tips_split = timewise_stratified_split(
    data=tips_df,
    class_column="user.value",
    time_column="timestamp.value",
    train_size=0.6,
    val_size=0.2,
    test_size=0.2,
)

# Give the three parts of the tip split their own names
tip_train = tips_split[0]
tip_val = tips_split[1]
tip_test = tips_split[2]

In [30]:
# Display the first part of the tip split for a visual sanity check
tip_train

Unnamed: 0,user.value,tip.value,business.value,timestamp.value
23182,https://purl.archive.org/purl/yelp/yelp_entiti...,N5d3087c739714f8b8c126113447e63bb,https://purl.archive.org/purl/yelp/yelp_entiti...,2011-04-04 23:23:35
15305,https://purl.archive.org/purl/yelp/yelp_entiti...,Na67dba1a01e3497f9d0778211011aa63,https://purl.archive.org/purl/yelp/yelp_entiti...,2011-01-13 23:06:48
15439,https://purl.archive.org/purl/yelp/yelp_entiti...,N6cc919545fac4ae8af37567e7a8c954e,https://purl.archive.org/purl/yelp/yelp_entiti...,2011-01-15 13:21:43
16816,https://purl.archive.org/purl/yelp/yelp_entiti...,Nc0510e6ae68e4ac19c8c26e673a748c9,https://purl.archive.org/purl/yelp/yelp_entiti...,2011-01-30 22:57:50
16817,https://purl.archive.org/purl/yelp/yelp_entiti...,N3e1a67290ab8441e888f3189493d1628,https://purl.archive.org/purl/yelp/yelp_entiti...,2011-01-30 23:00:05
...,...,...,...,...
232202,https://purl.archive.org/purl/yelp/yelp_entiti...,Nb26badd4f40f4868b5107ed44dcbaf9e,https://purl.archive.org/purl/yelp/yelp_entiti...,2015-01-04 21:17:50
388461,https://purl.archive.org/purl/yelp/yelp_entiti...,N58d8689768c94072bd3661914be7f0de,https://purl.archive.org/purl/yelp/yelp_entiti...,2019-01-01 21:12:20
388470,https://purl.archive.org/purl/yelp/yelp_entiti...,Ncd0f0acda54f49bd84f249f882dbf25b,https://purl.archive.org/purl/yelp/yelp_entiti...,2019-01-01 22:24:37
242523,https://purl.archive.org/purl/yelp/yelp_entiti...,N5ead11e1907d4849877a0241069f4359,https://purl.archive.org/purl/yelp/yelp_entiti...,2015-03-28 23:37:12


In [71]:
def _tip_subject_key(line: bytes) -> str:
    """Return the subject of one N-Triples line in the form used by the 'tip.value' column."""
    subject = line.decode('utf-8').split(' ')[0]
    if subject.startswith('_:'):
        # Blank-node subject (`_:label`): the dataframe stores the bare label, so only the
        # `_:` prefix is removed. Slicing [1:-1] here would cut off the last label character.
        return subject[2:]
    # IRI subject (`<iri>`): drop the enclosing angle brackets
    return subject[1:-1]


def create_tip_split_files(df, stage, input_path=None, output_path=None, max_lines=939999):
    """
    Write the triples of the tips contained in one split to a gzipped N-Triples file.

    The input file is streamed line by line; a line is copied to the output file when its
    subject (first token of the line) is one of the values in df['tip.value'].

    Args:
        df (pd.DataFrame): The split to export; must have a 'tip.value' column.
        stage (str): Name of the split (e.g. 'train'); used in the default output file name.
        input_path (str, optional): Gzipped N-Triples file to read. Defaults to the
            project's yelp_tip.nt.gz.
        output_path (str, optional): Gzipped file to write. Defaults to
            yelp_tip_{stage}.nt.gz next to the default input file.
        max_lines (int, optional): Maximum number of input lines to process. The default
            keeps the cut-off that was previously hard-coded; pass None to read the whole file.

    Returns:
        None. The result is the file written to output_path.
    """
    if input_path is None:
        input_path = "/home/ubuntu/vol1/OneDrive/DVML-P7/Data/KG-triple-files/yelp_tip.nt.gz"
    if output_path is None:
        output_path = f"/home/ubuntu/vol1/OneDrive/DVML-P7/Data/KG-triple-files/yelp_tip_{stage}.nt.gz"

    # Build the lookup once. Recomputing df['tip.value'].unique() and scanning it for every
    # input line made the run time grow with (lines in file) x (tips in split).
    wanted_tips = set(df['tip.value'])

    with gzip.open(input_path, 'rb') as input_file, gzip.open(output_path, 'wb') as output_file:
        for line_number, line in enumerate(input_file, start=1):
            if max_lines is not None and line_number > max_lines:
                break
            if _tip_subject_key(line) in wanted_tips:
                output_file.write(line)

In [72]:
# Export the triples of the tips in tip_train; stage='train' selects the yelp_tip_train.nt.gz output file
create_tip_split_files(df=tip_train, stage='train')

KeyboardInterrupt: 

In [56]:
# Peek at the beginning of the generated training file. Printing every line of the file
# would flood the notebook output, so only the first PREVIEW_LINES lines are shown.
PREVIEW_LINES = 10

with gzip.open("/home/ubuntu/vol1/OneDrive/DVML-P7/Data/KG-triple-files/yelp_tip_train.nt.gz", 'rb') as f:
    for line_number, line in enumerate(f):
        if line_number >= PREVIEW_LINES:
            break
        print(line)
        