In [None]:
%pyspark
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import BucketedRandomProjectionLSH

# -----------------------------------------------------
# 1. Generate Similar Users (LSH)
# -----------------------------------------------------
# Assumes `user_factors` already exists with an "id" column and a "features"
# vector column (both are read below) — TODO confirm against the cell that
# builds it.

# Tunable parameters, collected in one place.
LSH_BUCKET_LENGTH = 2.0
LSH_NUM_HASH_TABLES = 3
LSH_SEED = 42             # fixed seed so the random projections are reproducible
MAX_JOIN_DISTANCE = 1.5   # distance cutoff passed to approxSimilarityJoin
MIN_SIMILARITY = 0.2      # keep pairs with (1 - distance) above this
TOP_N_FRIENDS = 5         # recommendations kept per user

brp = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    bucketLength=LSH_BUCKET_LENGTH,
    numHashTables=LSH_NUM_HASH_TABLES,
    seed=LSH_SEED,
)
lsh_model = brp.fit(user_factors)

# Self-join: every user is compared against every other user that shares a
# hash bucket. The result has struct columns `datasetA` / `datasetB` plus the
# distance column named below.
similar_users = lsh_model.approxSimilarityJoin(
    user_factors,
    user_factors,
    threshold=MAX_JOIN_DISTANCE,
    distCol="distance",
)

# -----------------------------------------------------
# 2. Create User Pairs
# -----------------------------------------------------
# similarity = 1 - distance, so the MIN_SIMILARITY filter keeps pairs whose
# distance is below 1 - MIN_SIMILARITY (0.8 with the current settings), which
# is stricter than MAX_JOIN_DISTANCE.
user_pairs = (
    similar_users
    .select(
        F.col("datasetA.id").alias("user1"),
        F.col("datasetB.id").alias("user2"),
        (1 - F.col("distance")).alias("similarity"),
    )
    # A self-join pairs each user with itself at distance 0 (similarity 1);
    # without this filter every user's top "friend" would be themselves.
    .filter(F.col("user1") != F.col("user2"))
    .filter(F.col("similarity") > MIN_SIMILARITY)
)

# -----------------------------------------------------
# 3. Get Top Recommendations
# -----------------------------------------------------
# Rank each user's candidates by similarity; `user2` breaks ties so that
# row_number() picks the same rows on every run.
window = Window.partitionBy("user1").orderBy(F.desc("similarity"), F.col("user2"))

top_friend_recs = (
    user_pairs
    .withColumn("rank", F.row_number().over(window))
    .filter(F.col("rank") <= TOP_N_FRIENDS)
    .drop("rank")
)

top_friend_recs.show(10)
#By khadija

In [None]:

%pyspark
# Get users and their reviewed businesses.
# `F` is pyspark.sql.functions, imported in the first paragraph of this note.
# distinct() collapses repeated reviews of the same business by the same user;
# otherwise the double join below counts the cross product of duplicate rows
# and inflates `shared_businesses`.
user_reviews = review.select(
    F.col("user_index").alias("user"),
    F.col("business_index").alias("business"),
).distinct()

# Find common businesses between recommended pairs:
#   1. attach every business reviewed by user1 (alias u1),
#   2. keep only those also reviewed by user2 (alias u2),
#   3. count the surviving rows per pair.
common_businesses = (
    top_friend_recs
    # Guard against self-pairs, which would trivially share every business.
    .filter(F.col("user1") != F.col("user2"))
    .join(
        user_reviews.alias("u1"),
        F.col("user1") == F.col("u1.user"),
    )
    .join(
        user_reviews.alias("u2"),
        (F.col("user2") == F.col("u2.user"))
        & (F.col("u1.business") == F.col("u2.business")),
    )
    .groupBy("user1", "user2")
    .agg(F.count("*").alias("shared_businesses"))
)


# Show users with the most shared interests
common_businesses.orderBy(F.col("shared_businesses").desc()).show()

#BY AYA 


In [11]:
%sh
# Installs pyarrow pinned to exactly version 14.0.0 using whichever `pip` is
# on the shell interpreter's PATH.
# NOTE(review): the reason for this specific version is not visible in this
# note, and this `pip` may not target the Python used by %pyspark — confirm.
pip install pyarrow==14.0.0  # Install specific version

In [None]:
%pyspark
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import os

def generate_explanation(user1_id, user2_id):
    """Ask the chat-completion endpoint why two users should connect.

    This function is registered as a Spark UDF (``explanation_udf``), so its
    body must not touch Spark DataFrames or the Spark session: those are only
    usable from the driver, not from code shipped to executors. It therefore
    works only with its two scalar arguments.

    Args:
        user1_id: Identifier of the first user; interpolated into the prompt.
        user2_id: Identifier of the second user; interpolated into the prompt.

    Returns:
        str: The model's reply, or ``"Could not generate explanation: <error>"``
        if anything fails (missing ``openai`` package, missing
        ``OPENAI_API_KEY``, network/API error, timeout).
    """
    # Build prompt.
    # NOTE(review): the prompt carries only the two ids, no review content.
    # To ground it in real data, join that data in as extra columns before
    # calling the UDF rather than querying a DataFrame from inside it.
    prompt = f"Explain why user {user1_id} and {user2_id} should connect based on shared interests."

    # Import, client setup and the request all sit inside the try block so
    # that any failure yields the fallback string instead of failing the task.
    try:
        # Client is created per call: it lives only inside this function and
        # is never shared between invocations.
        from openai import OpenAI
        client = OpenAI(
            base_url="https://xiaoai.plus/v1",
            api_key=os.environ["OPENAI_API_KEY"],  # Store key in Zeppelin environment
        )
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            timeout=10,  # Prevent hanging
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Could not generate explanation: {str(e)}"

# Wrap generate_explanation as a Spark SQL UDF producing a string column
explanation_udf = F.udf(f=generate_explanation, returnType=StringType())


#by AYA 

In [13]:
%sh
# Install networkx and matplotlib using whichever `pip` is on the shell
# interpreter's PATH.
# NOTE(review): no versions are pinned here (unlike the pyarrow install
# earlier in this note), so reruns may pick up different releases; also
# confirm this `pip` targets the same Python that %pyspark uses.
pip install networkx matplotlib