In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from pymongo.mongo_client import MongoClient
from scipy.spatial.distance import cosine
from pandas import json_normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
from scipy import sparse
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix

import pyspark.sql.functions as F
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, array, struct
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [2]:
from dotenv import load_dotenv
import requests
import os

In [3]:
# Load variables from a local .env file (if present) into the process environment,
# then read the MongoDB connection string from it. Reading it from the environment
# is intended to keep credentials out of the notebook; the value is None if unset.
load_dotenv()
MONGO_CONNECTION_STRING = os.getenv("MONGO_CONNECTION_STRING")

## Pull data from MongoDB

In [4]:
def connect_to_mongodb(uri=None):
    '''Set up the MongoDB Atlas connection and return the `medium_database` handle.

    Parameters:
    - uri: optional MongoDB connection string. When omitted, the value of the
      MONGO_CONNECTION_STRING environment variable is used.

    Returns:
    - db: the `medium_database` database object of the connected client.

    Raises:
    - ValueError: if no connection string is given and the environment
      variable is not set.
    '''
    # Credentials must never be hardcoded in the notebook: take them from the
    # caller or from the environment (populated from .env by load_dotenv()).
    # NOTE(review): any password that was previously committed in this cell
    # should be rotated.
    uri = uri or os.getenv("MONGO_CONNECTION_STRING")
    if not uri:
        raise ValueError(
            "No MongoDB connection string: set MONGO_CONNECTION_STRING in the environment or .env file"
        )
    client = MongoClient(uri)

    # send a ping to confirm a successful connection
    # (best-effort: a failed ping is reported but does not abort, matching the
    # original behavior; later queries will surface the error if it persists)
    try:
        client.admin.command('ping')
        print("Pinged your deployment. Connection successful!")
    except Exception as e:
        print(e)

    db = client.medium_database
    return db

In [5]:
# Open the connection once; `db` is the database handle passed to get_data
db = connect_to_mongodb()

Pinged your deployment. Connection successful!


In [6]:
def get_data(db):
    '''Read the writer, follower and article collections into DataFrames.

    Parameters:
    - db: database handle exposing `writer_data`, `followers_data` and
      `articles_data` collections, each with a `find` method.

    Returns:
    - (writer_data, follower_data, article_data): one DataFrame per
      collection, in that order. A summary line with the record counts is
      printed as a side effect.
    '''
    collection_names = ('writer_data', 'followers_data', 'articles_data')
    # Materialize each cursor fully before building the frame
    frames = [
        pd.DataFrame(list(getattr(db, name).find({})))
        for name in collection_names
    ]
    writer_data, follower_data, article_data = frames
    print(f'fetched {len(writer_data)}+{len(follower_data)}+{len(article_data)} records')
    return writer_data, follower_data, article_data

In [7]:
# Fetch all three collections once; `writers_raw` is copied before any feature work so it stays as fetched
writers_raw, followers, articles = get_data(db)
# NOTE: the Mongo `_id` column is still present on these frames at this point
# (it is only dropped from the writers copy later on).

fetched 22+68398+2618 records


In [21]:
# extract articles content
# Each raw `content` value is a nested document whose text lives under its own
# 'content' key. Values that are not dicts (already-extracted strings when this
# cell is re-run, or missing values) are passed through unchanged, so the cell
# is idempotent and does not fail on articles without content.
articles['content'] = articles['content'].apply(
    lambda doc: doc['content'] if isinstance(doc, dict) else doc
)

'How I Won Singapore’s GPT-4 Prompt Engineering Competition\n\nA deep dive into the strategies I learned for harnessing the power of Large Language Models (LLMs)\n\nCelebrating a milestone - The real win was the priceless learning experience!\n\nLast month, I had the incredible honor of winning Singapore’s first ever GPT-4 Prompt Engineering competition, which brought together over 400 prompt-ly brilliant participants, organised by the Government Technology Agency of Singapore (GovTech).\n\nPrompt engineering is a discipline that blends both art and science - it is as much technical understanding as it is of creativity and strategic thinking. This is a compilation of the prompt engineering strategies I learned along the way, that push any LLM to do exactly what you need and more!\n\nAuthor’s Note:\nIn writing this, I sought to steer away from the traditional prompt engineering techniques that have already been extensively discussed and documented online. Instead, my aim is to bring fre

## Compute the most similar articles for each article

In [8]:
# Create the Spark session (or reuse the one already running in this kernel)
spark = (
    SparkSession.builder
    .appName("SimilarityApp")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/01 12:39:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [19]:
# Step 1: Vectorize 'article content' using TF-IDF
# English stop words are removed; float min_df/max_df are document proportions,
# so terms found in fewer than 1% or more than 80% of the articles are ignored.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.01, max_df=0.8)
# Missing article bodies are replaced by empty strings before vectorizing
tfidf_matrix = tfidf_vectorizer.fit_transform(articles['content'].fillna(''))
tfidf_matrix

<2618x5355 sparse matrix of type '<class 'numpy.float64'>'
	with 811580 stored elements in Compressed Sparse Row format>

In [20]:
# Step 2: compute pairwise cosine similarity scores for all articles
# (called with one argument, cosine_similarity compares the matrix with itself)
top_article_similarity = cosine_similarity(tfidf_matrix)
top_article_similarity.round(3)

array([[1.   , 0.085, 0.045, ..., 0.047, 0.029, 0.173],
       [0.085, 1.   , 0.109, ..., 0.03 , 0.026, 0.044],
       [0.045, 0.109, 1.   , ..., 0.027, 0.019, 0.019],
       ...,
       [0.047, 0.03 , 0.027, ..., 1.   , 0.569, 0.332],
       [0.029, 0.026, 0.019, ..., 0.569, 1.   , 0.214],
       [0.173, 0.044, 0.019, ..., 0.332, 0.214, 1.   ]])

In [21]:
# modified get_similar_articles() by including scores

# Step 3: Recommend Similar articles
def get_similar_articles_with_scores(article_id, top_n=3):
    """
    Return the `top_n` articles most similar to `article_id`, with scores.

    Reads the module-level `articles` DataFrame and the `top_article_similarity`
    matrix, whose rows/columns are assumed to follow the row order of `articles`.

    Parameters:
    - article_id: value to look up in articles['article_id'].
    - top_n: maximum number of similar articles to return.

    Returns:
    - List of (similar_article_id, similarity_score) tuples, highest score first.

    Raises:
    - IndexError: if `article_id` is not present in `articles`.
    """
    # Use the row *position*, not the index label: the similarity matrix is
    # positional, so labels only coincide with positions for a default RangeIndex.
    matches = np.flatnonzero((articles['article_id'] == article_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"article_id {article_id!r} not found in articles")
    article_pos = int(matches[0])

    scores = np.asarray(top_article_similarity[article_pos])
    # Stable descending order keeps ties in their original row order
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the article itself explicitly; it is not guaranteed to sort first
    # (e.g. an article with an all-zero TF-IDF vector has self-similarity 0).
    ranked = ranked[ranked != article_pos][:top_n]

    article_ids = articles['article_id'].to_numpy()
    return [(article_ids[pos], scores[pos]) for pos in ranked]

In [22]:
# Build (article_id, similar_article_id, score) rows for every article
similar_articles_data = []
for article_id in articles['article_id']:
    similar_articles_data.extend(
        (article_id, sim_article_id, float(score))
        for sim_article_id, score in get_similar_articles_with_scores(article_id)
    )

# Spark schema describing the tuples collected above
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("article_id", StringType()),
        ("similar_article_id", StringType()),
        ("similarity_score", FloatType()),
    )
])

similar_articles_data[:6]

[('34c195a93d41', 'a3c1ed861830', 0.38378177402087593),
 ('34c195a93d41', '62d1b61c6f02', 0.37921165381479305),
 ('34c195a93d41', '6ad21c4cfa99', 0.3661479627085197),
 ('8c339f8fb602', '29a7a970358f', 0.35649059059794547),
 ('8c339f8fb602', '444693429eea', 0.28352116024085955),
 ('8c339f8fb602', 'e30a01dc67f8', 0.2800230883355387)]

In [23]:
# Convert the list of (article, similar article, score) tuples to a Spark DataFrame.
# `schema` is the one defined in the previous cell; the name is reused for other
# schemas further down, so this cell must run right after that one.
similar_articles_df = spark.createDataFrame(similar_articles_data, schema=schema)

# Register the DataFrame as the temp view "similar_articles" so it can be queried with Spark SQL
similar_articles_df.createOrReplaceTempView("similar_articles")

In [24]:
# Example usage: look up the stored top-3 similar articles for one article.
# NOTE(review): the id is interpolated straight into the SQL text; fine for a
# hardcoded literal, but do not feed untrusted input through this f-string.
article_id = '34c195a93d41'  # Replace with the article ID you're interested in
query = f"""
SELECT similar_article_id, similarity_score
FROM similar_articles
WHERE article_id = '{article_id}'
ORDER BY similarity_score DESC
LIMIT 3
"""
top_similar_articles = spark.sql(query)
top_similar_articles.show()


                                                                                

+------------------+----------------+
|similar_article_id|similarity_score|
+------------------+----------------+
|      a3c1ed861830|      0.38378176|
|      62d1b61c6f02|      0.37921166|
|      6ad21c4cfa99|      0.36614797|
+------------------+----------------+



In [25]:
# Show the DataFrame's column names and types (a bare Spark DataFrame displays its schema, not its rows)
similar_articles_df

DataFrame[article_id: string, similar_article_id: string, similarity_score: float]

## Compute similar writers

In [26]:
# Work on a deep copy so that `writers_raw` keeps the data exactly as fetched
writers = writers_raw.copy(deep=True)

In [27]:
# Preview the first three writers, transposed so that each field reads as a row
writers.head(3).T

Unnamed: 0,0,1,2
_id,65e232c3b92836486ca7c091,65e232c3b92836486ca7c092,65e232c3b92836486ca7c093
id,fca9db1c7da0,e10ad955760c,76398be9016
username,sheilateozy,nikhiladithyan,machine-learning-made-simple
fullname,Sheila Teo,Nikhil Adithyan,Devansh
bio,"Data Scientist, https://www.linkedin.com/in/sh...",Founder @BacktestZone (https://www.backtestzon...,"Writing about AI, Math, the Tech Industry and ..."
followers_count,2111,7044,13172
following_count,16,41,21
publication_following_count,2,11,2
image_url,https://miro.medium.com/1*UmlZGQsNhuv9kgQL6pFs...,https://miro.medium.com/1*fiFn4AhPBi-CG-cKxHk2...,https://miro.medium.com/1*xiFRgHfgfMR7S111UB2h...
twitter_username,,,Machine01776819


In [28]:
def concatenate_article_info(top_articles):
    """
    Concatenate information from top_articles for TF-IDF vectorization.

    For each article the title, subtitle, tags, topics and top highlight are
    joined with single spaces; the per-article strings are then joined with a
    space and surrounding whitespace is stripped.

    Parameters:
    - top_articles: List of dictionaries, each representing an article.
      A missing value (None or NaN) is treated as an empty list.

    Returns:
    - concatenated_info: String, concatenated information of all articles
      ("" when there are no articles).
    """
    # A writer without articles may carry None/NaN instead of a list
    if top_articles is None or isinstance(top_articles, float):
        return ""

    article_texts = []
    for article in top_articles:
        # `or` also covers keys that are present but set to None, which would
        # otherwise leak the literal text "None" into the corpus (or make
        # ' '.join fail for tags/topics).
        title = article.get('title') or ''
        subtitle = article.get('subtitle') or ''
        tags = ' '.join(article.get('tags') or [])  # list of tags -> space-separated string
        topics = ' '.join(article.get('topics') or [])  # list of topics -> space-separated string
        top_highlight = article.get('top_highlight') or ''

        article_texts.append(f"{title} {subtitle} {tags} {topics} {top_highlight}")

    # Single join instead of repeated string concatenation in the loop
    return " ".join(article_texts).strip()

# Build one text document per writer: the text of their top articles followed by their bio
writers['concatenated_info'] = writers['top_articles'].apply(concatenate_article_info)
# A missing bio would turn the whole concatenation into NaN (str + NaN), which
# TfidfVectorizer cannot process, so a missing bio is treated as an empty string.
writers['concatenated_info'] = writers['concatenated_info'] + " " + writers['bio'].fillna('')

In [29]:
# Inspect the combined text for the first writer
writers['concatenated_info'].iloc[0]

'How I Won Singapore’s GPT-4 Prompt Engineering Competition A deep dive into the strategies I learned for harnessing the power of Large Language Models (LLMs) data-science artificial-intelligence prompt-engineering editors-pick technology artificial-intelligence data-science Use System Prompts to provide instructions that you want the LLM to remember when responding throughout the entire chat. Stacked Ensembles for Advanced Predictive Modeling With H2O.ai and Optuna And how I placed top 10% in Europe’s largest machine learning competition with them! machine-learning data-science deep-learning ensemble-learning python machine-learning data-science Data Scientist, https://www.linkedin.com/in/sheila-teo/'

In [30]:
def compute_averages_and_proportions(top_articles):
    """
    Average the engagement metrics over a writer's top articles.

    Parameters:
    - top_articles: List of dictionaries, each representing an article.
      A missing value (None or NaN) is treated as an empty list.

    Returns:
    - Dictionary with keys 'avg_claps', 'avg_voters', 'avg_word_count',
      'avg_responses_count' and 'avg_reading_time'. Every value is 0 when
      there are no articles. Despite the function name, only averages are
      computed.
    """
    metrics = ('claps', 'voters', 'word_count', 'responses_count', 'reading_time')

    # A writer without articles may carry None/NaN instead of a list
    if top_articles is None or isinstance(top_articles, float):
        top_articles = []
    count = len(top_articles)  # Number of articles to average over

    totals = dict.fromkeys(metrics, 0)
    for article in top_articles:
        for metric in metrics:
            # `or 0` also covers keys that are present but set to None
            totals[metric] += article.get(metric) or 0

    # Guard against division by zero for writers with no articles
    return {
        f'avg_{metric}': (totals[metric] / count if count else 0)
        for metric in metrics
    }

# Compute the per-writer averages once per row and expand the resulting dicts
# into columns, instead of re-running the aggregation once per output column.
avg_columns = ['avg_claps', 'avg_voters', 'avg_word_count',
               'avg_responses_count', 'avg_reading_time']
article_averages = pd.DataFrame(
    writers['top_articles'].apply(compute_averages_and_proportions).tolist(),
    index=writers.index,   # keep row alignment with `writers`
    columns=avg_columns,   # fixed column set, even if `writers` is empty
)
for column in avg_columns:
    writers[column] = article_averages[column]

In [31]:
# Drop the identifier/profile columns that are not used as features further down,
# then display the remaining frame transposed (one writer per column).
writers = writers.drop(columns=[
    '_id',
    'bio',
    'following_count',
    'publication_following_count',
    'image_url',
    'twitter_username',
    'is_writer_program_enrolled',
    'allow_notes',
    'medium_member_at',
    'is_suspended',
    'top_writer_in',
    'has_list',
    'tipping_link',
    'bg_image_url',
    'logo_image_url',
    'top_articles',
])
writers.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
id,fca9db1c7da0,e10ad955760c,76398be9016,d80580992695,8a910484fe84,4beacba7dc8a,630ab5ffdf27,b856005e5ecd,b0fbe613be9d,14176fcb5743,...,8c8e5b7182ef,37a2cbe8bd15,15a29a4fc6ad,fb44e21903f3,c24a3d106811,c3aeaf49d8a4,b1a64eb107f0,c4a298b66f16,86f03cf61226,aff72a0c1243
username,sheilateozy,nikhiladithyan,machine-learning-made-simple,anmol3015,moneytent,jordan_gibbs,jacobistyping,fareedkhandev,cobusgreyling,inchristiely,...,iampaulrose,pareto_investor,miptgirl,frank-andrade,cristianleo120,netflixtechblog,evertongomede,austin-starks,amankharwal,sh-tsang
fullname,Sheila Teo,Nikhil Adithyan,Devansh,Anmol Tomar,Money Tent,Jordan Gibbs,Jacob Bennett,Fareed Khan,Cobus Greyling,Christie C.,...,Paul Rose,The Pareto Investor,Mariya Mansurova,The PyCoach,Cristian Leo,Netflix Technology Blog,"Everton Gomede, PhD",Austin Starks,Aman Kharwal,Sik-Ho Tsang
followers_count,2111,7044,13172,19854,3516,2825,25716,20638,16025,56048,...,40019,38872,7465,135695,5804,404081,13191,4106,21174,11817
is_book_author,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,True,False,True,False
concatenated_info,How I Won Singapore’s GPT-4 Prompt Engineering...,Create a Stock Chatbot with your own CSV Data ...,Understanding Google’s GPT Killer- The Pathway...,"Don’t use loc/iloc with Loops In Python, Inste...",Want to be Rich? DON’T Start a Side Hustle. Wh...,"Forget Prompt Engineering, ChatGPT Can Write P...",My magical first job as a self-taught software...,100x Faster — Scaling Your RAG App for Billion...,"Demonstrate, Search, Predict (DSP) for LLMs Th...",Midjourney V6 New Prompting Technique — Introd...,...,I Built The 5 Income Streams Every Writer Shou...,Why You Should Pay Attention to Perplexity AI ...,Text Embeddings: Comprehensive Guide Evolution...,You’re Not The Only One Feeling AI Fatigue (Or...,Reinforcement Learning 101: Q-Learning Decodin...,Announcing bpftop: Streamlining eBPF performan...,Recommended Books🧐🆕(update at 2024–02–29) I wi...,"I am Aurora, the Most Powerful AI Financial As...",85+ Data Science Projects You Can Try with Pyt...,"Summary: My Paper Reading Lists, Tutorials & S..."
avg_claps,5934.0,680.5,256.3,998.6,43.6,1201.8,1732.2,715.0,180.1,856.6,...,1266.9,890.3,778.6,825.6,609.9,696.1,43.9,138.9,327.7,158.3
avg_voters,1202.0,161.9,48.8,276.9,5.9,213.7,462.0,143.5,19.4,108.7,...,148.6,147.4,168.1,149.2,152.7,189.8,6.1,14.8,113.6,26.1
avg_word_count,4352.5,2127.1,2117.3,891.9,787.8,1247.1,969.9,3552.4,1018.5,1393.1,...,1878.6,829.4,4636.3,1007.3,4090.7,1636.3,1274.5,1370.4,662.6,859.0
avg_responses_count,84.0,10.0,2.1,13.3,0.9,17.8,21.1,8.5,0.6,10.8,...,29.8,16.4,10.8,11.7,5.7,9.9,0.1,1.4,3.0,1.4


In [32]:
# TF-IDF over the per-writer text, with English stop words removed and the
# vocabulary capped at 1000 terms.
# NOTE: this rebinds `tfidf_vectorizer`, which previously held the article vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
# Fit and transform the 'concatenated_info' column
tfidf_writer_matrix = tfidf_vectorizer.fit_transform(writers['concatenated_info'])

#### Now encode the followers

In [48]:
# Writer ids in `writers` row order; this order defines the columns of the follower/writer matrix
all_writers = writers['id']
all_writers

0     fca9db1c7da0
1     e10ad955760c
2      76398be9016
3     d80580992695
4     8a910484fe84
5     4beacba7dc8a
6     630ab5ffdf27
7     b856005e5ecd
8     b0fbe613be9d
9     14176fcb5743
10    9b351e8113e9
11    5d33decdf4c4
12    8c8e5b7182ef
13    37a2cbe8bd15
14    15a29a4fc6ad
15    fb44e21903f3
16    c24a3d106811
17    c3aeaf49d8a4
18    b1a64eb107f0
19    c4a298b66f16
20    86f03cf61226
21    aff72a0c1243
Name: id, dtype: object

In [56]:
# Create a mapping of writer IDs to column indices for efficient lookup
writer_to_index = {writer: index for index, writer in enumerate(all_writers)}

# Coordinate triplets (row, column, value) used to build the sparse matrix
rows, cols, data = [], [], []

# One row per follower (in `followers` row order), one column per writer
for row_idx, writer_ids in enumerate(followers['writer_ids']):
    # A follower whose list is missing (None/NaN) contributes an all-zero row
    if writer_ids is None or isinstance(writer_ids, float):
        continue
    # dict.fromkeys de-duplicates while keeping order: csr_matrix sums repeated
    # (row, col) entries, so a repeated id would otherwise produce a value > 1
    # and the matrix would no longer be binary.
    for writer_id in dict.fromkeys(writer_ids):
        col_idx = writer_to_index.get(writer_id)
        # Ignore writers that are not part of `all_writers`
        if col_idx is not None:
            rows.append(row_idx)
            cols.append(col_idx)
            data.append(1)  # This is a binary matrix, so we just set the value to 1

# Create the sparse matrix directly from the rows, cols, and data
writer_sparse_matrix = sparse.csr_matrix((data, (rows, cols)), shape=(len(followers), len(all_writers)))
writer_sparse_matrix

<68398x22 sparse matrix of type '<class 'numpy.int64'>'
	with 99647 stored elements in Compressed Sparse Row format>

In [60]:
# `writer_sparse_matrix` has one row per follower and one column per writer.
# Transposing gives one row per writer with one column per follower, which is
# the orientation needed to stack it next to the other per-writer features.
follower_sparse_matrix = writer_sparse_matrix.transpose()
follower_sparse_matrix

<22x68398 sparse matrix of type '<class 'numpy.int64'>'
	with 99647 stored elements in Compressed Sparse Column format>

In [62]:
# Encode the boolean author flag as 0/1 so it can sit next to the other numeric features
writers['is_book_author'] = writers['is_book_author'].astype(int)

# Numeric per-writer features as a dense array
numerical_features = writers[[
    'avg_claps',
    'avg_voters',
    'avg_word_count',
    'avg_responses_count',
    'avg_reading_time',
    'followers_count',
    'is_book_author',
]].to_numpy()

# Scale every feature column to unit L2 norm (axis=0 normalizes column-wise)
normalized_numerical_features = normalize(numerical_features, axis=0)

# Sparse representation so it can be stacked with the other sparse blocks
numerical_features_sparse = csr_matrix(normalized_numerical_features)

# Per writer, side by side: TF-IDF text features | follower indicator columns | numeric features
final_matrix = hstack([
    tfidf_writer_matrix,
    follower_sparse_matrix,
    numerical_features_sparse,
])

In [63]:
# Pairwise cosine similarity between all writers
# (called with one argument, cosine_similarity compares the matrix with itself)
writer_similarity_scores = cosine_similarity(final_matrix)

writer_similarity_scores.round(3)

array([[1.   , 0.016, 0.017, 0.016, 0.031, 0.097, 0.01 , 0.015, 0.024,
        0.007, 0.008, 0.015, 0.009, 0.007, 0.033, 0.005, 0.02 , 0.001,
        0.01 , 0.028, 0.005, 0.011],
       [0.016, 1.   , 0.091, 0.121, 0.023, 0.046, 0.111, 0.054, 0.061,
        0.022, 0.036, 0.067, 0.027, 0.056, 0.147, 0.023, 0.17 , 0.003,
        0.148, 0.114, 0.013, 0.038],
       [0.017, 0.091, 1.   , 0.123, 0.016, 0.036, 0.046, 0.114, 0.135,
        0.028, 0.05 , 0.161, 0.047, 0.052, 0.081, 0.05 , 0.048, 0.009,
        0.135, 0.07 , 0.048, 0.091],
       [0.016, 0.121, 0.123, 1.   , 0.029, 0.038, 0.15 , 0.272, 0.102,
        0.082, 0.12 , 0.203, 0.075, 0.149, 0.109, 0.061, 0.069, 0.009,
        0.211, 0.041, 0.065, 0.051],
       [0.031, 0.023, 0.016, 0.029, 1.   , 0.069, 0.007, 0.035, 0.029,
        0.016, 0.02 , 0.01 , 0.028, 0.02 , 0.047, 0.009, 0.013, 0.002,
        0.037, 0.054, 0.002, 0.009],
       [0.097, 0.046, 0.036, 0.038, 0.069, 1.   , 0.025, 0.044, 0.046,
        0.024, 0.024, 0.028, 0.024

In [64]:
def find_top_similar_writers_with_scores(writer_id, writers_df, similarity_scores, top_n=3):
    """
    Return the `top_n` writers most similar to `writer_id`, with scores.

    Parameters:
    - writer_id: value to look up in writers_df['id'].
    - writers_df: DataFrame with an 'id' column; its row order must match the
      rows/columns of `similarity_scores`.
    - similarity_scores: square matrix of pairwise writer similarities.
    - top_n: maximum number of similar writers to return.

    Returns:
    - List of (similar_writer_id, similarity_score) tuples, highest score first.

    Raises:
    - IndexError: if `writer_id` is not present in `writers_df`.
    """
    # Use the row *position*, not the index label: the similarity matrix is
    # positional, so labels only coincide with positions for a default RangeIndex.
    matches = np.flatnonzero((writers_df['id'] == writer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"writer id {writer_id!r} not found in writers_df")
    writer_pos = int(matches[0])

    # Similarity of this writer against all writers (including itself)
    scores_for_writer = similarity_scores[writer_pos]

    # Highest scores first; keep top_n + 1 so that removing the writer itself
    # still leaves up to top_n candidates
    ranked = sorted(enumerate(scores_for_writer), key=lambda pair: pair[1], reverse=True)[:top_n + 1]

    writer_ids = writers_df['id'].to_numpy()
    # Filter out the writer itself and limit to top n
    return [(writer_ids[pos], score) for pos, score in ranked if pos != writer_pos][:top_n]

# Example usage: top similar writers (default top_n=3) for a single writer id
similar_writers = find_top_similar_writers_with_scores('fca9db1c7da0', writers, writer_similarity_scores)
print(f"Top similar writers and their similarity scores: {similar_writers}")


Top similar writers and their similarity scores: [('4beacba7dc8a', 0.09675307019166673), ('15a29a4fc6ad', 0.03282717683032892), ('8a910484fe84', 0.031212698617862553)]


In [65]:
# Build (writer_id, similar_writer_id, score) rows for every writer
similar_writers_data = []
for writer_id in writers['id']:
    similar_writers_data.extend(
        (writer_id, sim_writer_id, float(score))
        for sim_writer_id, score in find_top_similar_writers_with_scores(
            writer_id, writers, writer_similarity_scores
        )
    )

# Spark schema describing the tuples collected above
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("writer_id", StringType()),
        ("similar_writer_id", StringType()),
        ("similarity_score", FloatType()),
    )
])

similar_writers_data[:6]

[('fca9db1c7da0', '4beacba7dc8a', 0.09675307019166673),
 ('fca9db1c7da0', '15a29a4fc6ad', 0.03282717683032892),
 ('fca9db1c7da0', '8a910484fe84', 0.031212698617862553),
 ('e10ad955760c', 'c24a3d106811', 0.1696063630601724),
 ('e10ad955760c', 'b1a64eb107f0', 0.14755092545717924),
 ('e10ad955760c', '15a29a4fc6ad', 0.14693765859730573)]

In [66]:
# Convert the list of (writer, similar writer, score) tuples to a Spark DataFrame.
# `schema` is the one defined in the previous cell; the name is shared with other
# schemas in this notebook, so this cell must run right after that one.
similar_writers_df = spark.createDataFrame(similar_writers_data, schema=schema)

# Register the DataFrame as the temp view "similar_writers" so it can be queried with Spark SQL
similar_writers_df.createOrReplaceTempView("similar_writers")

In [67]:
# Example usage: look up the stored top-3 similar writers for one writer.
# NOTE(review): the id is interpolated straight into the SQL text; fine for a
# hardcoded literal, but do not feed untrusted input through this f-string.
writer_id = '630ab5ffdf27'  # Replace with the writer ID you're interested in
query = f"""
SELECT similar_writer_id, similarity_score
FROM similar_writers
WHERE writer_id = '{writer_id}'
ORDER BY similarity_score DESC
LIMIT 3
"""
top_similar_writers = spark.sql(query)
top_similar_writers.show()

+-----------------+----------------+
|similar_writer_id|similarity_score|
+-----------------+----------------+
|     d80580992695|      0.14995342|
|     37a2cbe8bd15|      0.11766463|
|     9b351e8113e9|      0.11608657|
+-----------------+----------------+



## Recommend writers to followers based on the behavior of similar followers

In [78]:
# Pair every follower id with the list of writers that follower follows
data_list = list(zip(followers['follower_id'], followers['writer_ids']))

# Long format: one (Follower, Writer) row per follow relationship
df = (
    pd.DataFrame(data_list, columns=['Follower', 'Writer'])
    .explode('Writer')
    .reset_index(drop=True)
)

# Follower x writer matrix holding the number of (Follower, Writer) rows per cell
binary_matrix = df.pivot_table(index='Follower', columns='Writer', aggfunc='size', fill_value=0)

# Pairwise cosine similarity between followers.
# NOTE(review): the result is a dense followers x followers array, so memory
# grows quadratically with the number of followers.
similarity_matrix = cosine_similarity(binary_matrix)

# Label rows and columns with follower ids for lookup by id
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=binary_matrix.index,
    columns=binary_matrix.index,
)

  ret = a @ b


In [79]:
def recommend_writers_for_follower(follower_id, binary_matrix, similarity_df, top_n=3):
    """
    Recommend writers to a follower based on what similar followers follow.

    Parameters:
    - follower_id: label of the target follower (must be a row of
      `binary_matrix` and a column of `similarity_df`).
    - binary_matrix: follower x writer DataFrame of follow indicators/counts.
    - similarity_df: follower x follower DataFrame of similarity scores.
    - top_n: number of similar followers to consult, and maximum number of
      writers to return.

    Returns:
    - List of up to `top_n` writer ids, most supported first. Only writers
      followed by at least one of the similar followers and not already
      followed by the target are returned, so the list may be shorter than
      `top_n` (or empty).

    Raises:
    - KeyError: if `follower_id` is unknown to either frame.
    """
    # Remove the target follower by label instead of assuming it sorts first:
    # that assumption fails when the diagonal is not the column maximum.
    neighbour_scores = similarity_df[follower_id].drop(labels=follower_id, errors='ignore')
    # mergesort is stable, so equally similar followers keep a deterministic order
    most_similar_followers = neighbour_scores.sort_values(ascending=False, kind='mergesort').index[:top_n]

    # Number of similar followers following each writer
    writer_support = binary_matrix.loc[most_similar_followers].sum()

    # Exclude writers the target already follows, and writers none of the
    # similar followers follow (zero support is not a recommendation).
    already_followed = binary_matrix.loc[follower_id]
    candidates = writer_support[(already_followed == 0) & (writer_support > 0)]

    return candidates.sort_values(ascending=False, kind='mergesort').head(top_n).index.tolist()

# Example usage: recommendations (default top_n=3) for a single follower
follower_id = '00008456a9f0'  # Replace with the actual follower ID
recommendations = recommend_writers_for_follower(follower_id, binary_matrix, similarity_df)
print(f"Recommended writers for {follower_id}: {recommendations}")

Recommended writers for 00008456a9f0: ['15a29a4fc6ad', '37a2cbe8bd15', '4beacba7dc8a']


In [83]:
# Build (follower_id, recommended_writer_id) rows for every follower in the matrix index
recommendations_data = []
for follower_id in binary_matrix.index:
    recommendations_data.extend(
        (follower_id, writer_id)
        for writer_id in recommend_writers_for_follower(follower_id, binary_matrix, similarity_df)
    )

# Spark schema describing the tuples collected above
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("follower_id", "recommended_writer_id")
])

In [84]:
# Convert the list of (follower, recommended writer) tuples to a Spark DataFrame.
# `schema` is the one defined in the previous cell; the name is shared with other
# schemas in this notebook, so this cell must run right after that one.
recommendations_df = spark.createDataFrame(recommendations_data, schema=schema)

# Register the DataFrame as the temp view "writer_recommendations" so it can be queried with Spark SQL
recommendations_df.createOrReplaceTempView("writer_recommendations")

In [85]:
# Example SparkSQL query to display recommended writers for a given follower_id.
# NOTE(review): the id is interpolated straight into the SQL text; fine for a
# hardcoded literal, but do not feed untrusted input through this f-string.
query_follower_id = '00008456a9f0'  # Replace with the follower ID you're interested in
query = f"""
SELECT recommended_writer_id
FROM writer_recommendations
WHERE follower_id = '{query_follower_id}'
"""
follower_recommendations = spark.sql(query)
follower_recommendations.show()

                                                                                

+---------------------+
|recommended_writer_id|
+---------------------+
|         15a29a4fc6ad|
|         37a2cbe8bd15|
|         4beacba7dc8a|
+---------------------+

