In [1]:
from collections import defaultdict

from pyspark.sql.session import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    lead, lag, unix_timestamp, 
    when, col, sum, udf, avg
)
from pyspark.sql.types import StringType
import pyspark

from delta import *
from delta.tables import *

import json
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Word2Vec
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import PCA
from pyspark.sql.functions import col, lower, regexp_replace
import re
from pyspark.sql.types import FloatType
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import lit
import numpy as np



builder = (
    SparkSession.builder.appName("Refr")
    .config("spark.sql.extensions", 
            "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", 
            "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

Data discovery

In [2]:
# Schema
data = spark.read.format("json").option("inferSchema","true").load("dblp-ref-0.json")
schema = data.schema

for i in range(1, 4):
    file_name = f"dblp-ref-{i}.json"
    df = spark.read.json(file_name, schema=schema)
    data = data.union(df)

data.printSchema()
data.show(5)
data.describe().show()



root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|The purpose of th...|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|         0|[51c7e02e-f5ed-43...|Preliminary Desig...|international con...|2013|
|This paper descri...

Remove non-English titles

In [3]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType

data = data.na.drop(subset=["title"])

# Ascii
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

# UDF
is_ascii_udf = udf(is_ascii, BooleanType())


# Filter
df_eng = data.filter(is_ascii_udf(data['title']))

#df_eng.show(5)



Remove stop words and lower case

In [4]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Lower
df_eng = df_eng.withColumn("title", lower(col("title")))

tokenizer = Tokenizer(inputCol="title", outputCol="words")
df_eng = tokenizer.transform(df_eng)

# Word list
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure','rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', 'www']

# Stop words
stop_words = StopWordsRemover().getStopWords()

stop_words = stop_words + custom_stop_words

# StopWordRemover
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)

df_eng = remover.transform(df_eng)


Regex

In [5]:
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql.functions import lower, col
from pyspark.sql.functions import concat_ws

# toString
df_eng = df_eng.withColumn("filtered", concat_ws(" ", col("filtered")))

# Regex
pattern = "[!()-\[\]{};:'\"\,<>./?@#$%^&*_~]"

# Regex transform
tokenizer = RegexTokenizer(inputCol="filtered", outputCol="low_filtered", pattern=pattern, gaps=True)
df_eng = tokenizer.transform(df_eng)




In [6]:
#df_eng.select("low_filtered").show(25)

Word2Vec

In [7]:
from pyspark.ml.feature import Word2Vec

df_eng = df_eng.repartition(10)

word2Vec = Word2Vec(vectorSize=2, minCount=5, inputCol="low_filtered", outputCol="result")

# Fit the model
model = word2Vec.fit(df_eng)

# Transform the DataFrame
result = model.transform(df_eng)


In [8]:
# Find synonyms
synonyms = model.findSynonyms('psychology', 5)
for word, cosine_distance in synonyms.collect():
    print("{}: {}".format(word, cosine_distance))

johnson: 1.0
s f: 0.9999999403953552
cp : 0.9999998807907104
spur: 0.9999998807907104
singly: 0.9999998807907104


Elbow method

In [9]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import PCA

# Apply PCA
pca = PCA(k=2, inputCol="result", outputCol="pcaFeatures")
model = pca.fit(result)
result = model.transform(result)

# Apply KMeans
kmeans = KMeans(k=3, seed=1, featuresCol="pcaFeatures")  
model = kmeans.fit(result.select('pcaFeatures'))
predictions = model.transform(result)


Search engine

In [10]:
# Define a UDF to compute cosine similarity
def cosine_similarity(v1, v2):
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

cosine_similarity_udf = udf(cosine_similarity, FloatType())

# Get the result of the filter operation
filtered_result = result.filter(result.title == 'crowdsourcing as lego: unpacking the building blocks of crowdsourcing collaboration processes').select('result').first()

# Check if filtered_result is not None before accessing its elements
if filtered_result is not None:
    input_vector = filtered_result[0].toArray().tolist()  # Convert DenseVector to list
else:
    print("No rows found")


# Calculate cosine similarity with all other papers
result = result.withColumn('similarity', cosine_similarity_udf('result', lit(input_vector)))

# Recommend the top N papers
recommendations = result.orderBy('similarity', ascending=False).limit(50)


In [11]:
recommendations.show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|               words|            filtered|        low_filtered|              result|         pcaFeatures|similarity|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                NULL|[Helga Naessens, ...|139f0297-bc9c-46a...|        50|                NULL|generating hasse ...|                    |1999|[generating, hass...|generating hasse ...|[generating hasse...|[0.04697725549

# Debug

In [None]:
from pyspark.sql.types import FloatType
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import lit

# Define a UDF to compute cosine similarity
def cosine_similarity(v1, v2):
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

cosine_similarity_udf = udf(cosine_similarity, FloatType())

# Get the unique titles in the DataFrame
unique_titles = result.select('title').distinct().rdd.flatMap(lambda x: x).collect()

# Iterate over the unique titles
for title in unique_titles:
    # Get the result of the filter operation
    filtered_result = result.filter(result.title == title).select('result').first()

    # Check if filtered_result is not None before accessing its elements
    if filtered_result is not None:
        input_vector = Vectors.dense(filtered_result[0])  # Convert numpy array to PySpark Vector
        print(f"Found a non-empty result for title: {title}")
        break
    else:
        print(f"No rows found for title: {title}")

# Check the dimensionality of the vectors
vector_length = len(result.select('result').first()[0])
input_vector_length = len(input_vector)

if vector_length != input_vector_length:
    print(f"Vector dimensions do not match: input_vector has {input_vector_length} dimensions, but vectors in 'result' column have {vector_length} dimensions.")
else:
    # Calculate cosine similarity with all other papers
    result = result.withColumn('similarity', cosine_similarity_udf('result', lit(input_vector.tolist())))  # Convert DenseVector to list

    # Recommend the top N papers
    recommendations = result.orderBy('similarity', ascending=False).limit(5)

In [None]:
unique_titles = result.select('id').distinct()
unique_titles.show()
