In [None]:
from pyspark.sql import SparkSession
import os

# Tell PySpark which Python interpreter to use (read from the environment
# by PySpark; set before the session is created).
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"

# Build the session handle used by every later cell; getOrCreate() reuses
# an already-running session instead of starting a second one.
session_builder = SparkSession.builder.appName("SearchEngine")
spark = session_builder.getOrCreate()


In [None]:
from pyspark.sql.functions import col

# Load the cleaned paper CSV (first row is the header, column types are
# inferred) and force citedby_count to an integer column in the same chain.
paperDF = (
    spark.read
    .csv('data/data_cleaned.csv', header=True, inferSchema=True)
    .withColumn("citedby_count", col("citedby_count").cast("int"))
)

# Print the resulting schema so the inferred/cast types can be inspected.
paperDF.printSchema()

In [None]:
from pyspark.sql.functions import concat_ws

# Columns whose text is joined (space-separated, in this order) to form the
# single search_text column.
search_fields = ["title", "subject_codes", "abstract", "keywords"]

df = paperDF.withColumn("search_text", concat_ws(" ", *search_fields))

# Preview the combined text without truncating long values.
df.select("search_text").show(truncate=False)


In [None]:
from sentence_transformers import SentenceTransformer

# Name of the Sentence Transformers checkpoint used to embed search_text.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Instantiate the encoder once; later cells reference it as `model`.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def generate_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return a plain list.

    The encoder output is converted with ``.tolist()``, so callers receive
    built-in Python containers rather than an array object.
    """
    vector = model.encode(text)
    return vector.tolist()

In [None]:
# Collect the Spark DataFrame into a pandas DataFrame so the embedding model
# can be run over the search_text column.
df_pandas = df.toPandas()

# Encode all texts with a single call instead of one model.encode() call per
# row: encode() accepts a list of sentences and batches them internally,
# which avoids a separate forward pass for every row.
# NOTE(review): batched encoding may differ from per-row encoding at the
# level of floating-point rounding; the embeddings are otherwise equivalent.
search_texts = df_pandas["search_text"].tolist()
df_pandas["embedding"] = model.encode(search_texts).tolist()

In [None]:
import numpy as np

# Helper that rounds an embedding's values to float16 precision
def convert_to_float16(embedding_list):
    """Round every value of ``embedding_list`` to float16 precision.

    The values are cast to ``np.float16`` and then converted back with
    ``.tolist()``, so the result is a plain Python list whose elements are
    the float16-rounded values.
    """
    half_precision = np.asarray(embedding_list, dtype=np.float16)
    return half_precision.tolist()

# Round every stored embedding to float16 precision.
# NOTE(review): convert_to_float16 returns Python floats (via .tolist()), so
# the Parquet column is presumably not stored as float16 — confirm that the
# rounding gives the intended size reduction.
quantized_embeddings = df_pandas["embedding"].map(convert_to_float16)
df_pandas["embedding"] = quantized_embeddings

# Persist the frame (without the pandas index) as Snappy-compressed Parquet.
df_pandas.to_parquet(
    "data/data_with_embeddings.parquet",
    compression="snappy",
    index=False,
)
