In [2]:
from pyspark.sql import SparkSession
import os

# Tell PySpark which Python binary to use.
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"

# Obtain the session for this notebook (an existing one is reused, otherwise
# a new one named "SearchEngine" is created).
spark = (
    SparkSession.builder
    .appName("SearchEngine")
    .getOrCreate()
)


In [9]:
from pyspark.sql.functions import col

# Load the paper metadata CSV (first row is the header, column types are
# inferred) and force citedby_count to an integer column in the same chain.
paperDF = (
    spark.read
    .csv('data/All_Data.csv', header=True, inferSchema=True)
    .withColumn("citedby_count", col("citedby_count").cast("int"))
)

# Print the resulting column names and types as a sanity check.
paperDF.printSchema()

root
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- language: string (nullable = true)
 |-- citedby_count: integer (nullable = true)
 |-- publisher: string (nullable = true)
 |-- subject_codes: string (nullable = true)
 |-- keywords: string (nullable = true)



In [10]:
from pyspark.sql.functions import concat_ws

# Build one free-text field per paper from the columns worth searching,
# joined with single spaces.
df = paperDF.withColumn(
    "search_text",
    concat_ws(" ", "title", "subject_codes", "abstract", "keywords"),
)

# Display the combined text without cutting long values short.
df.select("search_text").show(truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by every call to generate_embedding
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def generate_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return a plain list.

    Args:
        text: Input handed straight to ``model.encode``.

    Returns:
        The encoder output converted with ``.tolist()``.
    """
    vector = model.encode(text)
    return vector.tolist()


In [12]:
# Collect the Spark DataFrame onto the driver as a pandas DataFrame.
# NOTE: this materializes every row in local memory.
df_pandas = df.toPandas()

# Encode all search_text values in a single call so the model can batch them
# internally, instead of invoking model.encode once per row through
# Series.apply. The outer .tolist() turns the returned 2-D array into one
# plain Python list per row, matching what generate_embedding() produced.
# NOTE(review): batched vectors should equal the per-row ones up to
# floating-point rounding -- confirm if bit-exact reproducibility matters.
df_pandas["embedding"] = model.encode(df_pandas["search_text"].tolist()).tolist()

In [13]:
import pandas as pd

def _embedding_to_csv_field(embedding):
    """Serialize one embedding as a comma-separated string for CSV storage.

    Args:
        embedding: An iterable of numbers, or a string that was already
            produced by this function.

    Returns:
        str: The components joined with ``','``. A string input is returned
        unchanged, so re-running this cell does not re-join the individual
        characters of an already serialized embedding.
    """
    if isinstance(embedding, str):
        return embedding
    return ','.join(map(str, embedding))


# Flatten each embedding into one comma-separated string so it fits in a single CSV field
df_pandas["embedding"] = df_pandas["embedding"].apply(_embedding_to_csv_field)

# Save to a CSV file
df_pandas.to_csv("data/embeddings.csv", index=False)

In [15]:
import numpy as np

# Function to parse an embedding and round it to float16 precision
def parse_and_convert(embedding_str):
    """Return an embedding as a list of floats rounded to float16 precision.

    Args:
        embedding_str: Either the comma-separated string form of an embedding
            (e.g. ``"0.1,0.2,0.3"``) or an already-parsed sequence of numbers
            (list, tuple or NumPy array). Accepting the parsed form keeps the
            calling cell safe to re-run after the column has been converted.

    Returns:
        list[float]: The components after a round trip through
        ``np.float16``. A blank string yields ``[]``, which is what an empty
        embedding looks like once joined with ``','``.

    Raises:
        ValueError: If a component of the string cannot be parsed as a float.
    """
    if isinstance(embedding_str, str):
        text = embedding_str.strip()
        # "".split(',') would give [''] and float('') raises, so treat a
        # blank field as an embedding with no components.
        values = [float(part) for part in text.split(',')] if text else []
    else:
        values = embedding_str
    # The float16 array performs the rounding; tolist() hands back Python floats.
    return np.array(values, dtype=np.float16).tolist()

# Run every stored embedding through parse_and_convert (float16 rounding)
df_pandas["embedding"] = df_pandas["embedding"].map(parse_and_convert)

# Write the whole frame, embeddings included, as a Snappy-compressed Parquet file
df_pandas.to_parquet(
    "data_with_embeddings.parquet",
    index=False,
    compression="snappy",
)
