In [1]:
import findspark
findspark.init()
spark_url = 'local'
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, explode

spark = SparkSession.builder\
        .master(spark_url)\
        .appName('Spark Tutorial')\
        .config('spark.ui.port', '4040')\
        .config("spark.driver.memory", "8g") \
        .config("spark.executor.memory", "8g") \
        .config("spark.executor.cores", "4") \
        .config("spark.memory.fraction", "0.8") \
        .config("spark.driver.maxResultSize", "2g") \
        .getOrCreate()
sc = spark.sparkContext

In [2]:
import os

path = "data/2018/201800003.json"
paperDF = spark.read.json(path , multiLine=True)
# Read all JSON files from the folder
folder_path = "data/2018/"
json_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.json')]

# Read each JSON file into a DataFrame and union them
paperDF = spark.read.json(json_files, multiLine=True)

In [3]:
paperDF.printSchema()

root
 |-- abstracts-retrieval-response: struct (nullable = true)
 |    |-- affiliation: string (nullable = true)
 |    |-- authkeywords: struct (nullable = true)
 |    |    |-- author-keyword: string (nullable = true)
 |    |-- authors: struct (nullable = true)
 |    |    |-- author: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- @_fa: string (nullable = true)
 |    |    |    |    |-- @auid: string (nullable = true)
 |    |    |    |    |-- @seq: string (nullable = true)
 |    |    |    |    |-- affiliation: string (nullable = true)
 |    |    |    |    |-- author-url: string (nullable = true)
 |    |    |    |    |-- ce:degrees: string (nullable = true)
 |    |    |    |    |-- ce:given-name: string (nullable = true)
 |    |    |    |    |-- ce:indexed-name: string (nullable = true)
 |    |    |    |    |-- ce:initials: string (nullable = true)
 |    |    |    |    |-- ce:suffix: string (nullable = true)
 |    |    |    |    

In [4]:
# Extract the abstract and the subject codes
abstract_subject_DF = paperDF.select(
    col("abstracts-retrieval-response.item.bibrecord.head.abstracts").alias("abstract"),
    explode(col("abstracts-retrieval-response.subject-areas.subject-area")).alias("subject_area")
)

# Select the abstract and the @code field from the exploded subject_area
abstract_subject_DF = abstract_subject_DF.select(
    col("abstract"),
    col("subject_area.@code").alias("subject_code")
)

abstract_subject_DF.show(5)

+--------------------+------------+
|            abstract|subject_code|
+--------------------+------------+
|© 2018, CERN for ...|        2201|
|© 2018, CERN for ...|        3101|
|© CERN, for the b...|        3106|
|© 2018, CERN for ...|        2201|
|© 2018, CERN for ...|        3101|
+--------------------+------------+
only showing top 5 rows



In [None]:
# num_rows = abstract_subject_DF.count()
# num_cols = len(abstract_subject_DF.columns)
# print(f"Shape of DataFrame: ({num_rows}, {num_cols})")

Shape of DataFrame: (6518, 2)


In [6]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml import Pipeline

# Drop rows with null values
abstract_subject_DF = abstract_subject_DF.dropna()

# Tokenize the abstract column
tokenizer = Tokenizer(inputCol="abstract", outputCol="words")

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Convert text to vector of token counts
vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="features")

# Create a pipeline to execute the transformations
pipeline = Pipeline(stages=[tokenizer, remover, vectorizer])

# Fit and transform the data
model = pipeline.fit(abstract_subject_DF)
preprocessed_DF = model.transform(abstract_subject_DF)

preprocessed_DF.show(5)

+--------------------+------------+--------------------+--------------------+--------------------+
|            abstract|subject_code|               words|      filtered_words|            features|
+--------------------+------------+--------------------+--------------------+--------------------+
|© 2018, CERN for ...|        2201|[©, 2018,, cern, ...|[©, 2018,, cern, ...|(62400,[0,1,6,15,...|
|© 2018, CERN for ...|        3101|[©, 2018,, cern, ...|[©, 2018,, cern, ...|(62400,[0,1,6,15,...|
|© CERN, for the b...|        3106|[©, cern,, for, t...|[©, cern,, benefi...|(62400,[0,1,6,19,...|
|© 2018, CERN for ...|        2201|[©, 2018,, cern, ...|[©, 2018,, cern, ...|(62400,[0,5,6,14,...|
|© 2018, CERN for ...|        3101|[©, 2018,, cern, ...|[©, 2018,, cern, ...|(62400,[0,5,6,14,...|
+--------------------+------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

# Convert subject_code to numeric
indexer = StringIndexer(inputCol="subject_code", outputCol="subject_code_index")
indexed_DF = indexer.fit(preprocessed_DF).transform(preprocessed_DF)

# Split the data into training and test sets
trainDF, testDF = indexed_DF.randomSplit([0.8, 0.2], seed=42)

# Initialize the Logistic Regression model
lr = LogisticRegression(featuresCol='features', labelCol='subject_code_index', maxIter=10, regParam=0.1, elasticNetParam=0.5)

# Train the model
lr_model = lr.fit(trainDF)

# Make predictions on the test set
predictions = lr_model.transform(testDF)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='subject_code_index', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy = {accuracy}")

In [8]:
#spark.stop()

In [None]:
# TODO: REDO CLASSIFICATION, TRY DOING IN BATCHES INSTEAD