In [1]:
import findspark
findspark.init()
spark_url = 'local'
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, explode

spark = SparkSession.builder\
        .master(spark_url)\
        .appName('Spark Tutorial')\
        .config('spark.ui.port', '4040')\
        .config("spark.driver.memory", "8g") \
        .config("spark.executor.memory", "8g") \
        .config("spark.executor.cores", "4") \
        .config("spark.memory.fraction", "0.8") \
        .config("spark.driver.maxResultSize", "2g") \
        .getOrCreate()
sc = spark.sparkContext

In [2]:
import os

# Read all JSON files from the folder
folder_path = "data/2018"
json_files = []
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.endswith('.json'):
            json_files.append(os.path.join(root, file))

# Read each JSON file into a DataFrame and union them
paperDF = spark.read.json(json_files, multiLine=True)

In [3]:
paperDF.printSchema()

root
 |-- abstracts-retrieval-response: struct (nullable = true)
 |    |-- affiliation: string (nullable = true)
 |    |-- authkeywords: struct (nullable = true)
 |    |    |-- author-keyword: string (nullable = true)
 |    |-- authors: struct (nullable = true)
 |    |    |-- author: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- @_fa: string (nullable = true)
 |    |    |    |    |-- @auid: string (nullable = true)
 |    |    |    |    |-- @seq: string (nullable = true)
 |    |    |    |    |-- affiliation: string (nullable = true)
 |    |    |    |    |-- author-url: string (nullable = true)
 |    |    |    |    |-- ce:degrees: string (nullable = true)
 |    |    |    |    |-- ce:given-name: string (nullable = true)
 |    |    |    |    |-- ce:indexed-name: string (nullable = true)
 |    |    |    |    |-- ce:initials: string (nullable = true)
 |    |    |    |    |-- ce:suffix: string (nullable = true)
 |    |    |    |    

In [3]:
# Extract the abstract and the subject codes
abstract_subject_DF = paperDF.select(
    col("abstracts-retrieval-response.item.bibrecord.head.abstracts").alias("abstract"),
    explode(col("abstracts-retrieval-response.subject-areas.subject-area")).alias("subject_area")
)

# Select the abstract and the @code field from the exploded subject_area
abstract_subject_DF = abstract_subject_DF.select(
    col("abstract"),
    col("subject_area.@abbrev").alias("subject_code")
)

#abstract_subject_DF.show(5)

In [None]:
# num_rows = abstract_subject_DF.count()
# num_cols = len(abstract_subject_DF.columns)
# print(f"Shape of DataFrame: ({num_rows}, {num_cols})")

Shape of DataFrame: (6518, 2)


In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

# Tokenize the text
tokenizer = Tokenizer(inputCol="abstract", outputCol="words")
# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
# Convert text to numerical features using TF-IDF
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10000)
idf = IDF(inputCol="raw_features", outputCol="features")
# Index the labels
indexer = StringIndexer(inputCol="subject_code", outputCol="subject_code_index")

# Create a pipeline to execute the transformations
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, indexer])

# Fit and transform the data
preprocessed_DF = pipeline.fit(abstract_subject_DF).transform(abstract_subject_DF)

# Show the preprocessed data
preprocessed_DF.show(5)

Py4JJavaError: An error occurred while calling o2330.fit.
: java.lang.IllegalStateException: Haven't seen any document yet.
	at org.apache.spark.mllib.feature.IDF$DocumentFrequencyAggregator.idf(IDF.scala:135)
	at org.apache.spark.mllib.feature.IDF.fit(IDF.scala:55)
	at org.apache.spark.ml.feature.IDF.fit(IDF.scala:93)
	at org.apache.spark.ml.feature.IDF.fit(IDF.scala:69)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1623)


In [4]:
from pyspark.sql.functions import count

# Add a count column to the DataFrame
abstract_subject_count_DF = abstract_subject_DF.groupBy("subject_code").agg(count("*").alias("count"))

# Order by count in descending order and show all rows
abstract_subject_count_DF.orderBy(col("count").desc()).show(n=abstract_subject_count_DF.count(), truncate=False)

# Order by count in ascending order and show all rows
#abstract_subject_count_DF.orderBy(col("count").asc()).show(n=abstract_subject_count_DF.count(), truncate=False)

+------------+-----+
|subject_code|count|
+------------+-----+
|MEDI        |1053 |
|ENGI        |619  |
|COMP        |514  |
|BIOC        |505  |
|MATE        |445  |
|PHYS        |423  |
|CHEM        |386  |
|AGRI        |319  |
|ENVI        |299  |
|CENG        |235  |
|PHAR        |215  |
|SOCI        |214  |
|IMMU        |182  |
|ENER        |157  |
|EART        |129  |
|MATH        |126  |
|VETE        |116  |
|NEUR        |108  |
|BUSI        |100  |
|MULT        |84   |
|DENT        |62   |
|ECON        |47   |
|NURS        |44   |
|DECI        |42   |
|ARTS        |41   |
|PSYC        |27   |
|HEAL        |26   |
+------------+-----+



In [5]:
# from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier


# Split the data into training and test sets
trainDF, testDF = preprocessed_DF.randomSplit([0.8, 0.2], seed=42)

# Initialize the Random Forest model
rf = RandomForestClassifier(featuresCol='features', labelCol='subject_code_index', numTrees=100)

# Train the model
rf_model = rf.fit(trainDF)

# Make predictions on the test set
predictions = rf_model.transform(testDF)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol='subject_code_index', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy = {accuracy}")

NameError: name 'preprocessed_DF' is not defined

In [8]:
#spark.stop()

In [None]:
# TODO: REDO CLASSIFICATION, TRY DOING IN BATCHES INSTEAD

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

# Tokenize the text
tokenizer = Tokenizer(inputCol="abstract", outputCol="words")
# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
# Convert text to numerical features using TF-IDF
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=10000)
idf = IDF(inputCol="raw_features", outputCol="features")
# Index the labels
indexer = StringIndexer(inputCol="subject_code", outputCol="subject_code_index")

# Create a pipeline to execute the transformations
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, indexer])

# Filter out rows with null values in the abstract column
abstract_subject_DF = abstract_subject_DF.filter(col("abstract").isNotNull())

# Fit and transform the data
preprocessed_DF = pipeline.fit(abstract_subject_DF).transform(abstract_subject_DF)

# Show the preprocessed data
preprocessed_DF.show(5)

+--------------------+------------+--------------------+--------------------+--------------------+--------------------+------------------+
|            abstract|subject_code|               words|      filtered_words|        raw_features|            features|subject_code_index|
+--------------------+------------+--------------------+--------------------+--------------------+--------------------+------------------+
|© 2018, CERN for ...|        ENGI|[©, 2018,, cern, ...|[©, 2018,, cern, ...|(10000,[274,307,3...|(10000,[274,307,3...|               1.0|
|© 2018, CERN for ...|        PHYS|[©, 2018,, cern, ...|[©, 2018,, cern, ...|(10000,[274,307,3...|(10000,[274,307,3...|               5.0|
|© CERN, for the b...|        PHYS|[©, cern,, for, t...|[©, cern,, benefi...|(10000,[157,688,8...|(10000,[157,688,8...|               5.0|
|© 2018, CERN for ...|        ENGI|[©, 2018,, cern, ...|[©, 2018,, cern, ...|(10000,[45,264,31...|(10000,[45,264,31...|               1.0|
|© 2018, CERN for ...|     

In [11]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Create a parameter grid for tuning
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [100, 200, 300]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.maxBins, [32, 64]) \
    .build()

# Create a CrossValidator
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainDF)

# Make predictions on the test set
predictions = cvModel.transform(testDF)

# Evaluate the model
accuracy = evaluator.evaluate(predictions)
print(f"Test set accuracy after tuning = {accuracy}")

KeyboardInterrupt: 

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Initialize the KMeans model
kmeans = KMeans(featuresCol='features', k=5, seed=42)

# Train the model
kmeans_model = kmeans.fit(preprocessed_DF)

# Make predictions
kmeans_predictions = kmeans_model.transform(preprocessed_DF)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator(featuresCol='features', metricName='silhouette', distanceMeasure='squaredEuclidean')
silhouette = evaluator.evaluate(kmeans_predictions)
print(f"Silhouette with squared Euclidean distance = {silhouette}")

# Show the cluster centers
centers = kmeans_model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)