In [41]:
import os
os.environ["JAVA_HOME"] = "/home/WARK/jdk-11.0.25+9"  

from pyspark.sql import SparkSession

# Initialize Spark session with increased memory
spark = SparkSession.builder \
    .appName("WARK Data Pipeline - Predict Subject Area") \
    .master("local[*]") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "16g") \
    .getOrCreate()


# Load CSV data into a Spark DataFrame
data_path = "/home/WARK/DSDE_Final_Project_WARK/Final/joined_2017-2023.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Show the data schema
df.printSchema()
df.show(5)

root
 |-- citation_title: string (nullable = true)
 |-- abstracts: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- authors_with_location_department: string (nullable = true)
 |-- affiliations: string (nullable = true)
 |-- classifications: string (nullable = true)
 |-- subject_area_name: string (nullable = true)
 |-- subject_area_code: string (nullable = true)
 |-- date: string (nullable = true)
 |-- citedby_count: string (nullable = true)
 |-- category: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------------------+--------------------+---------------+--------------------+-----------------+----------+-------------+--------------------+
|      citation_title|           abstracts|             authors|authors_with_location_department|        affiliations|classifications|   subject_area_name|subject_area_code|      date|citedby_count|            category|
+--------------------+--------------------+-----------

In [42]:
# Drop the redundant column
# df = df.drop('authors')
df = df.drop('authors_with_location_department')
# df = df.drop('affiliations')
df = df.drop('classifications')
# df = df.drop('subject_area_name')
# df = df.drop('subject_area_code')
df = df.drop('date')
df = df.drop('citedby_count')

# Show the data schema
df.printSchema()
df.show(5)


root
 |-- citation_title: string (nullable = true)
 |-- abstracts: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- affiliations: string (nullable = true)
 |-- subject_area_name: string (nullable = true)
 |-- subject_area_code: string (nullable = true)
 |-- category: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|      citation_title|           abstracts|             authors|        affiliations|   subject_area_name|subject_area_code|            category|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|Mödruvallabók, AM...|The ultimate goal...|Arkel-de Leeuw va...|                   -|                   -|                -|               Other|
|  Energy and society|                   -|                   -|                   -|                 

In [43]:
from pyspark.sql.functions import col, concat_ws
%pip install setuptools

# Replace null or missing values
df_cleaned = df.fillna({
    'citation_title': '-',
    'abstracts': '-',
    'authors': '-',
    'affiliations': '-',
    'subject_area_name' : '-',
    'subject_area_code' : '-',
    'category': '-',
})

# Combine relevant fields into a single feature
df_cleaned = df_cleaned.withColumn(
    "features_combined", 
    concat_ws("||", col("citation_title"), col("abstracts"), col("authors"), col('affiliations'), col("subject_area_name"), col("subject_area_code"))
)

# Show the combined features
df_cleaned.select("features_combined", "category").show(5, truncate=False)


# Ensure no null values in required columns
df_cleaned = df_cleaned.fillna({"features_combined": "-", "category": "-"})

Note: you may need to restart the kernel to use updated packages.
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [44]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col



# Tokenize the combined features
tokenizer = Tokenizer(inputCol="features_combined", outputCol="tokens")

# Convert tokens into numerical features
vectorizer = CountVectorizer(inputCol="tokens", outputCol="features", vocabSize=30000)

# Encode target labels into numerical format
label_indexer = StringIndexer(inputCol="category", outputCol="label")

# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline(stages=[tokenizer, vectorizer, label_indexer])

# Fit and transform the data
preprocessed_data = preprocessing_pipeline.fit(df_cleaned).transform(df_cleaned)

# Verify the transformed data
preprocessed_data.select("features", "label").show(5)



                                                                                

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(30000,[0,1,2,5,7...|  1.0|
|(30000,[2,121],[1...|  1.0|
|(30000,[0,1,2,5,7...|  5.0|
|(30000,[0,1,2,5,7...|  1.0|
|(30000,[0,1,2,5,7...|  1.0|
+--------------------+-----+
only showing top 5 rows



In [45]:
# Check if features and label columns are present
preprocessed_data.select("features", "label").show(5)

# Split the dataset into training and testing sets
train_data, test_data = preprocessed_data.randomSplit([0.7, 0.3], seed=39)

# # Sample a fraction of the data
# train_data = train_data.sample(fraction=1, seed=42)
# test_data = test_data.sample(fraction=1, seed=42)


print(f"Training Data Count: {train_data.count()}, Test Data Count: {test_data.count()}")


+--------------------+-----+
|            features|label|
+--------------------+-----+
|(30000,[0,1,2,5,7...|  1.0|
|(30000,[2,121],[1...|  1.0|
|(30000,[0,1,2,5,7...|  5.0|
|(30000,[0,1,2,5,7...|  1.0|
|(30000,[0,1,2,5,7...|  1.0|
+--------------------+-----+
only showing top 5 rows





Training Data Count: 15042, Test Data Count: 6286


                                                                                

In [46]:
from pyspark.ml.classification import LogisticRegression

# Initialize the logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=40)

# Train the model
lr_model = lr.fit(train_data)

# Evaluate the model on the test data
predictions = lr_model.transform(test_data)

# Display predictions
predictions.select("features", "label", "prediction").show(5)


24/12/10 08:54:01 WARN DAGScheduler: Broadcasting large task binary with size 162.0 MiB
[Stage 578:>                                                        (0 + 1) / 1]

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(30000,[0,1,2,5,7...| 45.0|       1.0|
|(30000,[19236],[1...|  1.0|       1.0|
|(30000,[19236],[1...|  1.0|       1.0|
|(30000,[19236],[1...|  1.0|       1.0|
|(30000,[19236],[1...|  1.0|       1.0|
+--------------------+-----+----------+
only showing top 5 rows



                                                                                

In [47]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")


24/12/10 08:54:03 WARN DAGScheduler: Broadcasting large task binary with size 162.0 MiB
                                                                                

Accuracy: 0.70
