In [1]:
%pip install pyspark


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pyspark.sql import SparkSession

# Initialize Spark session with increased memory
spark = SparkSession.builder \
    .appName("WARK Data Pipeline - Predict Subject Area") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()


# Load CSV data into a Spark DataFrame
data_path = "/Users/tentachita/Downloads/DSDE_Final_Project_WARK/Data_Aj/2/joined_2018-2023.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Show the data schema
df.printSchema()
df.show(5)

24/12/08 00:33:02 WARN Utils: Your hostname, Tents-MacBook.local resolves to a loopback address: 127.0.0.1; using 10.201.242.183 instead (on interface en0)
24/12/08 00:33:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/08 00:33:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- citation_title: string (nullable = true)
 |-- abstracts: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- affiliations: string (nullable = true)
 |-- classifications: string (nullable = true)
 |-- subject_area_name: string (nullable = true)
 |-- subject_area_code: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      citation_title|           abstracts|             authors|        affiliations|     classifications|   subject_area_name|   subject_area_code|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Public health and...|                   -|Pongpirul Krit; L...|Stanford Universi...|ASJC: 2700; SUBJA...|      Medicine (all)|                2700|
|Flexible Printed ...|© 2018 The Instit...|Pratumsiri Teerap...|Chulalongkorn Uni..

In [3]:
# Drop the redundant column
df = df.drop('subject_area_code')

# Show the data schema
df.printSchema()
df.show(5)


root
 |-- citation_title: string (nullable = true)
 |-- abstracts: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- affiliations: string (nullable = true)
 |-- classifications: string (nullable = true)
 |-- subject_area_name: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      citation_title|           abstracts|             authors|        affiliations|     classifications|   subject_area_name|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Public health and...|                   -|Pongpirul Krit; L...|Stanford Universi...|ASJC: 2700; SUBJA...|      Medicine (all)|
|Flexible Printed ...|© 2018 The Instit...|Pratumsiri Teerap...|Chulalongkorn Uni...|ASJC: 2208\2504; ...|Electrical and El...|
|Parametric study ...|© 2018 Elsevier L...|Phuakpunk Kiattik...|Chulalongkorn Uni...|CPX

In [4]:
from pyspark.sql.functions import col, concat_ws

# Replace null or missing values
df_cleaned = df.fillna({
    'citation_title': '-',
    'abstracts': '-',
    'authors': '-',
    'affiliations': '-',
    'classifications': '-',
    'subject_area_name': '-',
})

# Combine relevant fields into a single feature
df_cleaned = df_cleaned.withColumn(
    "features_combined", 
    concat_ws("||", col("citation_title"), col("abstracts"), col("authors"), col("affiliations"), col("classifications"))
)

# Show the combined features
df_cleaned.select("features_combined", "subject_area_name").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Ensure no null values in required columns
df_cleaned = df_cleaned.fillna({"features_combined": "-", "subject_area_name": "-"})

# Tokenize the combined features
tokenizer = Tokenizer(inputCol="features_combined", outputCol="tokens")

# Convert tokens into numerical features
vectorizer = CountVectorizer(inputCol="tokens", outputCol="features", vocabSize=5000)

# Encode target labels into numerical format
label_indexer = StringIndexer(inputCol="subject_area_name", outputCol="label")

# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline(stages=[tokenizer, vectorizer, label_indexer])

# Fit and transform the data
preprocessed_data = preprocessing_pipeline.fit(df_cleaned).transform(df_cleaned)

# Verify the transformed data
preprocessed_data.select("features", "label").show(5)



                                                                                

+--------------------+------+
|            features| label|
+--------------------+------+
|(5000,[1,2,6,10,1...|   1.0|
|(5000,[0,1,2,3,4,...|2615.0|
|(5000,[0,1,2,3,4,...| 119.0|
|(5000,[0,1,2,3,4,...| 191.0|
|(5000,[0,1,2,3,4,...| 230.0|
+--------------------+------+
only showing top 5 rows



In [6]:
# Check if features and label columns are present
preprocessed_data.select("features", "label").show(5)

# Split the dataset into training and testing sets
train_data, test_data = preprocessed_data.randomSplit([0.7, 0.3], seed=42)

# Sample a fraction of the data
train_data = train_data.sample(fraction=0.1, seed=42)
test_data = test_data.sample(fraction=0.1, seed=42)


print(f"Training Data Count: {train_data.count()}, Test Data Count: {test_data.count()}")


+--------------------+------+
|            features| label|
+--------------------+------+
|(5000,[1,2,6,10,1...|   1.0|
|(5000,[0,1,2,3,4,...|2615.0|
|(5000,[0,1,2,3,4,...| 119.0|
|(5000,[0,1,2,3,4,...| 191.0|
|(5000,[0,1,2,3,4,...| 230.0|
+--------------------+------+
only showing top 5 rows





Training Data Count: 1400, Test Data Count: 580


                                                                                

In [7]:
from pyspark.ml.classification import LogisticRegression

# Initialize the logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Train the model
lr_model = lr.fit(train_data)

# Evaluate the model on the test data
predictions = lr_model.transform(test_data)

# Display predictions
predictions.select("features", "label", "prediction").show(5)


24/12/08 00:33:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/08 00:33:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/12/08 00:33:22 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/12/08 00:34:07 WARN DAGScheduler: Broadcasting large task binary with size 131.6 MiB
[Stage 46:>                                                         (0 + 1) / 1]

+--------------------+------+----------+
|            features| label|prediction|
+--------------------+------+----------+
|(5000,[0,1,2,3,4,...|  24.0|       1.0|
|(5000,[0,1,2,3,4,...|1860.0|       3.0|
|(5000,[0,1,2,3,4,...|  51.0|      28.0|
|(5000,[0,1,2,3,4,...| 675.0|      13.0|
|(5000,[0,1,2,3,4,...|   1.0|       1.0|
+--------------------+------+----------+
only showing top 5 rows



                                                                                

In [8]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")


24/12/08 00:34:09 WARN DAGScheduler: Broadcasting large task binary with size 131.6 MiB

Accuracy: 0.18


                                                                                

In [19]:
#แตกกกกกกกกก here 
# Ensure preprocessing_pipeline is defined earlier
# Save the trained model
lr_model.save("logistic_regression_model")

# Save the preprocessing pipeline
preprocessing_pipeline.save("preprocessing_pipeline")


IllegalArgumentException: Output column tokens already exists.

In [16]:
#อันนี้คือเอามา check
import shutil
import os

model_path = "logistic_regression_model"
pipeline_path = "preprocessing_pipeline"


for path in [model_path, pipeline_path]:
    if os.path.exists(path):
        shutil.rmtree(path)  


try:
    lr_model.save(model_path)
    print(f"success {model_path}")
except Exception as e:
    print(f"fail: {e}")


try:
    preprocessing_pipeline.save(pipeline_path)
    print(f"success {pipeline_path}")
except Exception as e:
    print(f"fail Pipeline: {e}")


24/12/08 00:54:12 WARN TaskSetManager: Stage 62 contains a task of very large size (136858 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

success logistic_regression_model
success preprocessing_pipeline


In [13]:
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel

In [15]:


# Load the pipeline and model
loaded_pipeline = PipelineModel.load("preprocessing_pipeline")
loaded_model = LogisticRegressionModel.load("logistic_regression_model")

# Process new data
new_data = spark.createDataFrame([{
    "citation_title": "New Research in AI",
    "abstracts": "Deep learning advances...",
    "authors": "Author A; Author B",
    "affiliations": "University of XYZ",
    "classifications": "ASJC: 1700; SUBJABBR: COMP",
    "subject_area_name": "Unknown"
}])

new_data_cleaned = loaded_pipeline.transform(new_data)
new_predictions = loaded_model.transform(new_data_cleaned)
new_predictions.select("features", "prediction").show()


24/12/08 00:52:56 ERROR Instrumentation: java.lang.IllegalArgumentException: requirement failed: Error loading metadata: Expected class name org.apache.spark.ml.PipelineModel but found class name org.apache.spark.ml.Pipeline
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.util.DefaultParamsReader$.parseMetadata(ReadWrite.scala:610)
	at org.apache.spark.ml.util.DefaultParamsReader$.loadMetadata(ReadWrite.scala:588)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.$anonfun$load$3(Pipeline.scala:269)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.Pipeline$SharedReadWrite$.load(Pipeline.scala:268)
	at org.apache.spark.ml.PipelineModel$PipelineModelReader.$anonfun$load$7(Pipeline.scala:356)
	at org.apache.spark.ml.MLEvents.withLoadInstanceEvent(events.scala:160)
	at org.ap

IllegalArgumentException: requirement failed: Error loading metadata: Expected class name org.apache.spark.ml.PipelineModel but found class name org.apache.spark.ml.Pipeline