In [17]:
%pip install pyspark


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [18]:
from pyspark.sql import SparkSession

# Initialize Spark session with increased memory
spark = SparkSession.builder \
    .appName("WARK Data Pipeline - Predict Subject Area") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()


# Load CSV data into a Spark DataFrame
data_path = "/Users/tentachita/Downloads/DSDE_Final_Project_WARK/Data_Aj/2/joined_2018-2023.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Show the data schema
df.printSchema()
df.show(5)

root
 |-- citation_title: string (nullable = true)
 |-- abstracts: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- affiliations: string (nullable = true)
 |-- classifications: string (nullable = true)
 |-- subject_area_name: string (nullable = true)
 |-- subject_area_code: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      citation_title|           abstracts|             authors|        affiliations|     classifications|   subject_area_name|   subject_area_code|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Public health and...|                   -|Pongpirul Krit; L...|Stanford Universi...|ASJC: 2700; SUBJA...|      Medicine (all)|                2700|
|Flexible Printed ...|© 2018 The Instit...|Pratumsiri Teerap...|Chulalongkorn Uni..

In [19]:
# Drop the redundant column
df = df.drop('subject_area_code')

# Show the data schema
df.printSchema()
df.show(5)


root
 |-- citation_title: string (nullable = true)
 |-- abstracts: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- affiliations: string (nullable = true)
 |-- classifications: string (nullable = true)
 |-- subject_area_name: string (nullable = true)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|      citation_title|           abstracts|             authors|        affiliations|     classifications|   subject_area_name|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Public health and...|                   -|Pongpirul Krit; L...|Stanford Universi...|ASJC: 2700; SUBJA...|      Medicine (all)|
|Flexible Printed ...|© 2018 The Instit...|Pratumsiri Teerap...|Chulalongkorn Uni...|ASJC: 2208\2504; ...|Electrical and El...|
|Parametric study ...|© 2018 Elsevier L...|Phuakpunk Kiattik...|Chulalongkorn Uni...|CPX

In [20]:
from pyspark.sql.functions import col, concat_ws

# Replace null or missing values
df_cleaned = df.fillna({
    'citation_title': '-',
    'abstracts': '-',
    'authors': '-',
    'affiliations': '-',
    'classifications': '-',
    'subject_area_name': '-',
})

# Combine relevant fields into a single feature
df_cleaned = df_cleaned.withColumn(
    "features_combined", 
    concat_ws("||", col("citation_title"), col("abstracts"), col("authors"), col("affiliations"), col("classifications"))
)

# Show the combined features
df_cleaned.select("features_combined", "subject_area_name").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Ensure no null values in required columns
df_cleaned = df_cleaned.fillna({"features_combined": "-", "subject_area_name": "-"})

# Tokenize the combined features
tokenizer = Tokenizer(inputCol="features_combined", outputCol="tokens")

# Convert tokens into numerical features
vectorizer = CountVectorizer(inputCol="tokens", outputCol="features", vocabSize=5000)

# Encode target labels into numerical format
label_indexer = StringIndexer(inputCol="subject_area_name", outputCol="label")

# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline(stages=[tokenizer, vectorizer, label_indexer])

# Fit and transform the data
preprocessed_data = preprocessing_pipeline.fit(df_cleaned).transform(df_cleaned)

# Verify the transformed data
preprocessed_data.select("features", "label").show(5)



                                                                                

+--------------------+------+
|            features| label|
+--------------------+------+
|(5000,[1,2,6,10,1...|   1.0|
|(5000,[0,1,2,3,4,...|2615.0|
|(5000,[0,1,2,3,4,...| 119.0|
|(5000,[0,1,2,3,4,...| 191.0|
|(5000,[0,1,2,3,4,...| 230.0|
+--------------------+------+
only showing top 5 rows



In [22]:
# Check if features and label columns are present
preprocessed_data.select("features", "label").show(5)

# Split the dataset into training and testing sets
train_data, test_data = preprocessed_data.randomSplit([0.7, 0.3], seed=42)

# Sample a fraction of the data
train_data = train_data.sample(fraction=0.1, seed=42)
test_data = test_data.sample(fraction=0.1, seed=42)


print(f"Training Data Count: {train_data.count()}, Test Data Count: {test_data.count()}")


+--------------------+------+
|            features| label|
+--------------------+------+
|(5000,[1,2,6,10,1...|   1.0|
|(5000,[0,1,2,3,4,...|2615.0|
|(5000,[0,1,2,3,4,...| 119.0|
|(5000,[0,1,2,3,4,...| 191.0|
|(5000,[0,1,2,3,4,...| 230.0|
+--------------------+------+
only showing top 5 rows



                                                                                

Training Data Count: 1400, Test Data Count: 580


In [23]:
from pyspark.ml.classification import LogisticRegression

# Initialize the logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=10)

# Train the model
lr_model = lr.fit(train_data)

# Evaluate the model on the test data
predictions = lr_model.transform(test_data)

# Display predictions
predictions.select("features", "label", "prediction").show(5)


24/12/08 02:28:15 WARN DAGScheduler: Broadcasting large task binary with size 131.6 MiB
[Stage 95:>                                                         (0 + 1) / 1]

+--------------------+------+----------+
|            features| label|prediction|
+--------------------+------+----------+
|(5000,[0,1,2,3,4,...|  24.0|       1.0|
|(5000,[0,1,2,3,4,...|1860.0|       3.0|
|(5000,[0,1,2,3,4,...|  51.0|      28.0|
|(5000,[0,1,2,3,4,...| 675.0|      13.0|
|(5000,[0,1,2,3,4,...|   1.0|       1.0|
+--------------------+------+----------+
only showing top 5 rows



                                                                                

In [24]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.2f}")


24/12/08 02:28:16 WARN DAGScheduler: Broadcasting large task binary with size 131.6 MiB

Accuracy: 0.18


                                                                                

In [35]:
from pyspark.ml.classification import LogisticRegressionModel

# Ensure lr_model is an instance of LogisticRegressionModel
assert isinstance(lr_model, LogisticRegressionModel), "lr_model is not a LogisticRegressionModel!"

# Save the trained model with overwrite
lr_model.write().overwrite().save("logistic_regression_model")
print("Logistic Regression model successfully saved.")

# Save the preprocessing pipeline with overwrite
preprocessing_pipeline.write().overwrite().save("preprocessing_pipeline")
print("Preprocessing pipeline successfully saved.")



24/12/08 02:39:46 WARN TaskSetManager: Stage 125 contains a task of very large size (136858 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Logistic Regression model successfully saved.
Preprocessing pipeline successfully saved.


In [36]:
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel

In [40]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, CountVectorizer, StringIndexer
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

# Example: Define your data (this should be your actual data)
training_data = spark.createDataFrame([{
    "citation_title": "Research in AI",
    "abstracts": "Deep learning advances in AI...",
    "authors": "Author A; Author B",
    "affiliations": "University of XYZ",
    "classifications": "ASJC: 1700; SUBJABBR: COMP",
    "subject_area_name": "AI"
}])

# Define your feature transformations and pipeline
tokenizer = Tokenizer(inputCol="abstracts", outputCol="words")
vectorizer = CountVectorizer(inputCol="words", outputCol="features")
indexer = StringIndexer(inputCol="subject_area_name", outputCol="label")

# Create the pipeline
preprocessing_pipeline = Pipeline(stages=[tokenizer, vectorizer, indexer])

# Fit the pipeline to your data (this is the model that should be saved)
trained_pipeline_model = preprocessing_pipeline.fit(training_data)

# Save the trained pipeline model with overwrite
trained_pipeline_model.write().overwrite().save("preprocessing_pipeline")

# Use the pipeline to transform the data into the format expected by logistic regression
training_data_transformed = trained_pipeline_model.transform(training_data)

# Now, train your logistic regression model on the transformed data
lr_model = LogisticRegression(featuresCol="features", labelCol="label")
lr_model_trained = lr_model.fit(training_data_transformed)

# Save the trained logistic regression model with overwrite
lr_model_trained.write().overwrite().save("logistic_regression_model")

# --- Later, when you want to load and use the saved models for new data ---

# Load the trained preprocessing pipeline model
loaded_pipeline = PipelineModel.load("preprocessing_pipeline")

# Load the trained logistic regression model
loaded_model = LogisticRegressionModel.load("logistic_regression_model")

# Process new data
new_data = spark.createDataFrame([{
    "citation_title": "New Research in AI",
    "abstracts": "Deep learning advances...",
    "authors": "Author A; Author B",
    "affiliations": "University of XYZ",
    "classifications": "ASJC: 1700; SUBJABBR: COMP",
    "subject_area_name": "Unknown"
}])

# Transform new data using the loaded preprocessing pipeline
new_data_cleaned = loaded_pipeline.transform(new_data)

# Get predictions using the loaded model
new_predictions = loaded_model.transform(new_data_cleaned)

# Show the predictions
new_predictions.select("features", "prediction").show()


24/12/08 02:54:39 WARN Instrumentation: [be91165c] All labels are the same value and fitIntercept=true, so the coefficients will be zeros. Training is not needed.


+-------------------+----------+
|           features|prediction|
+-------------------+----------+
|(5,[2,3],[1.0,1.0])|       0.0|
+-------------------+----------+

