In [None]:
import torch

# DF stuff
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

# ML libs
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from transformers import AutoTokenizer, AutoModel

from classifiers import SENTIMENTS_AS_INDEX


# Get the active Spark session, or start one on a local master ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Load PhoBERT

In [None]:
# Run PhoBERT on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# use_fast=False requests the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2", use_fast=False)

model = AutoModel.from_pretrained("vinai/phobert-base-v2")
model = model.to(device)

# Put the module in evaluation mode. Kept as the cell's last expression, so the
# module repr is still displayed as the cell output.
model.eval()

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

# Load and transform data

In [None]:
# Parquet locations of the training and test sets on HDFS.
train_path = "hdfs://namenode:9000/training_data/train_set"
test_path = "hdfs://namenode:9000/training_data/test_set"

train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

# Each count() is a Spark action over its dataset.
print(f"Train size: {train_df.count()}, Test size: {test_df.count()}")

In [None]:
@udf(returnType=VectorUDT())
def create_embedding(text):
    """
    Spark UDF that embeds a piece of text with
        [PhoBERT](https://github.com/VinAIResearch/PhoBERT).

    The text is tokenized (truncated to at most 256 tokens), passed through the
    model with gradient tracking disabled, and the last-layer hidden state of
    the first token (the `<s>` / `[CLS]` token) is returned.

        :param str text: Text to create embeddings from
        :return: Dense vector with the first token's hidden state
            (768 values for PhoBERT-base)
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    ids = encoded["input_ids"].to(device)
    mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        hidden_states = model(ids, mask).last_hidden_state

    # Batch item 0, token 0; moved to the CPU before the NumPy conversion.
    first_token_state = hidden_states[0, 0, :].cpu().numpy()
    return Vectors.dense(first_token_state)

@udf(returnType=IntegerType())
def sent_to_idx(sentiment):
    """
    Spark UDF that looks up the index of a sentiment in `SENTIMENTS_AS_INDEX`.

        :param sentiment: Key to look up (presumably the sentiment name as a
            string -- confirm against `classifiers.SENTIMENTS_AS_INDEX`)
        :return: The value stored under `sentiment`
    """
    label_index = SENTIMENTS_AS_INDEX[sentiment]
    return label_index

In [None]:
# Add the two columns the classifier consumes:
#   * 'features'  -- DenseVector embedding of 'review' (create_embedding)
#   * 'label_idx' -- integer index of 'sentiment' (sent_to_idx)
#
# The training frame is cached because CrossValidator reads it repeatedly
# (for every fold and for the final refit). Without a cache, each of those
# reads would re-run the row-by-row PhoBERT UDF over the reviews. cache() is
# lazy: the data is materialised by the first action that touches it.
train_df = (
    train_df
    .withColumn('features', create_embedding(col('review')))
    .withColumn('label_idx', sent_to_idx(col('sentiment')))
    .cache()
)

# The test frame is left uncached here; cache it too if it will be scored
# more than once.
test_df = (
    test_df
    .withColumn('features', create_embedding(col('review')))
    .withColumn('label_idx', sent_to_idx(col('sentiment')))
)

# Model Training

In [None]:
# Seed for the random fold assignment done by CrossValidator. Without an
# explicit seed, the split (and therefore possibly the selected
# hyperparameters) is not guaranteed to repeat between sessions.
CV_SEED = 42

# Logistic regression on the embedding vectors, predicting the indexed sentiment.
lr = LogisticRegression(featuresCol="features", labelCol="label_idx")

# define param grid for grid search (3 x 3 x 2 = 18 combinations)
paramGrid = (ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lr.regParam, [0.01, 0.1, 1.0])
    .addGrid(lr.maxIter, [10, 20])
    .build()
)

# Accuracy is the metric used to rank the grid's combinations.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label_idx",
    predictionCol="prediction",
    metricName="accuracy"
)

# 3-fold cross-validation over the grid above.
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    seed=CV_SEED
)

In [None]:
# Fit the cross-validator on the training frame (grid search over paramGrid with numFolds=3).
cv_model = cv.fit(train_df)

In [None]:
# Take the winning model from cross-validation and read back its parameters.
best_model = cv_model.bestModel
best_params = best_model.extractParamMap()

# Report only the hyperparameters that were part of the grid search.
tuned_names = {"regParam", "maxIter", "elasticNetParam"}

print("Best hyperparameter configuration:")
for param, value in best_params.items():
    if param.name not in tuned_names:
        continue
    print(f"  {param.name}: {value}")

Best hyperparameter configuration:
  elasticNetParam: 0.0
  maxIter: 10
  regParam: 0.1


In [None]:
# Score the test set with the best cross-validated model.
predictions = best_model.transform(test_df)
# The evaluator was configured with metricName="accuracy", so this is the test accuracy.
accuracy = evaluator.evaluate(predictions)
print(f"Test accuracy: {accuracy:.4f}")

Test accuracy: 0.9316


# Save model

In [None]:
# Persist the best model. write().overwrite() is used instead of save() so
# that re-running the notebook replaces a previously saved model rather than
# failing because the target path already exists.
best_model.write().overwrite().save('work/models/lr_sentiment_model')