Skip to content
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
89 lines (70 sloc) 2.92 KB
package nlp
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.util.Benchmark
import org.apache.spark.sql.SparkSession
// Implementation of the sentiment detection algorithm (the rest of this original comment is cut off in this listing).
// Runnable object (extends App) that builds a pipeline from DocumentAssembler, Tokenizer,
// Normalizer, ViveknSentimentApproach and Finisher stages, then times annotation of two
// sample sentences with Benchmark.time.
// NOTE(review): this listing appears truncated — several statements below are visibly
// incomplete (the SparkSession chain, the unclosed `Array(`, `val sparkPipeline =` with no
// right-hand side, missing closing braces). Confirm against the complete file before editing code.
object TrainViveknSentiment extends App {
// Create the Spark session used to run the model
val spark: SparkSession = SparkSession
.config("spark.driver.memory", "4G")
import spark.implicits._
// Training dataset, labelled with positive and negative sentiments
val training = Seq(
("I really liked this movie!", "positive"),
("The cast was horrible", "negative"),
("Never going to watch this again or recommend it to anyone", "negative"),
("It's a waste of time", "negative"),
("I loved the protagonist", "positive"),
("The music was really really good", "positive")
).toDS.toDF("train_text", "train_sentiment")
// Test data
val testing = Array(
"I don't recommend this movie, it's horrible",
"Dont waste your time!!!"
// Preparation of the test dataset
val document = new DocumentAssembler()
// Tokenization
val token = new Tokenizer()
// Normalization
val normalizer = new Normalizer()
// Plug in the Vivekn sentiment algorithm; it reads the "document" and "normal" columns
val vivekn = new ViveknSentimentApproach()
.setInputCols("document", "normal")
// Set the output columns of the algorithm
val finisher = new Finisher()
// Build the pipeline from the stages, in execution order
val pipeline = new Pipeline().setStages(Array(document, token, normalizer, vivekn, finisher))
// Train (fit) the pipeline
// NOTE(review): the right-hand side of `sparkPipeline` is not visible in this listing — presumably a fit on `training`; verify
val sparkPipeline =
val lightPipeline = new LightPipeline(sparkPipeline)
Benchmark.time("Light pipeline quick annotation") { lightPipeline.annotate(testing) }
// Output of the results
Benchmark.time("Spark pipeline, this may be too much for just two rows!") {
val testingDS = testing.toSeq.toDS.toDF("testing_text")
println("Updating DocumentAssembler input column")
You can’t perform that action at this time.