In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml.feature import NGram

from pyspark.ml import Pipeline, PipelineModel


In [2]:
# Create (or reuse) the Spark session named "Spark-ML", then derive the
# SparkContext and an SQLContext handle from it.
spark = (
    SparkSession.builder
    .appName("Spark-ML")
    .getOrCreate()
)
sparkContext = spark.sparkContext
sqlContext = SQLContext(sparkContext)

In [5]:
# Read via the SparkSession DataFrame reader: the file is parsed as
# tab-delimited CSV and its two columns are named "spam" and "message".
location = "data/SMSSpamCollection"
tsv_reader = spark.read.option("delimiter", "\t")
raw_df = tsv_reader.csv(location).toDF("spam", "message")

# Print the first five rows without truncating long messages.
raw_df.show(5, truncate=False)

+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|spam|message                                                                                                                                                    |
+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham |Ok lar... Joking wif u oni...                                                                                                                              |
|spam|Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
|ham |U dun say so ear

In [10]:
# Read via the SparkContext / RDD API and convert the result to a DataFrame.

from pyspark.sql import Row


def _parse_sms_line(line):
    """Turn one raw text line into ``Row(spam, message)``.

    The line is split on the FIRST tab only, so a message that itself contains
    a tab is kept whole. Returns ``None`` for a line without any tab (for
    example a blank line) so the caller can skip it instead of failing the
    whole Spark job with an IndexError.
    """
    label, tab, message = line.partition("\t")
    if not tab:
        return None
    return Row(spam=label.strip(), message=message.strip())


raw = (
    sparkContext.textFile(location)
    .map(_parse_sms_line)
    # Drop the malformed lines flagged by the parser.
    .filter(lambda row: row is not None)
)

raw_df = spark.createDataFrame(raw)
raw_df.show(5, False)

+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|spam|message                                                                                                                                                    |
+----+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|ham |Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            |
|ham |Ok lar... Joking wif u oni...                                                                                                                              |
|spam|Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's|
|ham |U dun say so ear

### Все слова из SMS

In [13]:
# Tokenizer stage: reads the "message" column and writes the token list to "words".
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")

# The tokenised frame must actually be built here: the stop-word cell further
# down reads `transformed`, and with this line commented out a fresh
# "Restart & Run All" stops there with a NameError.
transformed = tokenizer.transform(raw_df)
transformed.show(3)

### Удалим стоп-слова

In [15]:
# Stop-word filter using the default word list: reads "words", writes "filtered".
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
# Applying it is left disabled in this cell:
# cleaned = remover.transform(transformed)
# cleaned.show(3)

In [16]:
# Peek at a slice (positions 115-119) of the default stop-word list.
default_stopwords = StopWordsRemover().getStopWords()
default_stopwords[115:120]

['so', 'than', 'too', 'very', 's']

### Добавим «-» в список стоп-слов

In [23]:
# Repeat the stop-word removal, this time with the default list extended by "-".
stopwords = [*StopWordsRemover().getStopWords(), "-"]
remover = StopWordsRemover(stopWords=stopwords, inputCol="words", outputCol="filtered")

# Filter the tokenised frame and preview the result.
cleaned = remover.transform(transformed)
cleaned.show(n=3)

+----+--------------------+--------------------+--------------------+
|spam|             message|               words|            filtered|
+----+--------------------+--------------------+--------------------+
| ham|Go until jurong p...|[go, until, juron...|[go, jurong, poin...|
| ham|Ok lar... Joking ...|[ok, lar..., joki...|[ok, lar..., joki...|
|spam|Free entry in 2 a...|[free, entry, in,...|[free, entry, 2, ...|
+----+--------------------+--------------------+--------------------+
only showing top 3 rows



### Преобразование текста SMS в признаки с помощью CountVectorizer


In [24]:
# Feature generation for the model: fit a bag-of-words vocabulary on the
# filtered tokens and write the count vectors to "features".
count_vect_model = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = count_vect_model.transform(cleaned)

# Convert spam / ham into a numeric "label" column.
# (Question for the reader: how should one-hot encoding be done properly? Which options do you know?)
indexer = StringIndexer().setInputCol("spam").setOutputCol("label").fit(featured)

# The indexed frame must actually be built here: the train/test split and the
# n-gram cell both read `indexed`, and with this line commented out a fresh
# "Restart & Run All" stops at the split with a NameError.
indexed = indexer.transform(featured)
indexed.show(3)

### Разделяем на train / test

А какие способы разделения Вы ещё знаете?

Какая между ними разница?


In [25]:
# Random 70/30 split of the indexed frame, seeded with 28.
train, test = indexed.randomSplit(weights=[0.7, 0.3], seed=28)
train.show(n=3)

+----+--------------------+--------------------+--------------------+--------------------+-----+
|spam|             message|               words|            filtered|            features|label|
+----+--------------------+--------------------+--------------------+--------------------+-----+
| ham|"ALRITE HUNNY!WOT...|["alrite, hunny!w...|["alrite, hunny!w...|(13463,[0,2,65,20...|  0.0|
| ham|"CAN I PLEASE COM...|["can, i, please,...|["can, please, co...|(13463,[8,14,42,6...|  0.0|
| ham|"Getting tickets ...|["getting, ticket...|["getting, ticket...|(13463,[2,8,18,30...|  0.0|
+----+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 3 rows



### Logistic Regression

Почему этот вариант?

Как выглядит ЛогРег?

Что ещё можно измерять с помощью этого алгоритма (обратите внимание на кривую), или он подходит только для бинарной классификации?

In [31]:
# Configure the estimator: 10 iterations, regParam=0.3, elasticNetParam=0.8.
# NOTE(review): the recorded output below shows the same rawPrediction on every
# row, which suggests this penalty shrinks all coefficients to zero; consider a
# smaller regParam. TODO confirm.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Train on the training split.
lrModel = lr.fit(train)

# Predict on the test split and preview the result.
predictions = lrModel.transform(test)
predictions.show(n=5)

# predictions.select("features", "label", "prediction").show(5)

# Quality evaluation (left disabled in this cell).
# Question for the reader: what is ROC AUC, and how is the curve constructed?
# NOTE(review): if re-enabled, rawPredictionCol should be "rawPrediction";
# the hard 0/1 "prediction" column gives a ROC with a single threshold.
# evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
# accuracy = evaluator.evaluate(predictions)
# print("roc_auc ", accuracy)

+----+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|spam|             message|               words|            filtered|            features|label|       rawPrediction|         probability|prediction|
+----+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
| ham|"AH POOR BABY!HOP...|["ah, poor, baby!...|["ah, poor, baby!...|(13463,[0,2,7,66,...|  0.0|[1.85777096161441...|[0.86503692386045...|       0.0|
| ham|"Are you comingdo...|["are, you, comin...|["are, comingdown...|(13463,[3585,1043...|  0.0|[1.85777096161441...|[0.86503692386045...|       0.0|
| ham|"Aww you must be ...|["aww, you, must,...|["aww, must, near...|(13463,[301,2849,...|  0.0|[1.85777096161441...|[0.86503692386045...|       0.0|
| ham|"BOO BABE! U ENJO...|["boo, babe!, u, ...|["boo, babe!, u, ...|(13463,[0,2,44,67...|  0.0|[1.8

### Random Forest


In [33]:
# Random forest with 10 trees on the bag-of-words features.
# Question for the reader: how does a random forest differ from boosting over decision trees?
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
model = rf.fit(train)
predictions = model.transform(test)

# Evaluation is left disabled in this cell.
# NOTE(review): if re-enabled, rawPredictionCol should be "rawPrediction".
# evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
# accuracy = evaluator.evaluate(predictions)
# print("roc_auc ", accuracy)

### N-граммы из слов

In [38]:
# Build word bigrams (n=2) from the filtered tokens so they can be added to the model.
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
ngrams_df = ngram.transform(indexed)

# Vectorise again, this time over the bigrams, into a separate "features2" column.
bigram_vectorizer = CountVectorizer(inputCol="ngrams", outputCol="features2")
count_vect_model = bigram_vectorizer.fit(ngrams_df)
featured = count_vect_model.transform(ngrams_df)

# Re-split with the same weights and seed, then compare the two feature columns.
train, test = featured.randomSplit(weights=[0.7, 0.3], seed=28)
train.select("ngrams", "features", "features2").show(n=3)

+--------------------+--------------------+--------------------+
|              ngrams|            features|           features2|
+--------------------+--------------------+--------------------+
|["alrite hunny!wo...|(13463,[0,2,65,20...|(36628,[73,4554,6...|
|["can please, ple...|(13463,[8,14,42,6...|(36628,[5279,6510...|
|["getting tickets...|(13463,[2,8,18,30...|(36628,[121,1565,...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [45]:
# Random forest with 10 trees, trained on the bigram count vectors ("features2").
rf = RandomForestClassifier(labelCol="label", featuresCol="features2", numTrees=10)
model = rf.fit(train)
predictions = model.transform(test)

# Evaluation is left disabled in this cell.
# NOTE(review): if re-enabled, rawPredictionCol should be "rawPrediction".
# evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
# accuracy = evaluator.evaluate(predictions)
# print("roc_auc ", accuracy)

### Data pipeline

In [46]:
# Tokenise, then remove stop words (default list extended with "-").
tokenizer = Tokenizer(inputCol="message", outputCol="words")
stopwords = [*StopWordsRemover().getStopWords(), "-"]
remover = StopWordsRemover(stopWords=stopwords, inputCol="words", outputCol="filtered")

# Vectorise the filtered tokens (unfitted estimator; it is fitted inside the pipeline).
cvmodel = CountVectorizer(inputCol="filtered", outputCol="features")

# Index the spam/ham column into a numeric "label".
indexer = StringIndexer(inputCol="spam", outputCol="label")

# Logistic regression with the same settings as the stand-alone cell.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

#
# Pipeline
#
# Assemble the stages in execution order.
# Question for the reader: why do we build a pipeline?
stages = [tokenizer, remover, cvmodel, indexer, lr]
pipeline = Pipeline(stages=stages)

# Train: runs every stage from first to last on raw_df.
model = pipeline.fit(raw_df)

# Persist the fitted pipeline, replacing any previous save at this path.
# Question for the reader: why save the whole pipeline rather than only the model?
model.write().overwrite().save("model/spam_model1")

In [47]:
# Load the fitted pipeline back from disk; `pipeline` now refers to the
# restored PipelineModel.
saved_model_path = "model/spam_model1"
pipeline = PipelineModel.load(saved_model_path)

In [50]:
# Reproduce the result from the saved pipeline: score raw_df with the reloaded model.
# Note: raw_df is also the frame the pipeline was fitted on, so this is an
# in-sample score rather than a held-out estimate.
lr_predictions = pipeline.transform(raw_df)

# The ROC curve is built from continuous scores, so the evaluator reads
# "rawPrediction"; the hard 0/1 "prediction" column gives only one threshold.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)

# Evaluate the pipeline's own output. The previous code passed `predictions`,
# the frame left over from the random-forest cell, so the saved model was
# never actually measured.
# Despite its name, `accuracy` holds the area under the ROC curve.
accuracy = evaluator.evaluate(lr_predictions)
print("roc_auc ", accuracy)

roc_auc  0.5
