In [2]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import LDA

In [3]:
spark = SparkSession.builder.appName("spark-ml-kmeans").getOrCreate()

sparkContext = spark.sparkContext
sqlContext = SQLContext(sparkContext)

In [15]:
# чтобы все spark работали одинаково (версии 2 и 3)
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

DataFrame[key: string, value: string]

### Создадим DataFrame

In [4]:
!ls

 0.1_into.ipynb			        Advertising.csv
 0_workshop_nba.ipynb		       'Online Retail.csv'
 1_rfm-empty.ipynb		        SMSSpamCollection
 1_rfm.ipynb			        add_1_pandas_udf.ipynb
 1_simple_binar_model.ipynb	       'add_2_pandas_udf .ipynb'
 2_all_data_manipulation_in_one.ipynb   banks.csv
 3_Regression.ipynb		        mails
 4_Classification.ipynb		        nbagames_short.json
 4_Classification_empty.ipynb	        pc.png


In [15]:
# 1 - читаем файл из файла (регуляркой)
# 2 - извлекаем первые 5 строк
# 3 - делаем DF с разделителем # 
# 4 - сохраняем в файл

In [16]:
!sed -i -e 's/\r//g' mails/* | head -q -n 5 mails/* | paste - - - - - -d "#"> output.csv

### Топ 5 самых активных пользователей за неделю

In [30]:
# неделя корректно извлечена
df1 = df.withColumn("week", F.weekofyear(F.unix_timestamp(df.date, "EEE, dd MMM yyyy HH").cast("timestamp")))
df1.show(3)

+--------------------+-------------------+--------------------+--------------------+---------------+----+
|           messageid|               date|               from_|                 to_|        subject|week|
+--------------------+-------------------+--------------------+--------------------+---------------+----+
|Message-ID: <1705...| Mon, 6 Mar 2000 06|mike.carson@enron...|tara.sweitzer@enr...|  ENRON On_Line|  10|
|Message-ID: <1858...|Tue, 28 Mar 2000 02|mike.carson@enron...|   m_besch@yahoo.com|     Re: Moving|  13|
|Message-ID: <4934...| Fri, 1 Dec 2000 04|james.derrick@enr...|nbrazzil@mail.law...|Re: san antonio|  48|
+--------------------+-------------------+--------------------+--------------------+---------------+----+
only showing top 3 rows



In [31]:
# максимальное значение по неделе
maxweek = df1.agg(F.max(df1.week)).first()[0]
print(maxweek)
df1.agg(F.max(df1.week)).collect()

48


[Row(max(week)=48)]

In [32]:
df1.groupBy("from_").count().show()

+--------------------+-----+
|               from_|count|
+--------------------+-----+
|james.derrick@enr...|    9|
|mike.carson@enron...|    9|
+--------------------+-----+



In [33]:
df1.groupBy("from_").count().withColumn("avgcount", F.col("count") / maxweek).sort(F.col("avgcount").desc()).show()

+--------------------+-----+--------+
|               from_|count|avgcount|
+--------------------+-----+--------+
|mike.carson@enron...|    9|  0.1875|
|james.derrick@enr...|    9|  0.1875|
+--------------------+-----+--------+



### Топ ключевых слов в темах письма

• для топ 5 самых активных пользователей

• для малоактивных пользователей

In [35]:
# создаем токены из предложений
tokenizer = Tokenizer().setInputCol("subject").setOutputCol("words")
# трансформируем DF
transformed = tokenizer.transform(df1)

In [39]:
# группируем, считаем количество, сортируем по убыванию
top  = df1.groupby("from_").count().sort(F.col("count").desc()).take(10)
top

[Row(from_='james.derrick@enron.com', count=9),
 Row(from_='mike.carson@enron.com', count=9)]

In [40]:
# convert Row(key1=val1, key2 = val2) to Dictionary form {key1:val1, key2:val2}
print(top[0].asDict())

{'from_': 'james.derrick@enron.com', 'count': 9}


In [41]:
top_users = [v.asDict()["from_"] for v in top]
top_users

['james.derrick@enron.com', 'mike.carson@enron.com']

In [45]:
# сделаем фильтр, где будут только топовые пользователей 
topuserdata = transformed.filter(transformed.subject != "").filter(transformed.from_.isin(top_users))
print(topuserdata.count())

17


In [46]:
topuserdata.show(3)

+--------------------+-------------------+--------------------+--------------------+---------------+----+-------------------+
|           messageid|               date|               from_|                 to_|        subject|week|              words|
+--------------------+-------------------+--------------------+--------------------+---------------+----+-------------------+
|Message-ID: <1705...| Mon, 6 Mar 2000 06|mike.carson@enron...|tara.sweitzer@enr...|  ENRON On_Line|  10|   [enron, on_line]|
|Message-ID: <1858...|Tue, 28 Mar 2000 02|mike.carson@enron...|   m_besch@yahoo.com|     Re: Moving|  13|      [re:, moving]|
|Message-ID: <4934...| Fri, 1 Dec 2000 04|james.derrick@enr...|nbrazzil@mail.law...|Re: san antonio|  48|[re:, san, antonio]|
+--------------------+-------------------+--------------------+--------------------+---------------+----+-------------------+
only showing top 3 rows



In [48]:
# созадим DF из топовых слов
# колонки: слово / количество 
topuserdata.withColumn("keyword", F.explode("words")).groupBy("keyword").count().sort(F.col("count").desc()).show(20)

+------------+-----+
|     keyword|count|
+------------+-----+
|         re:|   13|
|          st|    3|
|           -|    3|
|    patricks|    3|
|    projects|    2|
|     counsel|    2|
|         and|    2|
|        eecc|    2|
|         i'm|    2|
|      oregon|    2|
|       lunch|    2|
|         san|    2|
|     outside|    2|
|construction|    2|
|       back!|    2|
|     antonio|    2|
|         bio|    1|
|    personal|    1|
|     on_line|    1|
|          me|    1|
+------------+-----+
only showing top 20 rows



In [50]:
# а что ещё осталось?
otheruserdata = transformed.filter(transformed.subject != "").filter(transformed.from_.isin(top_users) == False)
otheruserdata.withColumn("keyword",F.explode("words")).groupBy("keyword").count().sort(F.col("count").desc()).show(20)

+-------+-----+
|keyword|count|
+-------+-----+
+-------+-----+



### Топ 10 слов без стоп-слов

In [54]:
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned.filter(cleaned.subject != "").withColumn("keyword",F.explode(cleaned.words)).groupBy("keyword").count().sort(F.col("count").desc()).show()

+------------+-----+
|     keyword|count|
+------------+-----+
|         re:|   13|
|           -|    3|
|          st|    3|
|    patricks|    3|
|    projects|    2|
|         i'm|    2|
|     outside|    2|
|        eecc|    2|
|     antonio|    2|
|       back!|    2|
|      oregon|    2|
|       lunch|    2|
|         san|    2|
|construction|    2|
|         and|    2|
|     counsel|    2|
|     on_line|    1|
|    personal|    1|
|          30|    1|
|     morning|    1|
+------------+-----+
only showing top 20 rows



### Сделаем расширенный набор стоп-слов

In [55]:
stopWords = StopWordsRemover().getStopWords() + ["-", "re:", "fw:"]
remover = StopWordsRemover().setStopWords(stopWords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)

cleaned.filter(cleaned.subject != "").withColumn("keyword",F.explode(cleaned.words)).groupBy("filtered").count().sort(F.col("count").desc()).show()

+--------------------+-----+
|            filtered|count|
+--------------------+-----+
|[outside, counsel...|   18|
|      [st, patricks]|    8|
| [fax, november, 30]|    7|
|      [san, antonio]|    6|
|             [back!]|    6|
|[sfa, updating, p...|    6|
|             [lunch]|    4|
|[good, morning, ???]|    4|
|    [enron, on_line]|    2|
|            [moving]|    2|
|               [bio]|    1|
+--------------------+-----+




### Определим ответные сообщения (RE:) и пересылаемые (Fw:)

In [56]:
df2 = cleaned.withColumn("msgtype", 
                         F.when(cleaned.subject.startswith("Re:"),1). \
                         otherwise(F.when(cleaned.subject.startswith("Fw:"),2). \
                         otherwise(0)))

df2.select("msgtype").head(5)

[Row(msgtype=0),
 Row(msgtype=1),
 Row(msgtype=1),
 Row(msgtype=1),
 Row(msgtype=1)]

### Сделаем Pivot по index = неделя, колонка = тип сообщения

In [59]:
df2.groupBy("week").pivot("msgtype").count().show()

+----+----+----+
|week|   0|   1|
+----+----+----+
|  12|   1|   1|
|  47|null|   1|
|  13|null|   1|
|  48|   1|   4|
|  23|   1|null|
|  10|   1|null|
|  24|null|   2|
|  11|   2|   3|
+----+----+----+



### k-means кластеризация для извлечения слов

In [60]:
# слова в вектор
df4 = df2.filter(df2.subject != "")
cvmodel =CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(df4)
featured = cvmodel.transform(df4)
featured.head()

Row(messageid='Message-ID: <17050056.1075858194622.JavaMail.evans@thyme>', date='Mon, 6 Mar 2000 06', from_='mike.carson@enron.com', to_='tara.sweitzer@enron.com', subject='ENRON On_Line', week=10, words=['enron', 'on_line'], filtered=['enron', 'on_line'], msgtype=0, features=SparseVector(26, {15: 1.0, 23: 1.0}))

In [63]:
# kmeans 
kmeans = KMeans().setK(4).setSeed(28)
model = kmeans.fit(featured)
predictions = model.transform(featured)

### LDA для разделения слов на 4 топика (группы)

[Что такое LDA](https://towardsdatascience.com/latent-dirichlet-allocation-lda-9d1cd064ffa2)

In [64]:
lda = LDA().setK(4).setMaxIter(10)
model = lda.fit(featured)
topics = model.describeTopics(4)
topics.show()

+-----+---------------+--------------------+
|topic|    termIndices|         termWeights|
+-----+---------------+--------------------+
|    0| [3, 13, 22, 9]|[0.07443348665158...|
|    1| [0, 1, 21, 20]|[0.07605311403745...|
|    2|[17, 23, 0, 20]|[0.04347601255132...|
|    3|  [11, 5, 6, 4]|[0.08665307397939...|
+-----+---------------+--------------------+



In [65]:
topic_indices = topics.select("termIndices").rdd.map(lambda x:x[0][0]).collect()
[cvmodel.vocabulary[v] for v in topic_indices]

['lunch', 'patricks', 'personal', 'oregon']

## Задание:
### Изучите файл `1_simple_binar_model.ipynb` и сделайте
- 6 топиков для Спам
- 6 топиков для Не Спам сообщений