In [1]:
import os
import socket
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import BooleanType, IntegerType, LongType, StringType, ArrayType, FloatType, StructType, StructField
from pyspark.sql.functions import pandas_udf
from pyspark.sql.functions import PandasUDFType
from pyspark import StorageLevel
from jinja2 import Environment, FileSystemLoader

import pyspark.sql.functions as F
from pyspark.sql.functions import udf, length, when, col
from emoji import get_emoji_regexp, unicode_codes
import re


In [69]:
import numpy as np

In [2]:
# setting constants
APP_NAME = "YOUR_APP_NAME"
NORMALIZED_APP_NAME = APP_NAME.replace('/', '_').replace(':', '_')

APPS_TMP_DIR = os.path.join(os.getcwd(), "tmp")
APPS_CONF_DIR = os.path.join(os.getcwd(), "conf")
APPS_LOGS_DIR = os.path.join(os.getcwd(), "logs")
LOG4J_PROP_FILE = os.path.join(APPS_CONF_DIR, "pyspark-log4j-{}.properties".format(NORMALIZED_APP_NAME))
LOG_FILE = os.path.join(APPS_LOGS_DIR, 'pyspark-{}.log'.format(NORMALIZED_APP_NAME))
EXTRA_JAVA_OPTIONS = "-Dlog4j.configuration=file://{} -Dspark.hadoop.dfs.replication=1 -Dhttps.protocols=TLSv1.0,TLSv1.1,TLSv1.2,TLSv1.3"\
    .format(LOG4J_PROP_FILE)

LOCAL_IP = socket.gethostbyname(socket.gethostname())

# preparing configuration files from templates
for directory in [APPS_CONF_DIR, APPS_LOGS_DIR, APPS_TMP_DIR]:
    if not os.path.exists(directory):
        os.makedirs(directory)

env = Environment(loader=FileSystemLoader('/opt'))
template = env.get_template("pyspark_log4j.properties.template")
template\
    .stream(logfile=LOG_FILE)\
    .dump(LOG4J_PROP_FILE)

# run spark
spark = SparkSession\
    .builder\
    .appName(APP_NAME)\
    .master("k8s://https://10.32.7.103:6443")\
    .config("spark.driver.host", LOCAL_IP)\
    .config("spark.ui.port", "4040")\
    .config("spark.kubernetes.memoryOverheadFactor", "0.6")\
    .config("spark.driver.memory", "4g")\
    .config("spark.driver.bindAddress", "0.0.0.0")\
    .config("spark.executor.instances", "5")\
    .config("spark.executor.cores", '4')\
    .config("spark.executor.memory", "5g")\
    .config("spark.memory.fraction", "0.6")\
    .config("spark.memory.storageFraction", "0.5")\
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")\
    .config("spark.driver.extraJavaOptions", EXTRA_JAVA_OPTIONS)\
    .config("spark.kubernetes.namespace", "gkulagin-307618")\
    .config("spark.kubernetes.driver.label.appname", APP_NAME)\
    .config("spark.kubernetes.executor.label.appname", APP_NAME)\
    .config("spark.kubernetes.container.image.pullPolicy", "Always")\
    .config("spark.kubernetes.container.image", "node03.st:5000/spark-executor:gkulagin-307618")\
    .config("spark.kubernetes.executor.deleteOnTermination", "true")\
    .config("spark.local.dir", "/tmp/spark")\
    .getOrCreate()

In [3]:
# spark.stop()

In [4]:
#read wikipedia texts
test_df = spark.read.text("hdfs:///shared/wiki_corpus_english", lineSep="---END.OF.DOCUMENT---\n")
   # .withColumn("pp_id", F.row_number().over(Window.orderBy("value"))).cache()

In [6]:
import nltk
nltk.data.path.append("/home/jovyan/nfs-home/nltk_data")

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [11]:
#functions for performing tokenozation and cleaning
from pyspark.ml.feature import RegexTokenizer
regexTokenizer = RegexTokenizer(inputCol="cleaned", outputCol="tokens", pattern=r"\s+")

import nltk
from nltk.corpus import stopwords
en_stopwords = stopwords.words("english")

@udf(returnType=ArrayType(StringType()))
def preprocess_udf(tokens):
    return [word for word in tokens if (word not in en_stopwords) and (len(word) > 3)]

def explode_words(base_df, text_col_name):
    return regexTokenizer.transform(
            base_df.withColumn("cleaned", F.regexp_replace(text_col_name, "([^a-zA-Z\s]|\n)+", " "))
        ).withColumn("words_array", preprocess_udf("tokens"))

In [12]:
words = explode_words(test_df, 'value').repartition(10)

In [13]:
words.show(2)

+--------------------+-------+--------------------+--------------------+--------------------+
|               value|  pp_id|             cleaned|              tokens|         words_array|
+--------------------+-------+--------------------+--------------------+--------------------+
|
Hong Kong films ...|1201283| Hong Kong films ...|[hong, kong, film...|[hong, kong, film...|
|
Juha Mieto.
Juha...|1413176| Juha Mieto Juha ...|[juha, mieto, juh...|[juha, mieto, juh...|
+--------------------+-------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [14]:
from pyspark.ml.feature import CountVectorizer

In [15]:
#Model for creating word matrix 
Vector = CountVectorizer(inputCol="words_array", outputCol="vectors", vocabSize=5000, minDF=10.0)
model = Vector.fit(words)

In [16]:
result = model.transform(words)

In [17]:
result.show()

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+
|               value|  pp_id|             cleaned|              tokens|         words_array|             vectors|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+
|
Hong Kong films ...|1201283| Hong Kong films ...|[hong, kong, film...|[hong, kong, film...|(5000,[228,241,54...|
|
Juha Mieto.
Juha...|1413176| Juha Mieto Juha ...|[juha, mieto, juh...|[juha, mieto, juh...|(5000,[0,8,25,31,...|
|
16053 Brennan.
1...|  10615|  Brennan  Brenna...|[brennan, brennan...|[brennan, brennan...|(5000,[34,118,165...|
|
Nilsson Sings Ne...|1959996| Nilsson Sings Ne...|[nilsson, sings, ...|[nilsson, sings, ...|(5000,[10,22,38,4...|
|
Aleksandr Usov.
...| 163767| Aleksandr Usov A...|[aleksandr, usov,...|[aleksandr, usov,...|(5000,[2,8,10,25,...|
|
LCDR R class.
Th...|1520217| LCDR R class The...|[lcdr, r, class, ...|[lcdr, c

In [30]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vector, Vectors

In [32]:
corpus = result.select("pp_id", "vectors").rdd.map(lambda x: [x[0],Vectors.fromML(x[1])]).cache()

In [33]:
# performing LDA clustering
num_topics = 10
max_iterations = 100
ldaModel = LDA.train(corpus, k=num_topics,maxIterations=max_iterations)


In [34]:
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

In [41]:
wordNumbers = 10  # number of words per topic
topicIndices = SparkContext.getOrCreate().parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers), 10)

In [49]:
def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    values = topic[1]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        value = values[i]
        result.append([term, value])
    return result

In [50]:
topics_final_2 = topicIndices.map(lambda topic: topic_render(topic)).collect()

In [44]:
#topics and top words for each topic
for topic in range(len(topics_final)):
    print ("Topic" + str(topic) + ":")
    for term in topics_final[topic]:
        print (term)
    print ('\n')

Topic0:
born
john
church
work
first
life
became
family
later
died


Topic1:
used
system
also
time
number
formula
using
based
design
systems


Topic2:
government
party
political
would
people
court
election
state
union
country


Topic3:
british
army
first
force
battle
forces
military
august
united
september


Topic4:
city
area
south
north
county
located
town
west
district
river


Topic5:
school
university
college
state
national
high
public
company
students
also


Topic6:
also
many
used
known
found
often
species
long
name
water


Topic7:
film
series
show
also
time
would
television
story
first
character


Topic8:
first
team
season
game
played
league
club
football
career
games


Topic9:
album
music
band
released
song
years
single
also
population
american




In [51]:
topics_final_2

[[['born', 0.014906143038709973],
  ['john', 0.013290677702466579],
  ['church', 0.011054318910827431],
  ['work', 0.010994565165982583],
  ['first', 0.0103675867410336],
  ['life', 0.010058503577044173],
  ['became', 0.010047711777119151],
  ['family', 0.009761875628137085],
  ['later', 0.008955850745890118],
  ['died', 0.008813185240909877]],
 [['used', 0.011228794188256693],
  ['system', 0.010217236054961786],
  ['also', 0.0072296286060304525],
  ['time', 0.005160303839175323],
  ['number', 0.0049987006591577605],
  ['formula', 0.004867458011006839],
  ['using', 0.0048570965120763795],
  ['based', 0.0046646660141394635],
  ['design', 0.0042210956678548454],
  ['systems', 0.004077324038620379]],
 [['government', 0.012817274514444723],
  ['party', 0.01108290696294041],
  ['political', 0.007165457291397806],
  ['would', 0.006401080253748718],
  ['people', 0.006158141079973749],
  ['court', 0.006156535999765254],
  ['election', 0.006014939332135301],
  ['state', 0.005848695782747611],
 

In [48]:
topicIndices.collect()

[([25, 67, 103, 46, 1, 50, 24, 36, 11, 150],
  [0.014906143038709973,
   0.013290677702466579,
   0.011054318910827431,
   0.010994565165982583,
   0.0103675867410336,
   0.010058503577044173,
   0.010047711777119151,
   0.009761875628137085,
   0.008955850745890118,
   0.008813185240909877]),
 ([9, 68, 0, 2, 47, 296, 200, 66, 339, 370],
  [0.011228794188256693,
   0.010217236054961786,
   0.0072296286060304525,
   0.005160303839175323,
   0.0049987006591577605,
   0.004867458011006839,
   0.0048570965120763795,
   0.0046646660141394635,
   0.0042210956678548454,
   0.004077324038620379]),
 ([54, 92, 203, 5, 32, 209, 273, 15, 245, 140],
  [0.012817274514444723,
   0.01108290696294041,
   0.007165457291397806,
   0.006401080253748718,
   0.006158141079973749,
   0.006156535999765254,
   0.006014939332135301,
   0.005848695782747611,
   0.005200455176285104,
   0.005124696493549132]),
 ([65, 137, 1, 186, 234, 263, 190, 114, 14, 112],
  [0.011300607536734201,
   0.010504153714662241,
   0

In [52]:
from pyspark.mllib.feature import Normalizer
labels = result.select('value')
features = result.select('vectors')


In [56]:
#calcualting word count for each paper
result.withColumn('words_count', F.size('tokens')).show(5)

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-----------+
|               value|  pp_id|             cleaned|              tokens|         words_array|             vectors|words_count|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-----------+
|
Hong Kong films ...|1201283| Hong Kong films ...|[hong, kong, film...|[hong, kong, film...|(5000,[228,241,54...|         13|
|
Juha Mieto.
Juha...|1413176| Juha Mieto Juha ...|[juha, mieto, juh...|[juha, mieto, juh...|(5000,[0,8,25,31,...|        169|
|
16053 Brennan.
1...|  10615|  Brennan  Brenna...|[brennan, brennan...|[brennan, brennan...|(5000,[34,118,165...|         22|
|
Nilsson Sings Ne...|1959996| Nilsson Sings Ne...|[nilsson, sings, ...|[nilsson, sings, ...|(5000,[10,22,38,4...|         81|
|
Aleksandr Usov.
...| 163767| Aleksandr Usov A...|[aleksandr, usov,...|[aleksandr, usov,...|(5000,[2,8,10,25,.

In [68]:
df = result.select('value', col('vectors'))

In [70]:
@udf(returnType=FloatType())
def cos_sim_udf(a,b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

In [78]:
#calculationg top separated articles
new_df = df.crossJoin(df.select(col("vectors").alias('vectors_2')))\
    .withColumn("product", cos_sim_udf(col('vectors'), col('vectors_2')))\
    .where("product<1").groupBy(col("vectors")).agg(F.max("product").name('max_product'))\
    .orderBy('max_product')

In [None]:
new_df.show()

In [None]:
#end of project

In [57]:
import matplotlib.pyplot as plt

In [58]:
plt.hist(result.select('words_count'))

AnalysisException: cannot resolve '`words_count`' given input columns: [cleaned, pp_id, tokens, value, vectors, words_array];
'Project ['words_count]
+- Project [value#0, pp_id#3, cleaned#111, tokens#118, words_array#127, UDF(words_array#127) AS vectors#241]
   +- Repartition 10, true
      +- Project [value#0, pp_id#3, cleaned#111, tokens#118, preprocess_udf(tokens#118) AS words_array#127]
         +- Project [value#0, pp_id#3, cleaned#111, UDF(cleaned#111) AS tokens#118]
            +- Project [value#0, pp_id#3, regexp_replace(value#0, ([^a-zA-Z\s]|
)+,  , 1) AS cleaned#111]
               +- Sample 0.0, 0.01, false, 5037635142968484735
                  +- Project [value#0, pp_id#3]
                     +- Project [value#0, pp_id#3, pp_id#3]
                        +- Window [row_number() windowspecdefinition(value#0 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS pp_id#3], [value#0 ASC NULLS FIRST]
                           +- Project [value#0]
                              +- Relation[value#0] text


In [53]:
normalizer = Normalizer()
data_2 = labels.zip(normalizer.transform(features))

AttributeError: 'DataFrame' object has no attribute 'zip'

In [3]:
posts_df = spark.read.json("/shared/bigdata20/posts_api.json")

In [4]:
posts_likes_df = spark.read.parquet("/shared/bigdata20/posts_likes.parquet")
followers_df = spark.read.parquet("/shared/bigdata20/followers.parquet")
followers_posts_df = spark.read.json("/shared/bigdata20/followers_posts_api_final.json")
followers_posts_likes_df = spark.read.parquet("/shared/bigdata20/followers_posts_likes.parquet")

## Task 1 
Find the top 20 posts in the group: (a) by likes; (b) by comments; (c) by reposts. 

In [5]:
posts_df.printSchema()

root
 |-- attachments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- album: struct (nullable = true)
 |    |    |    |-- created: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |-- size: long (nullable = true)
 |    |    |    |-- thumb: struct (nullable = true)
 |    |    |    |    |-- access_key: string (nullable = true)
 |    |    |    |    |-- album_id: long (nullable = true)
 |    |    |    |    |-- date: long (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |    |-- sizes: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |-- type: string (nullable = true)
 |   

In [7]:
#(a) by likes
posts_df.select('id', col('likes.count').name('likes count'))\
        .orderBy("likes.count", ascending=False)\
        .show(20)

+-----+-----------+
|   id|likes count|
+-----+-----------+
|32022|       1637|
|35068|       1629|
|17492|       1516|
|18526|       1026|
|19552|        955|
|41468|        952|
|19419|        868|
|29046|        824|
|32546|        786|
|24085|        765|
|40180|        759|
|33658|        708|
|13532|        633|
|40842|        631|
|35117|        588|
|17014|        581|
|19583|        553|
|19809|        552|
|27455|        550|
|11999|        549|
+-----+-----------+
only showing top 20 rows



In [8]:
#(b) by comments
posts_df.select('id', col('comments.count').name('comments count'))\
        .orderBy("comments.count", ascending=False)\
        .show(20)

+-----+--------------+
|   id|comments count|
+-----+--------------+
|24085|           850|
|22540|           250|
|27722|           192|
| 8285|           148|
|26860|           113|
|13571|           107|
|39294|           104|
|36680|            96|
|41739|            92|
|26006|            92|
|12426|            91|
|21499|            88|
|39163|            83|
|39407|            83|
|11267|            81|
|31548|            80|
|11158|            70|
|39082|            67|
|14602|            61|
|12687|            61|
+-----+--------------+
only showing top 20 rows



In [9]:
#(c) by reposts
posts_df.select('id', col('reposts.count').name('reposts count'))\
        .orderBy("reposts.count", ascending=False)\
        .show(20)

+-----+-------------+
|   id|reposts count|
+-----+-------------+
|17492|          334|
|19552|          246|
|32022|          210|
|11842|          129|
|19419|          126|
|13532|          110|
|17014|          105|
|35068|          101|
|41266|           92|
|12593|           90|
|29046|           87|
|11999|           85|
|41468|           85|
|19809|           84|
|17167|           81|
|10833|           78|
|18543|           77|
|16596|           76|
|18156|           74|
|37262|           71|
+-----+-------------+
only showing top 20 rows



## Task 2 
Find the top 20 users by (a) likes and (b) reposts they have made (to trace reposts use "copy_history" field) 

In [10]:
posts_likes_df.printSchema()

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [11]:
#top 20 users by likes they have made
posts_likes_df.groupby('likerId')\
              .agg(F.count('itemId').name('likes_count'))\
              .orderBy('likes_count', ascending=False)\
              .select(col('likerId').name('user_id'), 'likes_count')\
              .show(20)

+---------+-----------+
|  user_id|likes_count|
+---------+-----------+
|  2070090|       4801|
|  2397858|       2055|
|  1475301|       1829|
|    18239|       1569|
|   546612|       1245|
|     6371|        907|
|  1841959|        746|
| 78440957|        709|
|   120248|        699|
| 40981497|        611|
|    22158|        553|
|207628162|        548|
|329377723|        504|
| 76071304|        474|
| 14805173|        440|
|   317799|        385|
| 56355640|        375|
| 52042971|        338|
|  7437271|        336|
|136506644|        335|
+---------+-----------+
only showing top 20 rows



In [12]:
followers_posts_df.printSchema()

root
 |-- attachments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- album: struct (nullable = true)
 |    |    |    |-- created: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |-- size: long (nullable = true)
 |    |    |    |-- thumb: struct (nullable = true)
 |    |    |    |    |-- access_key: string (nullable = true)
 |    |    |    |    |-- album_id: long (nullable = true)
 |    |    |    |    |-- date: long (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- lat: double (nullable = true)
 |    |    |    |    |-- long: double (nullable = true)
 |    |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |    |-- sizes: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    

In [13]:
#top 20 users by reposts they have made
followers_posts_df.select('owner_id', col("copy_history.id").getItem(0).name('post_id'))\
        .na.drop()\
        .groupby('owner_id')\
        .agg(F.count('post_id').name('posts_num'))\
        .orderBy('posts_num', ascending=False)\
        .show()

+---------+---------+
| owner_id|posts_num|
+---------+---------+
|  2547211|    37742|
|357231922|    23349|
|168543860|    18429|
| 25646344|    11122|
|176861294|     9022|
|524656784|     7242|
|    29840|     7164|
|143207077|     7161|
|141687240|     6804|
|459339006|     6741|
|514384760|     6570|
|483715951|     6052|
|445159771|     5808|
|451211328|     5646|
|426396104|     5533|
|  8325325|     5532|
|452280411|     5458|
|464220898|     5318|
|440454268|     5304|
|461319529|     5240|
+---------+---------+
only showing top 20 rows



## Task 3 
get reposts of the original posts of the itmo group (posts.json) from user posts (the result should be similar to (group_post_id, Array (user_post_ids)))

In [14]:
followers_posts_df.select(
                        col('id').name('user_post_id'),
                        col("copy_history.id").getItem(0).name("post_id"),
                        col("copy_history.owner_id").getItem(0).name("owner_id")
                        )\
                  .join(
                        posts_df.select(col('id').name('post_id'), "owner_id"),
                        ['owner_id', 'post_id']
                        )\
                  .groupBy('post_id')\
                  .agg(F.collect_list('user_post_id').name('user_post_ids'))\
                  .withColumn('reposts_num', F.size('user_post_ids'))\
                  .orderBy('reposts_num', ascending=False)\
                  .show()
                                       

+-------+--------------------+-----------+
|post_id|       user_post_ids|reposts_num|
+-------+--------------------+-----------+
|  41266|[1265, 1748, 88, ...|         30|
|  41468|[1758, 1202, 390,...|         25|
|  42482|[1190, 264, 2033,...|         10|
|  40090|[349, 1760, 1325,...|          9|
|  39259|[1205, 10810, 106...|          8|
|  38740|[31900, 9561, 113...|          8|
|  41721|[274, 8, 10896, 4...|          6|
|  41207|[958, 2960, 4812,...|          6|
|  41546|[1161, 666, 3908,...|          6|
|  38963|[4868, 6403, 3720...|          5|
|  41506|[397, 398, 670, 1...|          5|
|  39682|[3368, 159, 600, ...|          5|
|  41708|[271, 10107, 5112...|          4|
|  42730|[7129, 4201, 5561...|          4|
|  39294|[2319, 4718, 4516...|          4|
|  38915|[18054, 4487, 470...|          4|
|  39686|[3370, 4741, 305,...|          4|
|  39515|[9218, 443, 7390,...|          4|
|  41108|[10559, 182, 366,...|          4|
|  41424|   [4125, 6941, 529]|          3|
+-------+--

## Task 4 
find emoticons in posts (negative, positive, neutral), calculate their overall count,  frequency (number of posts they can be found in) and average count per post. Print top 10 most popular emoticons, print top 5 emoticons which have the greatest difference between their overall count and frequency, print top 5 emoticons with average count per post.


In [15]:
!pip install --user --trusted-host pypi-registry.supplementary-services.svc.cluster.local --index http://pypi-registry.supplementary-services.svc.cluster.local:8080/ emoji

Looking in indexes: http://pypi-registry.supplementary-services.svc.cluster.local:8080/


In [16]:
import sys
!{sys.executable} -m pip install --user --trusted-host pypi-registry.supplementary-services.svc.cluster.local --index http://pypi-registry.supplementary-services.svc.cluster.local:8080/ emoji


Looking in indexes: http://pypi-registry.supplementary-services.svc.cluster.local:8080/


In [17]:
from emoji import get_emoji_regexp, unicode_codes

In [18]:
@udf(returnType=ArrayType(StringType()))
def get_emoji_udf(text):
    all_matches = get_emoji_regexp().finditer(text)
    emoji = []
    for match in all_matches:
        emoji.append(match.group())
    
    return emoji

In [19]:
posts_df.printSchema()

root
 |-- attachments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- album: struct (nullable = true)
 |    |    |    |-- created: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |-- size: long (nullable = true)
 |    |    |    |-- thumb: struct (nullable = true)
 |    |    |    |    |-- access_key: string (nullable = true)
 |    |    |    |    |-- album_id: long (nullable = true)
 |    |    |    |    |-- date: long (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- owner_id: long (nullable = true)
 |    |    |    |    |-- sizes: array (nullable = true)
 |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |-- height: long (nullable = true)
 |    |    |    |    |    |    |-- type: string (nullable = true)
 |   

In [20]:
emoji_df = posts_df.where("text <> ''").select('id', get_emoji_udf(col("text")).name('emoji'))\
        .where(F.size('emoji') > 0)\
        .withColumn('emoji', F.explode("emoji"))\
        .groupBy('emoji')\
        .agg(F.count('id').name('all_count'), F.countDistinct('id').name('posts_count'))\
        .withColumn('avg_per_post', col('all_count') / col('posts_count'))\
        .withColumn('difference', col('all_count') - col('posts_count'))\
        .orderBy(F.desc('all_count'))


emoji_df.limit(5)\
        .toPandas()

Unnamed: 0,emoji,all_count,posts_count,avg_per_post,difference
0,🔥,76,61,1.245902,15
1,⚡,68,45,1.511111,23
2,📍,63,39,1.615385,24
3,🚀,50,47,1.06383,3
4,❤,47,47,1.0,0


Print top 10 most popular emoticons

In [21]:
emoji_df.limit(10)\
        .toPandas()

Unnamed: 0,emoji,all_count,posts_count,avg_per_post,difference
0,🔥,76,61,1.245902,15
1,⚡,68,45,1.511111,23
2,📍,63,39,1.615385,24
3,🚀,50,47,1.06383,3
4,❤,47,47,1.0,0
5,❗,45,20,2.25,25
6,🔵,40,17,2.352941,23
7,⬇,40,16,2.5,24
8,✔,40,9,4.444444,31
9,💙,38,38,1.0,0


Print top 5 emoticons which have the greatest difference between their overall count and frequency

In [22]:
emoji_df.orderBy(F.desc('difference'))\
        .limit(5)\
        .toPandas()

Unnamed: 0,emoji,all_count,posts_count,avg_per_post,difference
0,✔,40,9,4.444444,31
1,❗,45,20,2.25,25
2,⬇,40,16,2.5,24
3,📍,63,39,1.615385,24
4,🔵,40,17,2.352941,23


Print top 5 emoticons with average count per post

In [23]:
emoji_df.orderBy(F.desc('avg_per_post'))\
        .limit(5)\
        .toPandas()

Unnamed: 0,emoji,all_count,posts_count,avg_per_post,difference
0,✔,40,9,4.444444,31
1,🔹,30,7,4.285714,23
2,▪,21,5,4.2,16
3,✒,7,2,3.5,5
4,📷,3,1,3.0,2


## Task 5 
Probable “fans”. Find for each user the top 10 other users whose posts this user likes. 

In [24]:
followers_posts_likes_df.printSchema()

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [25]:
from pyspark.sql.window import Window

In [26]:
followers_posts_likes_df.select(col('ownerId').name('user_id'), 'itemId', 'likerId')\
                        .groupBy('likerId', 'user_id')\
                        .agg(F.count('itemId').name("likes_count"))\
                        .withColumn("top", F.row_number().over(Window.partitionBy("likerId").orderBy(F.desc("likes_count"))))\
                        .where('top <= 10')\
                        .orderBy('likerId', F.desc("likes_count"))\
                        .show()

+-------+--------+-----------+---+
|likerId| user_id|likes_count|top|
+-------+--------+-----------+---+
|      9|  654356|          4|  1|
|     14| 1986125|          2|  1|
|     14|    3420|          1|  2|
|     14|   35524|          1|  3|
|     14| 3680017|          1|  4|
|     15| 1227412|          2|  2|
|     15|  473831|          2|  1|
|     17| 1986125|         10|  1|
|     17|    3420|          3|  2|
|     17|   88060|          2|  3|
|     34|   50601|          1|  2|
|     34| 1986125|          1|  1|
|     63|   73115|          1|  1|
|     99| 1550591|          1|  1|
|    122|   55983|          1|  1|
|    143|     637|          3|  1|
|    146|19515730|          3|  1|
|    146|  106610|          2|  2|
|    149| 1986125|          1|  1|
|    154| 1601367|          1|  1|
+-------+--------+-----------+---+
only showing top 20 rows



## Task 6
Probable friends. If two users like each other posts they may be friends. Find pairs of users where both users are top likers of each other.

In [27]:
posts_likes_df.printSchema()

root
 |-- itemType: string (nullable = true)
 |-- ownerId: integer (nullable = true)
 |-- itemId: integer (nullable = true)
 |-- likerId: integer (nullable = true)



In [6]:
user_df_1 = followers_posts_likes_df\
    .select(col("ownerId").name("user_1"), col("likerId").alias("user_2"), col("itemId"))

user_df_2 = followers_posts_likes_df\
    .select(col("ownerId").name("user_2"), col("likerId").alias("user_1"))

user_df_3 = user_df_1.join(user_df_2, ["user_1", "user_2"]).where("user_1 != user_2")\
    .groupBy("user_1", "user_2").agg(F.count("itemId").alias("cnt_posts"))\
    .orderBy(F.desc("cnt_posts"))

In [9]:
user_df_3.withColumn('first_user', F.least('user_1', 'user_2'))\
         .withColumn('second_user', F.greatest('user_1', 'user_2'))\
         .drop('user_1', 'user_2')\
         .dropDuplicates()\
         .orderBy(F.desc('cnt_posts'))\
         .show()

+---------+----------+-----------+
|cnt_posts|first_user|second_user|
+---------+----------+-----------+
|    16100|  13675440|  183535934|
|     1972|   1475301|    5633955|
|     1612| 207134315|  208946862|
|     1456| 145105762|  267301242|
|      880|    108408|    7697818|
|      810|    135451|   18737802|
|      720| 209077977|  272076217|
|      684| 155963006|  162366815|
|      608|   2547211|    4448812|
|      598|  19261491|  229861638|
|      572|  53368685|  322831238|
|      544|  52612744|   53720099|
|      506|  66022003|   95356919|
|      390|  83892412|  115252127|
|      376|  47122493|   63363182|
|      330|  27102997|   74634237|
|      330|  44770563|  103229751|
|      312| 101767883|  188548515|
|      300|   1475301|    4068532|
|      299|  34892097|   59949877|
+---------+----------+-----------+
only showing top 20 rows

