In [2]:
import findspark
findspark.init()
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import datetime, time 

In [4]:
spark = SparkSession.builder.enableHiveSupport().master("local[*]").appName("Twitter Stat").getOrCreate() 

In [20]:
data = spark.read.parquet("../data/twitter_sentiment.parquet").drop('__index_level_0__').cache()

In [102]:
splitting2 = udf(lambda t: [(int(x.split(' : ')[0]) if x.split(' : ')[0]!='None' else int(-1)) for x in t.split(', ') if t is not None], ArrayType(IntegerType()))

In [108]:
data2 = data.filter(col("retweets_id").isNotNull())\
.withColumn('retweeted_by', splitting2('retweets_id'))

In [105]:
data.filter(col("retweets_id").isNotNull()).select('retweets_id').show(100)

+--------------------+
|         retweets_id|
+--------------------+
|170519827 : Nobuh...|
|77109517 : GiLyn0...|
|82935160 : iAmCar...|
|82935160 : iAmCar...|
|82935160 : iAmCar...|
|2256452099 : kere...|
|171761415 : kenzo...|
|132059870 : passi...|
|132059870 : passi...|
|132059870 : passi...|
|110195545 : herli...|
|132059870 : passi...|
|4864223733 : idek...|
|116242899 : Math_...|
|73769045 : deijah...|
|         None : None|
|634640412 : s_log...|
|         None : None|
|   550440107 : _OHok|
|2808093521 : buen...|
|79877567519262311...|
|127642577 : uhcry...|
|    22461067 : Eat24|
|63032383 : alrush...|
|74002222 : kikilapew|
|     18997982 : Nace|
|31184208 : wouldn...|
|490025224 : beach...|
|220584725 : Indig...|
|485895136 : jakcb...|
|169763569 : DezGr...|
|346224494 : Music...|
|272777994 : Cilla...|
|220584725 : Indig...|
|130429504 : mynam...|
|424026099 : JustK...|
|194339143 : PRang...|
|85564783 : ecowch...|
|52235078 : a_ubes...|
|372622706 : MayaH...|
|55150831 :

In [112]:
data2.withColumn('retweet_by', explode(col('retweeted_by'))).select('username', 'retweet_by').show(100)

+---------------+-----------+
|       username| retweet_by|
+---------------+-----------+
|          Oprah|  170519827|
|          Oprah|  120973356|
|          Oprah|   60787363|
|          Oprah|  116340070|
|          Oprah|   34987008|
|          Oprah|  156160449|
|          Oprah|   44327116|
|          Oprah|  246709170|
|          Oprah|  214405332|
|          Oprah|  245554706|
|          Oprah|  105950414|
|          Oprah|  133736948|
|          Oprah|  225810702|
|          Oprah|  184817646|
|          Oprah|  244965660|
|          Oprah|  194991720|
|          Oprah|  185213404|
|          Oprah|  244013853|
|          Oprah|  241762756|
|          Oprah|   19465759|
|          Oprah|  243036302|
|          Oprah|  242640451|
|          Oprah|   77350975|
|          Oprah|  104064055|
|          Oprah|  167673161|
|          Oprah|   77109517|
|          Oprah|  241462731|
|          Oprah|  165911293|
|          Oprah|  170519827|
|          Oprah|  257592361|
|         

In [43]:
data.select('retweeted_by').show(10)

AnalysisException: "cannot resolve '`retweeted_by`' given input columns: [date_time, username, id, reply_to, url, likes_id, mod_text, positive, retweets, retweets_id, replies, favorites, negative, picture];;\n'Project ['retweeted_by]\n+- AnalysisBarrier\n      +- Project [id#912L, username#913, date_time#914, url#915, reply_to#916, retweets#917L, favorites#918L, likes_id#919, retweets_id#920, picture#921, replies#922L, mod_text#923, positive#924, negative#925]\n         +- Relation[id#912L,username#913,date_time#914,url#915,reply_to#916,retweets#917L,favorites#918L,likes_id#919,retweets_id#920,picture#921,replies#922L,mod_text#923,positive#924,negative#925,__index_level_0__#926L] parquet\n"

In [None]:
wind = Window 
    .partitionBy(col("username")) \
    .rangeBetween(-sys.maxsize, sys.maxsize)

In [None]:
data = data.withColumn("variance", variance("replies").over(Window.partitionBy(col("username"))))

In [None]:
data.select('username', 'replies', 'variance').show(10)

## Length of tweets

In [None]:
_sizes2 = data.filter(col("len").isNotNull()).filter(col("date_time")>datetime.datetime(2017, 11, 7, 0, 0, 0)).select(col("len")).collect()

In [None]:
_sizes_old = data.filter(col("len").isNotNull()).filter(col("date_time")<datetime.datetime(2017, 11, 1, 0, 0, 0)).select(col("len")).collect()

# Visualisation

In [None]:
from __future__ import absolute_import
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import os
# SEA-BORN Modification
def set_style():
    plt.style.use(['seaborn-white', 'seaborn-paper'])
    matplotlib.rc("font", family="Times New Roman")
    sns.set(rc={'figure.figsize': (12, 8)})
    sns.set(font_scale=1.8)
    sns.set_style("whitegrid")

In [None]:
set_style()
def plot_Hist(data, save=False, bins=None, x_lim=(0, 320), color='firebrick', title='Lenght of Twitts', _xlabel='Lenght', yscale='linear'):
    ax = sns.distplot(data, bins=bins, color=color, norm_hist=False, kde=False, hist_kws={"alpha": 0.8},
                      label='Sample')
    ax.set_yscale(yscale)
    ax.set(xlabel=_xlabel, ylabel='Num. of tweets, '+yscale+' scale')
    plt.plot([140, 140], [0, 350000], ls='--', linewidth=2, color='#0084b4')
    plt.xlim(x_lim)
    plt.legend(loc=1)
    plt.tight_layout()
    if save:
        if not os.path.exists('Images/Histograms/'):
            os.makedirs('Images/Histograms/')
        plt.savefig('Images/Histograms/' + title + '.png')
    plt.show()

In [None]:
plot_Hist(_sizes_old, color='#00aced', save=True)

In [None]:
plot_Hist(_sizes2, color='#00aced', save=True, title='280_limit')

In [None]:
plot_Hist(cnt_words, bins=16, color="lightblue", x_lim=(0, 60), _xlabel='Cnt of words')