In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import datetime, time 

In [3]:
# Start (or reuse) a Hive-enabled Spark session on the local machine,
# using every available core.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .master("local[*]")
    .appName("Twitter Stat")
    .getOrCreate()
)

In [71]:
# Read the sentiment parquet file, discard the `__index_level_0__` column
# and mark the resulting frame for caching.
data = (
    spark.read
    .parquet("../data/twitter_sentiment.parquet")
    .drop('__index_level_0__')
    .cache()
)

## Data modification, feature extraction

In [104]:
# Window that groups rows sharing the same `username`.
wind = Window.partitionBy(col("username"))


def _leading_ids(t):
    """Return the integer that starts each ', '-separated entry of ``t``.

    Only the text before the first ``' : '`` of every entry is used; the
    literal text ``'None'`` is mapped to -1, anything else goes through
    ``int()``.

    A missing value -- ``None``, an empty string, or the -2 placeholder in
    numeric or string form -- yields an empty list. The previous inline
    guard (``if t != -2``) was evaluated only after ``t.split`` had already
    been called, so it could not protect a non-string value, and because it
    compared against an int it never matched the text "-2" either.
    """
    if t in (None, -2, '', '-2'):
        return []
    ids = []
    for entry in t.split(', '):
        head = entry.split(' : ')[0]
        ids.append(-1 if head == 'None' else int(head))
    return ids


# NOTE(review): IntegerType is 32-bit -- confirm the parsed ids always fit,
# otherwise LongType would be needed here.
splitting = udf(_leading_ids, ArrayType(IntegerType()))

In [111]:
def _score_summary(score_column):
    """Bundle one score column with its statistics over `wind` in a struct.

    Field order: `score` (the row's own value), then `var`, `mean`, `max`,
    `min` and `cnt`, each computed over the `wind` window. `max` and `min`
    here are the functions brought in by the wildcard import of
    `pyspark.sql.functions`, not the Python builtins.
    """
    score = col(score_column)
    return struct(
        score.alias('score'),
        variance(score).over(wind).alias("var"),
        mean(score).over(wind).alias("mean"),
        max(score).over(wind).alias("max"),
        min(score).over(wind).alias("min"),
        count(score).over(wind).alias("cnt"),
    )


result = (
    data
    .withColumn("id", col("id").cast(LongType()))
    # Each raw score column is replaced by a struct holding the score and
    # its window statistics.
    .withColumn("Positive", _score_summary("positive_score"))
    .drop("positive_score")
    .withColumn("Neutral", _score_summary("neutral_score"))
    .drop("neutral_score")
    .withColumn("Negative", _score_summary("negative_score"))
    .drop("negative_score")
    # Replace nulls in the two id columns with -2 before parsing them.
    .na.fill({'retweets_id': -2, 'likes_id': -2})
    .withColumn('retweeted_by', splitting('retweets_id'))
    .withColumn('liked_by', splitting('likes_id'))
    .drop('retweets_id', 'likes_id')
)

In [115]:
# Write the transformed frame as parquet, replacing whatever already exists
# at the target path.
(
    result.write
    .mode("overwrite")
    .parquet("../data/twitter_v2.parquet")
)

In [116]:
# Read the exported parquet back and mark it for caching.
t = (
    spark.read
    .parquet("../data/twitter_v2.parquet")
    .cache()
)

In [118]:
# Print the column names and types of the re-loaded frame.
t.printSchema()

root
 |-- username: string (nullable = true)
 |-- id: long (nullable = true)
 |-- text: string (nullable = true)
 |-- date_time: timestamp (nullable = true)
 |-- url: string (nullable = true)
 |-- reply_to: double (nullable = true)
 |-- retweets: double (nullable = true)
 |-- favorites: double (nullable = true)
 |-- picture: string (nullable = true)
 |-- replies: double (nullable = true)
 |-- x: string (nullable = true)
 |-- name: string (nullable = true)
 |-- tweets_number: double (nullable = true)
 |-- followers_number: double (nullable = true)
 |-- following_number: double (nullable = true)
 |-- favorites_number: double (nullable = true)
 |-- bio: string (nullable = true)
 |-- place: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- site: string (nullable = true)
 |-- birth: string (nullable = true)
 |-- creation: string (nullable = true)
 |-- Positive: struct (nullable = true)
 |    |-- score: double (nullable = true)
 |    |-- var: double (nullable = true)
 |  

## Length of tweets

In [None]:
# Non-null `len` values of rows dated after 2017-11-07 00:00:00, collected
# to the driver as a list of single-field rows.
# NOTE(review): the (truncated) schema printed above for the derived frame
# shows no `len` among the original columns -- confirm where `data` gets it.
_sizes2 = (
    data
    .filter(col("len").isNotNull())
    .filter(col("date_time") > datetime.datetime(2017, 11, 7))
    .select("len")
    .collect()
)

In [None]:
# Non-null `len` values of rows dated before 2017-11-01 00:00:00, collected
# to the driver as a list of single-field rows.
# NOTE(review): the (truncated) schema printed above for the derived frame
# shows no `len` among the original columns -- confirm where `data` gets it.
_sizes_old = (
    data
    .filter(col("len").isNotNull())
    .filter(col("date_time") < datetime.datetime(2017, 11, 1))
    .select("len")
    .collect()
)

# Visualisation

In [None]:
from __future__ import absolute_import
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import os
# SEA-BORN Modification
def set_style():
    """Configure matplotlib/seaborn defaults for the figures in this notebook.

    Resulting settings: the "whitegrid" seaborn style, font scale 1.8,
    a 12x8 inch default figure size and "Times New Roman" as font family.

    The font family is applied last on purpose: `sns.set` and
    `sns.set_style` both reset `font.family`, so setting it before them
    (as the earlier version did) had no lasting effect.
    """
    plt.style.use(['seaborn-white', 'seaborn-paper'])
    # A single seaborn call, so that no later call resets an earlier option.
    sns.set(style="whitegrid", font_scale=1.8, rc={'figure.figsize': (12, 8)})
    matplotlib.rc("font", family="Times New Roman")

In [None]:
set_style()


def plot_Hist(data, save=False, bins=None, x_lim=(0, 320), color='firebrick',
              title='Lenght of Twitts', _xlabel='Lenght', yscale='linear',
              limit=140, limit_height=350000):
    """Draw a count histogram of `data`, optionally saving it as a PNG.

    Parameters
    ----------
    data : values handed straight to `sns.distplot`.
    save : when true, the figure is written to
        'Images/Histograms/<title>.png' (the directory is created if absent).
    bins : forwarded to `sns.distplot`.
    x_lim : limits applied to the x axis.
    color : colour of the histogram bars.
    title : used only to build the output file name; it is not drawn on the
        figure. The default spelling is kept so existing file names stay
        the same.
    _xlabel : label of the x axis.
    yscale : scale of the y axis; also appended to the y-axis label.
    limit : x position of the dashed vertical marker line. Pass None to
        omit the line. Defaults to 140, the previously hard-coded value.
    limit_height : y value up to which the marker line is drawn. Defaults
        to 350000, the previously hard-coded value.
    """
    ax = sns.distplot(data, bins=bins, color=color, norm_hist=False, kde=False,
                      hist_kws={"alpha": 0.8}, label='Sample')
    ax.set_yscale(yscale)
    ax.set(xlabel=_xlabel, ylabel='Num. of tweets, ' + yscale + ' scale')
    if limit is not None:
        # The line takes part in autoscaling, so it also stretches the
        # y axis up to `limit_height`.
        plt.plot([limit, limit], [0, limit_height], ls='--', linewidth=2, color='#0084b4')
    plt.xlim(x_lim)
    plt.legend(loc=1)
    plt.tight_layout()
    if save:
        out_dir = 'Images/Histograms/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        plt.savefig(out_dir + title + '.png')
    plt.show()

In [None]:
# Histogram of `_sizes_old`; saved under the function's default title.
plot_Hist(_sizes_old, save=True, color='#00aced')

In [None]:
# Histogram of `_sizes2`; saved as '280_limit.png'.
plot_Hist(_sizes2, title='280_limit', save=True, color='#00aced')

In [None]:
# Histogram with 16 bins on an x range of 0-60; not saved to disk.
# NOTE(review): `cnt_words` is not defined in any cell visible here --
# confirm it is created before this cell runs.
plot_Hist(cnt_words, bins=16, color="lightblue", x_lim=(0, 60), _xlabel='Cnt of words')