In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import findspark
findspark.init()

import pyspark
from pyspark.sql import *
import pyspark.sql.functions as func
from pyspark.sql.types import *

# Language processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

# Language processing with TextBlob
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

from collections import Counter

In [2]:
# Obtain the Spark session (reusing an existing one if present) and
# expose its underlying SparkContext for the rest of the notebook.
spark = (
    SparkSession.builder
    .getOrCreate()
)
sc = spark.sparkContext

# Average over the first 8 days of January 2017

In [3]:
# Read the pre-computed January 2017 NLP metrics from the data directory.
nlp_metrics = spark.read.load(
    '../data/daily_2017_01_nlp_metrics.parquet/'
)

In [20]:
# Bring every row of the metrics DataFrame to the driver and keep the
# first record; the bare `row` expression displays it as the cell output.
rows = nlp_metrics.collect()
row = rows[0]
row

Row(creation_date=datetime.date(2017, 1, 1), nltk_negativity_60d_avg=180013.25666697026, nltk_neutrality_60d_avg=1776462.7587395571, nltk_positivity_60d_avg=324456.70514096733, text_blob_polarity_60d_avg=20857.047301106853, text_blob_subjectivity_60d_avg=83736.125, nb_bw_matches_60d_avg=217541.375, nb_hw_matches_60d_avg=39619.375, hw_ref_intensity_60d_avg=3832.16623198241, nb_hw_ref_matches_60d_avg=6217.75)

In [24]:
# Extract each averaged metric from the Row by field name instead of by
# position. Positional access previously assigned index 8
# (hw_ref_intensity_60d_avg) to `hw_ref_matches_8_avg`, whereas the match
# count lives in nb_hw_ref_matches_60d_avg (index 9). Name-based access
# removes that off-by-one and stays correct if the column order changes.
nltk_neg_8_avg = row['nltk_negativity_60d_avg']
nltk_neu_8_avg = row['nltk_neutrality_60d_avg']
nltk_pos_8_avg = row['nltk_positivity_60d_avg']

# Sum of the three NLTK scores, used to express each one as a proportion.
nltk_total = nltk_neg_8_avg + nltk_neu_8_avg + nltk_pos_8_avg

blob_pol_8_avg = row['text_blob_polarity_60d_avg']
blob_subj_8_avg = row['text_blob_subjectivity_60d_avg']
bw_matches_8_avg = row['nb_bw_matches_60d_avg']
hw_matches_8_avg = row['nb_hw_matches_60d_avg']

# Intensity and match count are distinct metrics; keep both available.
hw_ref_intensity_8_avg = row['hw_ref_intensity_60d_avg']
hw_ref_matches_8_avg = row['nb_hw_ref_matches_60d_avg']

In [23]:
# Report the averaged NLTK negativity / neutrality / positivity scores
# (positions 1..3 of the Row, unpacked in order).
print(
    'Over 8 day, the average negativity is {}, neutrality is {}, positivity is {}'
    .format(*row[1:4])
)

Over 8 day, the average negativity is 180013.25666697026, neutrality is 1776462.7587395571, positivity is 324456.70514096733


In [25]:
# Report each NLTK score as a share of the three scores combined
# (`nltk_total` is the sum of Row positions 1..3).
print(
    'Over 8 day, the proportion of negativity is {}, neutrality is {}, positivity is {}'
    .format(*(row[position] / nltk_total for position in (1, 2, 3)))
)

Over 8 day, the proportion of negativity is 0.07892089715989585, neutrality is 0.778831722100576, positivity is 0.1422473807395282


Therefore, we notice that positivity in posts is roughly twice as prevalent as negativity (about 14% versus 8%), while the large majority of the content (about 78%) is neutral.

# First 5 days of January 2017

In [26]:
# Read the per-post NLP metrics dataset from the data directory.
nlp_metrics_full = spark.read.load(
    '../data/daily_nlp_metrics_full.parquet/'
)

In [27]:
# Print a preview of the per-post metrics table to check its columns and values.
nlp_metrics_full.show()

+-------+-------------+--------------------+---------------+---------------+---------------+------------------+----------------------+-------------+-------------+----------------+-----------------+
|     id|creation_date|                body|nltk_negativity|nltk_neutrality|nltk_positivity|text_blob_polarity|text_blob_subjectivity|nb_bw_matches|nb_hw_matches|hw_ref_intensity|nb_hw_ref_matches|
+-------+-------------+--------------------+---------------+---------------+---------------+------------------+----------------------+-------------+-------------+----------------+-----------------+
|dby6jwk|   2017-01-03|GoW 4 was release...|            0.0|            1.0|            0.0|               0.0|                   0.0|          0.0|          0.0|             0.0|              0.0|
|dby6jwl|   2017-01-03|Firmino really do...|            0.0|            1.0|            0.0|               0.0|                   0.0|          0.0|          0.0|             0.0|              0.0|
|dby6jwm| 

In [37]:
# Sum the numeric metrics per creation date. The constant 'dummy' column of
# ones becomes, once summed, the number of rows (posts) for that day.
nlp_per_day = (
    nlp_metrics_full
    .withColumn('dummy', func.lit(1))
    .groupBy('creation_date')
    .sum()
)

In [40]:
# Convert the per-day Spark aggregate into a pandas DataFrame for local analysis.
pd_per_day = nlp_per_day.toPandas()

In [41]:
# Display the per-day sums (one row per creation date, last column is the row count).
pd_per_day

Unnamed: 0,creation_date,sum(nltk_negativity),sum(nltk_neutrality),sum(nltk_positivity),sum(text_blob_polarity),sum(text_blob_subjectivity),sum(nb_bw_matches),sum(nb_hw_matches),sum(hw_ref_intensity),sum(nb_hw_ref_matches),sum(dummy)
0,2017-01-04,188601.709181,1917216.0,344668.157413,22675.253298,91393.0,228173.0,42919.0,3488.480983,5625.0,2454012
1,2017-01-05,199845.992185,1950554.0,349086.348415,23150.745999,92283.0,243531.0,45370.0,6396.886955,10575.0,2503011
2,2017-01-01,152172.005136,1448465.0,276893.661343,16782.196386,67774.0,177666.0,30003.0,2423.016994,3938.0,1881201
3,2017-01-02,169650.015161,1680213.0,308789.881374,19915.737242,79217.0,204315.0,37675.0,3248.180991,5217.0,2162208
4,2017-01-03,186446.544172,1887507.0,342307.846394,21948.237553,89255.0,223111.0,41885.0,3765.949985,5986.0,2419728


In [51]:
# Check the column dtypes before doing arithmetic on the aggregated frame.
pd_per_day.dtypes

creation_date                   object
sum(nltk_negativity)           float64
sum(nltk_neutrality)           float64
sum(nltk_positivity)           float64
sum(text_blob_polarity)        float64
sum(text_blob_subjectivity)    float64
sum(nb_bw_matches)             float64
sum(nb_hw_matches)             float64
sum(hw_ref_intensity)          float64
sum(nb_hw_ref_matches)         float64
sum(dummy)                       int64
dtype: object

In [52]:
# Divide every summed metric (all columns except the first, the date, and the
# last, the row count) by that day's row count, row by row. Despite the name,
# the result holds per-row averages as fractions, not values scaled to 100.
pd_percentage = pd_per_day.iloc[:, 1:-1].divide(
    pd_per_day.iloc[:, -1],
    axis='index',
)

In [53]:
# Display the per-day averages computed from the sums and row counts.
pd_percentage

Unnamed: 0,sum(nltk_negativity),sum(nltk_neutrality),sum(nltk_positivity),sum(text_blob_polarity),sum(text_blob_subjectivity),sum(nb_bw_matches),sum(nb_hw_matches),sum(hw_ref_intensity),sum(nb_hw_ref_matches)
0,0.076854,0.781258,0.140451,0.00924,0.037242,0.09298,0.017489,0.001422,0.002292
1,0.079842,0.779283,0.139467,0.009249,0.036869,0.097295,0.018126,0.002556,0.004225
2,0.080891,0.769968,0.14719,0.008921,0.036027,0.094443,0.015949,0.001288,0.002093
3,0.078461,0.777082,0.142812,0.009211,0.036637,0.094494,0.017424,0.001502,0.002413
4,0.077053,0.780049,0.141465,0.009071,0.036886,0.092205,0.01731,0.001556,0.002474
