In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql import Row,SQLContext
from pyspark.sql.functions import col,desc,substring,lit,udf,length
from pyspark.sql.types import StructType,StructField,IntegerType,DoubleType,StringType
from pyspark.sql.functions import split, explode, monotonically_increasing_id, substring_index
import pyspark.sql.functions as f

In [2]:
import sys
import os
from operator import add
import pandas as pd

In [3]:
from nltk.corpus import stopwords
# Rebinds the name `stopwords` from the nltk corpus reader imported above to
# the plain list of English stop words it returns.
stopwords = stopwords.words('english')
# The empty string is treated as a stop word too: splitting text on a single
# space (as the word counts below do) yields '' for consecutive spaces.
stopwords.append("")

In [4]:
# Build (or reuse) the Spark session for this notebook.
# The plan-string truncation limit is controlled by
# "spark.sql.debug.maxToStringFields" (note the ".sql." segment); that is the
# key Spark itself names in its "Truncated the string representation of a plan"
# warning.
spark = (
    SparkSession.builder
    .appName("hw2")
    .config("spark.sql.debug.maxToStringFields", 100000)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2021-11-21 23:06:35,763 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# SparkContext that backs the session created above.
sc=spark.sparkContext
# NOTE(review): no cell visible in this notebook references `sqlContext`;
# confirm it is still needed before keeping this extra entry point.
sqlContext = SQLContext(sc)



In [6]:
# Root folder of the assignment: inputs are read from `datasets/` below it and
# every result file is written under `outputs/`.
path = "/home/osboxes/hw/hw2/"
# exist_ok=True makes this a no-op when the folder is already there, without a
# separate check-then-create step; missing parent folders are created as well.
os.makedirs(path+"outputs", exist_ok=True)

In [7]:
# Read the news CSV from the local filesystem. The first line is the header,
# column types are inferred, and rows that fail to parse are dropped
# (mode=DROPMALFORMED) instead of aborting the load.
news_csv_uri = "file:"+path+"datasets/News_Final.csv"
df_news = (
    spark.read.format("csv")
    .option("mode", "DROPMALFORMED")
    .option("header", "true")
    .option("inferSchema", 'true')
    .load(news_csv_uri)
)

                                                                                

In [8]:
#pd_df_news = pd.read_csv("datasets/News_Final.csv", encoding = 'utf8')
#df_news = spark.read.csv("file:"+path+"datasets/News_Final.csv",header='true', inferSchema='true')

In [9]:
# Give the columns used later an explicit type: both sentiment scores become
# doubles, and PublishDate is cast to a date and then back to a string
# (presumably to get a day-level text key for the per-day counts; confirm).
df_news = (
    df_news
    .withColumn('SentimentTitle', col('SentimentTitle').cast('double'))
    .withColumn('SentimentHeadline', col('SentimentHeadline').cast('double'))
    .withColumn('PublishDate', col('PublishDate').cast('date').cast('string'))
)

In [10]:
# Number of rows whose Headline is null.
df_news.where(col("Headline").isNull()).count()

                                                                                

15

In [11]:
# Number of rows whose Source is null.
df_news.where(col("Source").isNull()).count()

                                                                                

279

In [12]:
# Replace nulls with the marker text "missing". A string fill value only
# applies to string columns; later cells test PublishDate against this marker.
df_news = df_news.fillna("missing")

In [13]:
# Re-count null Headline values now that the fill has been applied.
df_news.where(col("Headline").isNull()).count()

0

In [14]:
# Re-count null Source values now that the fill has been applied.
df_news.where(col("Source").isNull()).count()

0

# Q1

In [15]:
# Output folder for the Q1 results. makedirs(exist_ok=True) is a no-op when it
# already exists and also creates `outputs/` itself if it is missing.
os.makedirs(path+"outputs/q1", exist_ok=True)

In [16]:
# One-column views of the raw text: one for titles, one for headlines.
df_news_title_total, df_news_head_total = (
    df_news.select(text_column) for text_column in ('Title', 'Headline')
)

In [17]:
def lower_clean_str(x):
    """Return ``x`` lower-cased with every punctuation character removed.

    Punctuation is deleted rather than replaced by a space, so "Obama's"
    becomes "obamas"; whitespace is left untouched.
    """
    punc='!"#”$%&\'()*+—–,./:;<=>?@[\\]^_’‘`{|}~-…'
    # A translation table whose third argument lists the characters to delete.
    drop_punctuation = str.maketrans('', '', punc)
    return x.lower().translate(drop_punctuation)
def handle(df_news_total):
    """Count how often each word occurs in the first column of ``df_news_total``.

    Each row's first field is cleaned with ``lower_clean_str`` and split on
    single spaces. Words found in ``stopwords`` are discarded after counting.

    Returns a DataFrame with columns ``word`` and ``total``, ordered by
    ``total`` descending and then by ``word``.
    """
    cleaned_lines = df_news_total.rdd.map(lambda row: lower_clean_str(row[0]))
    word_ones = cleaned_lines.flatMap(lambda line: line.split(' ')).map(lambda word: (word, 1))
    word_totals = word_ones.reduceByKey(add)
    kept_totals = word_totals.filter(lambda pair: pair[0] not in stopwords)
    counts_df = kept_totals.toDF(("word", "total"))
    return counts_df.orderBy(desc("total"), "word")

In [18]:
rdd_news_title_total = handle(df_news_title_total)
rdd_news_head_total = handle(df_news_head_total)

# (label, word counts, destination CSV) for the two text fields.
q1_total_outputs = [
    ("Title total", rdd_news_title_total, path+"outputs/q1/q1_news_title_total.csv"),
    ("Headline total", rdd_news_head_total, path+"outputs/q1/q1_news_headline_total.csv"),
]

# Spark's own CSV writer produces a folder of part files, so each result is
# collected to pandas and written as a single CSV file instead.
for _label, word_counts, csv_file in q1_total_outputs:
    word_counts.toPandas().to_csv(csv_file,index=False)

for label, word_counts, _csv_file in q1_total_outputs:
    print(label)
    word_counts.show(20)

                                                                                

Title total


                                                                                

+-----------+-----+
|       word|total|
+-----------+-----+
|    economy|26198|
|      obama|22576|
|  microsoft|17570|
|     obamas| 5203|
|         us| 4713|
|  palestine| 3812|
|        new| 3740|
|       says| 3518|
|  president| 3039|
|   economic| 2951|
| microsofts| 2939|
|    windows| 2734|
|       2016| 2428|
|     global| 2197|
|         10| 2150|
|     growth| 1936|
|      trump| 1721|
|palestinian| 1599|
|      china| 1543|
|      could| 1361|
+-----------+-----+
only showing top 20 rows

Headline total


[Stage 33:>                                                         (0 + 3) / 3]

+----------+-----+
|      word|total|
+----------+-----+
|     obama|26004|
|   economy|25211|
| president|22494|
| microsoft|20551|
|    barack|13091|
|        us| 9940|
|      said| 9342|
|       new| 9267|
|  economic| 8942|
|      year| 5687|
|     first| 5073|
|       one| 4902|
|   windows| 4736|
| palestine| 4694|
|    growth| 4633|
|    obamas| 4203|
|      last| 4109|
|      2016| 3984|
|washington| 3937|
|    global| 3833|
+----------+-----+
only showing top 20 rows





In [19]:
# Topic values kept by the per-topic analyses below.
topic_list = [
    'economy',
    'microsoft',
    'obama',
    'palestine',
]

In [20]:
# Column-level wrapper around lower_clean_str; the result is a string column.
lower_clean_str_udf = udf(lower_clean_str, StringType())

In [21]:
#lower word and remove symbol
# Overwrite Title and Headline with their lower-cased, punctuation-free form.
df_news = (
    df_news
    .withColumn('Title', lower_clean_str_udf(col("Title")))
    .withColumn('Headline', lower_clean_str_udf(col("Headline")))
)

In [22]:
# Rows whose Topic is one of the topics of interest.
in_topic_list = df_news["Topic"].isin(topic_list)
# Rows that carry a real publish date ("missing" is the null marker filled in earlier).
has_publish_date = df_news['PublishDate']!="missing"

df_news_cate = df_news.select('Title','Headline', 'Topic').where(in_topic_list)
df_news_date = df_news.select('Title','Headline', 'PublishDate').where(has_publish_date & in_topic_list)

In [23]:
def _count_words_per_topic(text_col, word_col):
    """Count words of ``text_col`` in df_news_cate, grouped by Topic.

    The text is split on single spaces and exploded into ``word_col``; words
    found in ``stopwords`` are dropped. The result has the columns Topic,
    ``word_col`` and count, sorted by Topic ascending and count descending.
    """
    exploded = df_news_cate.withColumn(word_col, f.explode(f.split(f.column(text_col), ' ')))
    without_stopwords = exploded.filter(~exploded[word_col].isin(stopwords))
    per_topic = without_stopwords.groupBy('Topic', word_col).count()
    return per_topic.sort(['Topic','count'],ascending=[True,False])


df_news_title_total_cate = _count_words_per_topic('Title', 'Total_Title_cate')
df_news_head_total_cate = _count_words_per_topic('Headline', 'Total_Headline_cate')

# Spark's own CSV writer produces a folder of part files, so each result is
# collected to pandas and written as a single CSV file instead.
df_news_title_total_cate.toPandas().to_csv(path+"outputs/q1/q1_news_title_total_cate.csv",index=False)
df_news_head_total_cate.toPandas().to_csv(path+"outputs/q1/q1_news_headline_total_cate.csv",index=False)

print("title total by cate")
df_news_title_total_cate.show(10)
print("head total by cate")
df_news_head_total_cate.show(10)

                                                                                

title total by cate


                                                                                

+-------+----------------+-----+
|  Topic|Total_Title_cate|count|
+-------+----------------+-----+
|economy|         economy|24685|
|economy|              us| 2773|
|economy|        economic| 2623|
|economy|          global| 1864|
|economy|          growth| 1747|
|economy|            says| 1626|
|economy|           china| 1204|
|economy|            2016| 1112|
|economy|          chinas| 1074|
|economy|             new|  999|
+-------+----------------+-----+
only showing top 10 rows

head total by cate




+-------+-------------------+-----+
|  Topic|Total_Headline_cate|count|
+-------+-------------------+-----+
|economy|            economy|23931|
|economy|           economic| 8232|
|economy|             growth| 4351|
|economy|               said| 4131|
|economy|                 us| 3605|
|economy|               year| 3381|
|economy|             global| 3075|
|economy|                new| 2754|
|economy|            percent| 2573|
|economy|            quarter| 2183|
+-------+-------------------+-----+
only showing top 10 rows



                                                                                

In [24]:
# Per-day word counts for titles: explode the space-separated words, drop the
# stop words, count per (PublishDate, word), then sort by date ascending and
# count descending.
title_words_by_day = df_news_date.select(
    'PublishDate',
    f.explode(f.split(col('Title'), ' ')).alias('Total_Title_day'),
)
df_news_title_total_date = (
    title_words_by_day
    .filter(~col('Total_Title_day').isin(stopwords))
    .groupBy('PublishDate', 'Total_Title_day')
    .count()
    .sort(['PublishDate','count'],ascending=[True,False])
)

# Same pipeline for headlines.
headline_words_by_day = df_news_date.select(
    'PublishDate',
    f.explode(f.split(col('Headline'), ' ')).alias('Total_Headline_day'),
)
df_news_head_total_date = (
    headline_words_by_day
    .filter(~col('Total_Headline_day').isin(stopwords))
    .groupBy('PublishDate', 'Total_Headline_day')
    .count()
    .sort(['PublishDate','count'],ascending=[True,False])
)

# Spark's own CSV writer produces a folder of part files, so each result is
# collected to pandas and written as a single CSV file instead.
df_news_title_total_date.toPandas().to_csv(path+"outputs/q1/q1_news_title_total_date.csv",index=False)
df_news_head_total_date.toPandas().to_csv(path+"outputs/q1/q1_news_headline_total_date.csv",index=False)

print("title total by day")
df_news_title_total_date.show(10)
print("head total by day")
df_news_head_total_date.show(10)

                                                                                

title total by day


                                                                                

+-----------+---------------+-----+
|PublishDate|Total_Title_day|count|
+-----------+---------------+-----+
| 2002-04-02|         wreath|    1|
| 2002-04-02|       cemetery|    1|
| 2002-04-02|       national|    1|
| 2002-04-02|           lays|    1|
| 2002-04-02|          obama|    1|
| 2002-04-02|      arlington|    1|
| 2008-09-20|           look|    1|
| 2008-09-20|        economy|    1|
| 2008-09-20|         health|    1|
| 2008-09-20|        chinese|    1|
+-----------+---------------+-----+
only showing top 10 rows

head total by day




+-----------+------------------+-----+
|PublishDate|Total_Headline_day|count|
+-----------+------------------+-----+
| 2002-04-02|            wreath|    2|
| 2002-04-02|             obama|    2|
| 2002-04-02|          cemetery|    1|
| 2002-04-02|            barack|    1|
| 2002-04-02|          unknowns|    1|
| 2002-04-02|             honor|    1|
| 2002-04-02|              tomb|    1|
| 2002-04-02|              laid|    1|
| 2002-04-02|         president|    1|
| 2002-04-02|         arlington|    1|
+-----------+------------------+-----+
only showing top 10 rows





# Q2

In [25]:
# Platform names; each one is the prefix of that platform's dataset files.
platform_list = [
    "Facebook",
    "GooglePlus",
    "LinkedIn",
]

In [26]:
# Output folder for the Q2 results. makedirs(exist_ok=True) is a no-op when it
# already exists and also creates `outputs/` itself if it is missing.
os.makedirs(path+"outputs/q2", exist_ok=True)

In [27]:
# Column selections for the platform files: the item id followed by the
# time-slice columns to average.
# "Per day" uses the two slices TS72 and TS144.
header_per_day = ['IDLink', *('TS%d' % slice_no for slice_no in range(72, 145, 72))]
# "Per hour" uses every third slice, TS3 through TS144 (48 columns).
header_per_hour = ['IDLink', *('TS%d' % slice_no for slice_no in range(3, 145, 3))]

In [28]:
def _average_popularity(df_plat, columns):
    """Average the selected time-slice columns of ``df_plat`` per item.

    ``columns`` is the id column followed by the slice columns to use. The
    slice values are summed per id and divided by the number of slice columns.

    Returns a DataFrame with the columns ``ID`` ('ID' + integer id) and
    ``average``, built from rows sorted by id.
    """
    n_slices = len(columns) - 1
    return (
        df_plat.select(columns).rdd.map(list)
        .flatMap(lambda x: ((x[0], element) for element in x[1:]))
        .reduceByKey(add)
        .map(lambda x: (x[0], x[1] / n_slices))
        .sortByKey()
        .map(lambda x: ('ID' + str(int(x[0])), x[1]))
        .toDF(("ID", "average"))
    )


for platform in platform_list:
    df_plat = spark.read.csv("file:"+path+"datasets/"+platform+"_*.csv",header='true', inferSchema='true')
    df_popular_hour = _average_popularity(df_plat, header_per_hour)
    df_popular_day = _average_popularity(df_plat, header_per_day)
    # Spark's own CSV writer produces a folder of part files, so each result is
    # collected to pandas and written as a single CSV file instead.
    # The platform name is part of the file name so that each platform keeps
    # its own result instead of overwriting the previous platform's files.
    df_popular_hour.toPandas().to_csv(path+"outputs/q2/q2_"+platform+"_popular_hour.csv",index=False)
    df_popular_day.toPandas().to_csv(path+"outputs/q2/q2_"+platform+"_popular_day.csv",index=False)
    # show() prints the table itself and returns None, so it is not wrapped in print().
    print(platform+"\nhour\n")
    df_popular_hour.show(10)
    print(platform+"\nday\n")
    df_popular_day.show(10)

                                                                                

Facebook
hour

+----+--------------------+
|  ID|             average|
+----+--------------------+
| ID1|  10.541666666666666|
| ID2|  28.958333333333332|
| ID3|  59.708333333333336|
| ID4|  3.6666666666666665|
| ID5|  20.354166666666668|
| ID6|   99.14583333333333|
| ID7|               9.875|
| ID8|-0.16666666666666666|
| ID9|   8.520833333333334|
|ID10|   73.70833333333333|
+----+--------------------+
only showing top 10 rows

None
Facebook
day

+----+-------+
|  ID|average|
+----+-------+
| ID1|   12.5|
| ID2|   39.0|
| ID3|   90.0|
| ID4|    6.0|
| ID5|   31.0|
| ID6|  127.0|
| ID7|   12.0|
| ID8|    0.0|
| ID9|   12.0|
|ID10|  116.0|
+----+-------+
only showing top 10 rows

None


                                                                                

GooglePlus
hour

+----+--------------------+
|  ID|             average|
+----+--------------------+
| ID1|               0.375|
| ID2|-0.04166666666666...|
| ID3|  15.229166666666666|
| ID4|              0.0625|
| ID5|             -0.3125|
| ID6|               36.75|
| ID7|-0.02083333333333...|
| ID8|-0.16666666666666666|
| ID9|                 0.5|
|ID10|  1.1666666666666667|
+----+--------------------+
only showing top 10 rows

None
GooglePlus
day

+----+-------+
|  ID|average|
+----+-------+
| ID1|    0.5|
| ID2|    0.0|
| ID3|   22.5|
| ID4|    0.5|
| ID5|    0.0|
| ID6|   47.0|
| ID7|    0.0|
| ID8|    0.0|
| ID9|    1.0|
|ID10|    2.0|
+----+-------+
only showing top 10 rows

None


                                                                                

LinkedIn
hour

+----+--------------------+
|  ID|             average|
+----+--------------------+
| ID1|  1.4791666666666667|
| ID2|-0.04166666666666...|
| ID3|             -0.3125|
| ID4|   6.729166666666667|
| ID5|             -0.3125|
| ID6|              70.125|
| ID7|-0.02083333333333...|
| ID8|-0.16666666666666666|
| ID9|   7.604166666666667|
|ID10|  13.166666666666666|
+----+--------------------+
only showing top 10 rows

None
LinkedIn
day

+----+-------+
|  ID|average|
+----+-------+
| ID1|    2.0|
| ID2|    0.0|
| ID3|    0.0|
| ID4|   10.0|
| ID5|    0.0|
| ID6|   99.5|
| ID7|    0.0|
| ID8|    0.0|
| ID9|   11.0|
|ID10|   27.5|
+----+-------+
only showing top 10 rows

None


# Q3

In [29]:
# Output folder for the Q3 results. makedirs(exist_ok=True) is a no-op when it
# already exists and also creates `outputs/` itself if it is missing.
os.makedirs(path+"outputs/q3", exist_ok=True)

In [30]:
def handle_senti(type1,type2):
    """Write and display the per-topic sum and average of a sentiment column.

    Args:
        type1: Label of the text field ("Title" or "Headline" in this
            notebook); it is used in the output file name.
        type2: Name of the sentiment column of ``df_news`` to aggregate.

    Only rows whose Topic is in ``topic_list`` are used. The result (columns
    topic, sum, average; one row per topic, ordered by topic) is written to
    ``outputs/q3/q3_<type1>_sentiment_score.csv`` and printed. Returns None.
    """
    # Both statistics come from a single aggregation, so each row's sum and
    # average are guaranteed to belong to the same topic and the data is
    # scanned and collected once.
    senti_by_topic = (
        df_news.where(df_news["Topic"].isin(topic_list))
        .groupBy("Topic")
        .agg(f.sum(type2).alias("sum"), f.avg(type2).alias("average"))
        .orderBy("Topic")
        .withColumnRenamed("Topic", "topic")
    )
    pdf_news_senti_out = senti_by_topic.toPandas()
    # Spark's own CSV writer produces a folder of part files, so the small
    # result is written as a single CSV file through pandas instead.
    pdf_news_senti_out.to_csv(path+"outputs/q3/q3_"+type1+"_sentiment_score.csv", index=False)
    print(type2)
    # Display the already-collected result rather than re-running the aggregation.
    spark.createDataFrame(pdf_news_senti_out).show(10)

In [31]:
# Sentiment summary for titles first, then for headlines.
for text_field, sentiment_column in (
    ("Title", "SentimentTitle"),
    ("Headline", "SentimentHeadline"),
):
    handle_senti(text_field, sentiment_column)

                                                                                

SentimentTitle
+---------+-------------------+--------------------+
|    topic|                sum|             average|
+---------+-------------------+--------------------+
|  economy| -336.9370044373348|-0.01047591967283...|
|microsoft|  49.43849052234931| 0.00231042576513456|
|    obama|-15.743686315455399|-5.81484259111926...|
|palestine|-164.48440896913743|-0.01986526678371...|
+---------+-------------------+--------------------+



                                                                                

SentimentHeadline
+---------+-------------------+--------------------+
|    topic|                sum|             average|
+---------+-------------------+--------------------+
|  economy|-1271.3909442082195|-0.03952961304008...|
|microsoft|-318.81900083682035|-0.01489947662570...|
|    obama| -481.8388358106563|-0.01779644822938712|
|palestine|-363.16995277670947|-0.04386110540781...|
+---------+-------------------+--------------------+



# Q4

In [32]:
# Output folder for the Q4 results. makedirs(exist_ok=True) is a no-op when it
# already exists and also creates `outputs/` itself if it is missing.
os.makedirs(path+"outputs/q4", exist_ok=True)

In [33]:
# Words to keep when filtering a sentence; handle_type_co_occurrence rebinds
# this module-level name before remove_others is applied.
list_top100 = []
def remove_others(string):
    """Keep only the words of ``string`` that appear in ``list_top100``.

    The text is lower-cased, every punctuation character and space is turned
    into a comma, and the text is split on commas. Tokens found in
    ``list_top100`` are returned comma-joined in their original order
    (repeats included); if none is found the marker "No Values" is returned.
    """
    punc='!"#”$%&\'()*+—–./:;<=>?@[\\]^_’ ‘`{|}~-…'
    # One translation pass that maps each separator character to a comma.
    separators_to_commas = str.maketrans(punc, ',' * len(punc))
    tokens = string.lower().translate(separators_to_commas).split(',')
    kept = [token for token in tokens if token in list_top100]
    return ','.join(kept) if kept else "No Values"
def handle_type_co_occurrence(df_word_count,df_news_type,topic,type1):
    """Build, save and print a word co-occurrence matrix for one topic and text field.

    Args:
        df_word_count: DataFrame with a ``Total_<type1>_cate`` column; its
            first 100 values, in row order, become the vocabulary. In this
            notebook the caller passes per-topic counts sorted by count
            descending.
        df_news_type: DataFrame holding the text column named ``type1``.
        topic: Topic label, used in the output file name and printed title.
        type1: Text column name ("Title" or "Headline" in this notebook).

    Side effects: rebinds the module-level ``list_top100``, writes
    ``outputs/q4/q4_<type1>_<topic>_co-occurrence_matrix.csv`` under ``path``
    and prints the first 20 rows of the matrix. Returns None.
    """
    global list_top100
    # NOTE(review): the whole column is collected to pandas before the first
    # 100 entries are sliced off.
    list_top100 = list(df_word_count.select('Total_'+type1+'_cate').toPandas()['Total_'+type1+'_cate'][0:100])

    # The UDF is created after list_top100 has been set, because remove_others
    # reads that global.
    remove_others_udf = udf(remove_others)

    # Reduce every sentence to its vocabulary words (comma-joined) and drop the
    # sentences that contain none of them.
    df_news_type_new = df_news_type.withColumn("New_sentence", remove_others_udf(df_news_type[type1])).select("New_sentence").where(col('New_sentence') != "No Values")
    
    # One row per (sentence id, word); the exploded column is named "col".
    tmp_df = (df_news_type_new.withColumn("id", monotonically_increasing_id()).select("id", f.explode(f.split("New_sentence", ","))))
    # Joining the word list with itself on the sentence id pairs up the words of
    # each sentence (a word is also paired with itself); crosstab then counts
    # every (col_, col) pair.
    df_occurrence_matrix = tmp_df.withColumnRenamed("col", "col_").join(tmp_df, ["id"]).stat.crosstab("col_", "col")
    # Spark's own CSV writer produces a folder of part files, so the matrix is
    # collected to pandas and written as a single CSV file instead.
    #df_occurrence_matrix.coalesce(1).write.mode('overwrite').option("header",True).csv("file://"+path+"outputs/q4/q4_"+type1+"_"+topic+"_co-occurrence_matrix")
    df_occurrence_matrix.toPandas().to_csv(path+"outputs/q4/q4_"+type1+"_"+topic+"_co-occurrence_matrix.csv", index=False)
    print("Matrix: "+topic+" "+type1)
    df_occurrence_matrix.show(20)

In [34]:
# For every topic, build the Title and the Headline co-occurrence matrix from
# that topic's rows and that topic's word counts.
for topic in topic_list:
    print(topic)
    is_current_topic = col('Topic') == topic
    tmp_df_title = df_news_cate.select('Title').where(is_current_topic)
    tmp_df_head = df_news_cate.select('Headline').where(is_current_topic)
    title_word_counts = df_news_title_total_cate.where(df_news_title_total_cate["Topic"] == topic)
    head_word_counts = df_news_head_total_cate.where(df_news_head_total_cate["Topic"] == topic)
    handle_type_co_occurrence(title_word_counts, tmp_df_title, topic, 'Title')
    handle_type_co_occurrence(head_word_counts, tmp_df_head, topic, 'Headline')

economy


2021-11-21 23:10:37,978 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Matrix: economy Title
+---------+----+----+-----+----+----+---+----+------+---+-----+------+--------+-------+-----+------+-------+-----+------+----+-------+-------+--------+-------+--------+---+--------+---+-----+-----+----+---+------+---+------+----+----------+----+----+-------+-----+------+----+---+---+------+-----+------+-----+------+----+-----+---+------+-------+---+--------+---+-----+---+-------+---+-------+----+------+---------+------+-------+----+-----+---------+------+-----+-------+-----+----+----+-------+-----+-----+-------+-----+-----+--------+-----+-----+------+------+---+---+-----+-----+---+----+-----+----+-----+-----+----+-----+
| col__col|2015|2016|ahead|amid|back|bad|bank|better|big|boost|brexit|business|central|china|chinas|chinese|could|crisis|data|despite|digital|economic|economy|economys| eu|expected|fed|first|focus|fuel|gdp|german|gig|global|good|government|grew|grow|growing|grows|growth|help|hit|imf|impact|india|indian|japan|japans|jobs|local|low|market|markets|may

  df[column_name] = series


Matrix: economy Headline
+---------+----+----+---------+----+----+--------+----+-------+-----+------+-------+-----+-------+--------+----+-------+-----------+--------+--------+---------+-------+--------+--------+-------+-------+---------+-----+------+------+----------+----+----+-------+------+------+--------+-------------+----------+----+----+---+----+------+-------+---+--------+------+-----+------+--------+----+----+----+---+-------+----+------+----+-------+------+---------+------+-----+-------+----+-----+------+---------+------+-------+-------+----+----+------+------+------+-----+--------+--------+-----+------+-----+------+-----+-----+--------+----+-----+-------+---+---+-----+----+---------+----+-----+------+-----+----+-----+
| col__col|2015|2016|according|also|bank|business|cent|central|china|chinas|chinese|could|country|countrys|data|despite|development|domestic|economic|economies|economy|european|expected|federal|finance|financial|first|friday|global|government|grew|grow|growing|gr

                                                                                

Matrix: microsoft Title
+----------+---+---+----+---+---+---+---+---+----+---+------+-------+---------+---+-----+----+---------+-----+----+---+-------+----+-----+--------+---+---+-----+-----------+-------+-----+----+----+--------+----+----------+--------+-----+----+----+-----+---+----+------+----+-----+--------+--------+---+------+------+--------+--------+----+-----+----+---+---------+----------+------+----+-------+---+----+------+---+----+---+-----+-----+-------+---+-------+-------+--------+------+-----+----+--------+------+-------+--------+-----+--------+-----+-----+-------+-------+----+----------+---+------+-------+-------+---+-----+-----+---+-----+-------+----+
|  col__col| 10|  2|2016|  3|365|  4|  5|950|adds| ai|amazon|android|announces|app|apple|apps|available|azure|band|big|billion|book|build|business|buy|ceo|cloud|corporation|cortana|could|data|deal|dynamics|edge|enterprise|facebook|first|free|game|games|get|gets|google|help|heres|hololens|internet|ios|iphone|latest|launches|l

                                                                                

Matrix: microsoft Headline
+---------+---+----+----+---+---+---------+----+-------+---------+---+-----+----+---------+-----+---+-------+-----+--------+---+-----+---------+-------+--------+----------+----+-----+---------+----+----------+------+-------+----+--------+-----+----+----+-----+---+-----+------+----+--------+--------+----+------+--------+----+-----+----+----+----+------+---+---------+----------+-------+------+----+-------+---+----+------+---+------+---------+------+-----+--------+----+--------+-------+--------+----+-----+--------+-------+--------+--------+-----+-------+-------+------+----+----------+----+-----+---+------+---+---+-----+-----+-------+----+-------+-----+-----+----+----+-----+
| col__col| 10|2015|2016|365|  4|according|also|android|announced|app|apple|apps|available|azure|big|billion|build|business|ceo|cloud|companies|company|companys|conference|corp|could|customers|data|developers|device|devices|edge|features|first|free|game|games|get|giant|google|help|hololens|in

                                                                                

Matrix: obama Title
+-----------+----+------+-------+--------------+-------+---------+------+-------+----+------+----+------+------+-----+--------+------+-------+-------+--------+-------+-----+-----+----+----+----+------+---------+-----+-----+-----+---+---+----+----+-------+---------+--------+-----+-----------+----+----+-------+----+---+-------+------+----+-----+----+--------+--------+---+----+-------+-------+-----+------+---+-------+-----+----+----+-----+------+---------+----------+-----------+-------+----+--------+------+--------+----+-----+------+-----+------+-------+-----+---+-----+------+-----+----+-------+-----+------+-----+-----+---+-----+-----+---+-----+-----+----+-----+-----+----+----+
|   col__col|2016|action|address|administration|america|americans|attack|attacks|back|barack|bill|brexit|budget|calls|campaign|change|climate|clinton|congress|control|could|court|cruz|cuba|deal|donald|executive|fight|final|first|gop|gun|guns|help|hillary|hiroshima|historic|house|immigration|iran

                                                                                

Matrix: obama Headline
+-----------+----+-------+--------------+--------+---------+---------+----+------+------+--------+------+-------+-------+---+----------+--------+-----+-------+-----+----+---+----------+------+---------+-------+-----+-----+------+------+---+----+-------+-----+-------+-------+----+----+---+-------+----+----+---+-------+--------+--------+------+--------+-------+----+----+----+-------+-------+-----+------+------+---+-----+----+------+----+------+---------+---------+------------+-----+-----+----------+-----------+----+--------+------+----+--------+------+------+-----+------+------+------+-------+--------+----+----+-----+-------+---+-----+------+----+---+-----+----------+---------+----+-----+-----+-----+----+-----+
|   col__col|2016|address|administration|american|americans|announced|  ap|barack|called|campaign|change|climate|clinton|cnn|conference|congress|could|country|court|cuba|day|democratic|donald|executive|federal|final|first|former|friday|gun|help|hillary|house

                                                                                

Matrix: palestine Title
+-----------+----+----+---+-----+----+--------+----+----+----+-----+--------+----+----------+--------+---+---+----+----+---+-----+-----+--------+------+------+------+----+-----+-----+-----+----+----+----+-----+-------------+------+-------+------------------+---------------+-------+---------+------+-------+------+---+----+------+------+---+-----+------+--------+---------+---+----------+---+---------+----------+-----------+------------+----------+-----+------+---------+------------+---------+--------+------+------+-------+----+------+--------+----+------+----------+--------+-----+--------+-------+------+------+-----+-----+----+-----+-----+-----+---+---+---+----------+-----+---+-----+---+----+----+-----+-----+
|   col__col|2015|2016|  4|abbas|arab|arrested|bank|boys|call|calls|children|city|conference|conflict|day| de|dead|east| eu|first|flood|flooding|forces|france|french|gaza|greek|group|hamas|help|high|home|human|international|israel|israeli|israelipalestinian|i

                                                                                

Matrix: palestine Headline
+------------+----+----+-----+---------+-------+------+----+---------+----+------+------+----+--------+---+---+----------+----+-----+------+-------+------+----+-------+----------+-----+-------------+-------+------+-------+------------------+---------------+-------+-------+---------+------+------+----+----+----------+-------+---+-----+---+-----+-------+------+-------+--------+------+-------+----+---+----+----------+--------+---+------------+---------+----------+-----------+------------+----+-----+------+------+---------+-----+--------+------+----+--------+----+------+--------+-------+-----+----------+-----+--------+------+-------+-----+-----+--------+----+-----+-------+---+---+------+----------+---+-----+----+---------+----+----+-----+----+-----+
|    col__col|2015|2016|abbas|according|affairs|agency|arab|authority|bank|called|center|city|conflict|day| de|department|east|first|forces|foreign|friday|gaza|general|government|group|international|islamic|israel|isr