In [1]:
!pip install nltk



In [2]:
from pyspark.sql import SparkSession

spark = SparkSession\
       .builder\
       .appName("test")\
       .getOrCreate()

21/08/27 03:27:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
df = spark.read.json("hdfs://namenode:8020/tmp/data/covid")

                                                                                

In [4]:
df.show()

+--------------------+
|               value|
+--------------------+
|{"userID": 724245...|
|{"userID": 838192...|
|{"userID": 985121...|
|{"userID": 852189...|
|{"userID": 119300...|
|{"userID": 134640...|
|{"userID": 154605...|
|{"userID": 824736...|
|{"userID": 171610...|
|{"userID": 262104...|
|{"userID": 657673...|
|{"userID": 901077...|
|{"userID": 852199...|
|{"userID": 132245...|
|{"userID": 556794...|
|{"userID": 805462...|
|{"userID": 603528...|
|{"userID": 391598...|
|{"userID": 132735...|
|{"userID": 314405...|
+--------------------+
only showing top 20 rows



In [5]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

schema= df.select(F.schema_of_json("""{
   "userID": 724245818299969500,
   "tweetText": "Text",
   "hashTags": [
      "propaganda",
      "China",
      "ChinaLiedPeopleDied",
      "COVID",
      "ChinaVirus"
   ],
   "location_full_name": "Merica",
   "favoriteCount": 0,
   "reTweetCount": 0,
   "created_at": "Fri Aug 27 03:21:20 +0000 2021"
}""")).collect()[0][0]

In [6]:
schema

'STRUCT<`created_at`: STRING, `favoriteCount`: BIGINT, `hashTags`: ARRAY<STRING>, `location_full_name`: STRING, `reTweetCount`: BIGINT, `tweetText`: STRING, `userID`: BIGINT>'

In [7]:
df = df.withColumn("value", F.from_json("value",schema))\
.select("value.userID", "value.tweetText", "value.hashTags", "value.location_full_name",
        "value.favoriteCount", "value.reTweetCount", "value.created_at",)

In [8]:
df.show()

+-------------------+--------------------+--------------------+--------------------+-------------+------------+--------------------+
|             userID|           tweetText|            hashTags|  location_full_name|favoriteCount|reTweetCount|          created_at|
+-------------------+--------------------+--------------------+--------------------+-------------+------------+--------------------+
| 724245818299969537|Claiming that's i...|[propaganda, Chin...|             'Merica|            0|           0|Fri Aug 27 03:21:...|
| 838192517791174657|RT @AntonyRobart:...|[VaccinePassports...|              Canada|            0|           0|Fri Aug 27 03:21:...|
| 985121348690366464|RT @EuroELSO: Lat...|     [ECMO, COVID19]|        L'Hospitalet|            0|           0|Fri Aug 27 03:21:...|
|           85218923|#Most worrisome i...|[Most, SouthKorea...|Jeju Island, Sout...|            0|           0|Fri Aug 27 03:21:...|
|         1193007020|RT @DrAmbrishMith...|[VaccinationDrive...|      

# Country statistic of covid tweet

In [9]:
df.groupBy('location_full_name').count().orderBy('count', ascending=False).show()



+--------------------+-----+
|  location_full_name|count|
+--------------------+-----+
|               India|   10|
|       United States|    8|
|     Los Angeles, CA|    5|
|              Canada|    5|
| Waikato New Zealand|    4|
|      United Kingdom|    2|
|         Chicago, IL|    2|
|    Toronto, Ontario|    2|
|    Edmonton Alberta|    2|
|          Dallas, TX|    2|
|Tondiarpet, VadaC...|    2|
|            New York|    2|
|    New Delhi, India|    2|
|Jeju Island, Sout...|    2|
|     Washington, USA|    2|
|     California, USA|    2|
|            Malaysia|    2|
|           Australia|    2|
|Auckland, New Zea...|    2|
|     Toronto, Canada|    2|
+--------------------+-----+
only showing top 20 rows





# Hashtags statistic 

In [10]:
df.select(F.explode(df.hashTags).alias('tag'))\
    .groupBy(F.col('tag').alias('key'))\
    .count()\
    .orderBy('count', ascending=False)\
    .show()



+--------------------+-----+
|                 key|count|
+--------------------+-----+
|             COVID19|  149|
|               COVID|   20|
|             Covid19|   16|
|             covid19|   15|
|               Spain|    6|
|         coronavirus|    6|
|    VaccinePassports|    6|
|               Covid|    6|
|             vaccine|    4|
|               covid|    4|
|           coviddata|    3|
|            covid_19|    3|
|        smcanalytics|    3|
|covidvislualizations|    3|
|         Afghanistan|    3|
|              Odisha|    3|
|           WearAMask|    3|
|                 USA|    3|
|      MedicareForAll|    2|
|              Kerala|    2|
+--------------------+-----+
only showing top 20 rows



                                                                                

# Clean text and remove stopwords

In [11]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer

In [12]:
# Clean and remove hashtag
df_clean = df.select('tweetText', (F.lower(F.regexp_replace('tweetText', "@[A-Za-z0-9_]+", "")).alias('text')))

# Tokenize text
tokenizer = Tokenizer(inputCol='tweetText', outputCol='words_token')
df_words_token = tokenizer.transform(df_clean).select('words_token')

# Remove stop words
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
df_words_no_stopw = remover.transform(df_words_token).select('words_clean')

# Stem text
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
df_stemmed = df_words_no_stopw.withColumn("words_stemmed", stemmer_udf("words_clean")).select('words_stemmed')

# Filter length word > 3
filter_length_udf = F.udf(lambda row: [x for x in row if len(x) >= 3], ArrayType(StringType()))
df_final_words = df_stemmed.withColumn('words', filter_length_udf(F.col('words_stemmed')))

In [13]:
df_clean.show()

+--------------------+--------------------+
|           tweetText|                text|
+--------------------+--------------------+
|Claiming that's i...|claiming that's i...|
|RT @AntonyRobart:...|rt : the debate o...|
|RT @EuroELSO: Lat...|rt : latest updat...|
|#Most worrisome i...|#most worrisome i...|
|RT @DrAmbrishMith...|rt : start the da...|
|@KavalAuthorActs ...| take ivermectin ...|
|RT @SantaKlauSchw...|rt : a lot of goo...|
|RT @SamirShahMD: ...|rt : 💸😟financia...|
|RT @mustafahirji:...|rt : but if cases...|
|@samanthamaiden @...|    sydney centri...|
|RT @AntonyRobart:...|rt : the debate o...|
|@Debceecee @NSWHe...|  journalists att...|
|RT @JulesBoykoff:...|rt : ❗️198 people...|
|RT @SarfarazTooba...|rt : #delaymdcat2...|
|@RepJayapal @USPr...|      i can think...|
|Thanks to Mo @CMO...|thanks to mo  for...|
|RT @Lasterbosire:...|rt : take note,  ...|
|The headline of t...|the headline of t...|
|@WHO  care about ...|  care about smal...|
|RT @portarican_RT...|rt : you bla

In [14]:
df_final_words.show()

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------+--------------------+
|       words_stemmed|               words|
+--------------------+--------------------+
|[claim, western, ...|[claim, western, ...|
|[rt, @antonyrobar...|[@antonyrobart:, ...|
|[rt, @euroelso:, ...|[@euroelso:, late...|
|[#most, worrisom,...|[#most, worrisom,...|
|[rt, @drambrishmi...|[@drambrishmithal...|
|[@kavalauthoract,...|[@kavalauthoract,...|
|[rt, @santaklausc...|[@santaklauschwab...|
|[rt, @samirshahmd...|[@samirshahmd:, ?...|
|[rt, @mustafahirj...|[@mustafahirji:, ...|
|[@samanthamaiden,...|[@samanthamaiden,...|
|[rt, @antonyrobar...|[@antonyrobart:, ...|
|[@debceece, @nswh...|[@debceece, @nswh...|
|[rt, @julesboykof...|[@julesboykoff:, ...|
|[rt, @sarfaraztoo...|[@sarfaraztooba1:...|
|[@repjayap, @uspr...|[@repjayap, @uspr...|
|[thank, mo, @cmo_...|[thank, @cmo_odis...|
|[rt, @lasterbosir...|[@lasterbosire:, ...|
|[headlin, new, ca...|[headlin, new, ca...|
|[@who, , care, sm...|[@who, care, smal...|
|[rt, @portarican_...|[@portaric

                                                                                

# Statistic top words

In [15]:
df_final_words.select(F.explode(df_final_words.words).alias('words'))\
    .groupBy(F.col('words').alias('key'))\
    .count()\
    .orderBy('count', ascending=False)\
    .show()



+---------------+-----+
|            key|count|
+---------------+-----+
|       #covid19|  166|
|          death|   58|
|          blame|   45|
|         vaccin|   37|
|          &amp;|   31|
|          state|   28|
|            new|   27|
|         #covid|   23|
|        control|   23|
|           the…|   23|
|       governor|   23|
|            red|   22|
|           case|   22|
|          away,|   22|
|          biden|   22|
|@portarican_rt:|   22|
|           mile|   22|
|           8000|   22|
|         pandem|   15|
|          daili|   15|
+---------------+-----+
only showing top 20 rows



