In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 1.3 MB/s eta 0:00:01
[?25hCollecting regex
  Downloading regex-2021.8.28-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (759 kB)
[K     |████████████████████████████████| 759 kB 13.0 MB/s eta 0:00:01
Installing collected packages: regex, nltk
Successfully installed nltk-3.6.2 regex-2021.8.28


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession\
       .builder\
       .appName("test")\
       .getOrCreate()

21/09/09 07:21:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
df = spark.read.json("hdfs://namenode:8020/tmp/data/covid")

                                                                                

In [3]:
df.show()

+--------------------+
|               value|
+--------------------+
|{"userID": 262576...|
|{"userID": 136715...|
|{"userID": 291569...|
|{"userID": 801579...|
|{"userID": 621779...|
|{"userID": 611532...|
|{"userID": 125575...|
|{"userID": 263616...|
|{"userID": 134378...|
|{"userID": 228417...|
|{"userID": 295693...|
|{"userID": 101110...|
|{"userID": 136117...|
|{"userID": 868380...|
|{"userID": 240252...|
|{"userID": 952247...|
|{"userID": 487300...|
|{"userID": 847922...|
|{"userID": 121663...|
|{"userID": 124078...|
+--------------------+
only showing top 20 rows



In [4]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

schema= df.select(F.schema_of_json("""{
   "userID": 724245818299969500,
   "tweetText": "Text",
   "hashTags": [
      "propaganda",
      "China",
      "ChinaLiedPeopleDied",
      "COVID",
      "ChinaVirus"
   ],
   "location_full_name": "Merica",
   "favoriteCount": 0,
   "reTweetCount": 0,
   "created_at": "Fri Aug 27 03:21:20 +0000 2021"
}""")).collect()[0][0]

In [5]:
schema

'STRUCT<`created_at`: STRING, `favoriteCount`: BIGINT, `hashTags`: ARRAY<STRING>, `location_full_name`: STRING, `reTweetCount`: BIGINT, `tweetText`: STRING, `userID`: BIGINT>'

In [6]:
df = df.withColumn("value", F.from_json("value",schema))\
.select("value.userID", "value.tweetText", "value.hashTags", "value.location_full_name",
        "value.favoriteCount", "value.reTweetCount", "value.created_at",)

In [8]:
df.show()

+-------------------+--------------------+--------------------+--------------------+-------------+------------+--------------------+
|             userID|           tweetText|            hashTags|  location_full_name|favoriteCount|reTweetCount|          created_at|
+-------------------+--------------------+--------------------+--------------------+-------------+------------+--------------------+
|1011102199756148737|RT @XRPisOurFutur...|           [COVID19]|              Canada|            0|           0|Thu Sep 09 04:05:...|
|1361179981628526602|RT @FreedomIsrael...|           [COVID19]|             Romania|            0|           0|Thu Sep 09 04:05:...|
| 868380317039591425|Wednesday 8/9/21 ...|[CoWIN_Dashboard,...|   Deccans of India |            0|           0|Thu Sep 09 04:05:...|
|           24025273|#Hospitals are no...|[Hospitals, COVID...|St. Pete, FL, #Ra...|            0|           0|Thu Sep 09 04:05:...|
| 952247125185720320|RT @LindaOsborne6...|[Hospitals, COVID19]|      

# Country statistic of covid tweet

In [7]:
df.groupBy('location_full_name').count().orderBy('count', ascending=False).show()



+--------------------+-----+
|  location_full_name|count|
+--------------------+-----+
| Melbourne, Victoria|    3|
|           Australia|    2|
|               India|    2|
|        Delhi, India|    2|
|       Paris, France|    2|
|              Africa|    2|
|Johannesburg, Sou...|    2|
|        ringwood, uk|    1|
|Women's & Gay's E...|    1|
|             Wantage|    1|
|              Europe|    1|
|Far East. HN in A...|    1|
|              Ottawa|    1|
|  Madurai, tamilnadu|    1|
|           Sri Lanka|    1|
|              Berlin|    1|
|Florence, Oregon ...|    1|
|      Hanoi, Vietnam|    1|
| 🇨🇦Ⓑ.Ⓒ. ⒸⒶⓃⒶⒹⒶ🇨🇦|    1|
|            Hartwell|    1|
+--------------------+-----+
only showing top 20 rows





# Hashtags statistic 

In [8]:
df.select(F.explode(df.hashTags).alias('tag'))\
    .groupBy(F.col('tag').alias('key'))\
    .count()\
    .orderBy('count', ascending=False)\
    .show()



+--------------------+-----+
|                 key|count|
+--------------------+-----+
|             COVID19|   31|
|         coronavirus|    8|
|             Covid19|    5|
|africansarenotlab...|    4|
|             covid19|    4|
|               covid|    4|
|               India|    4|
|         Coronavirus|    4|
|artificialintelli...|    3|
|               COVID|    3|
|             vaccine|    3|
|          COVID19Aus|    3|
|             bigdata|    2|
|           Hospitals|    2|
|               Covid|    2|
|          CovidIndia|    2|
|              auspol|    2|
|             standby|    2|
|NoToCoronaVirusVa...|    2|
|               Thane|    1|
+--------------------+-----+
only showing top 20 rows





# Clean text and remove stopwords

In [9]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from nltk.stem.snowball import SnowballStemmer

In [10]:
# Clean and remove hashtag
df_clean = df.select('tweetText', (F.lower(F.regexp_replace('tweetText', "@[A-Za-z0-9_]+", "")).alias('text')))

# Tokenize text
tokenizer = Tokenizer(inputCol='tweetText', outputCol='words_token')
df_words_token = tokenizer.transform(df_clean).select('words_token')

# Remove stop words
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
df_words_no_stopw = remover.transform(df_words_token).select('words_clean')

# Stem text
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens], ArrayType(StringType()))
df_stemmed = df_words_no_stopw.withColumn("words_stemmed", stemmer_udf("words_clean")).select('words_stemmed')

# Filter length word > 3
filter_length_udf = F.udf(lambda row: [x for x in row if len(x) >= 3], ArrayType(StringType()))
df_final_words = df_stemmed.withColumn('words', filter_length_udf(F.col('words_stemmed')))

In [11]:
df_clean.show()

+--------------------+--------------------+
|           tweetText|                text|
+--------------------+--------------------+
|RT @DrEricDing: W...|rt : wow—“every k...|
|We've crossed the...|we've crossed the...|
|⭕️ #Georgia repor...|⭕️ #georgia repor...|
|https://t.co/wQoL...|https://t.co/wqol...|
|RT @NewIndianXpre...|rt : a new sublin...|
|We urge all #work...|we urge all #work...|
|Islamabad: Daily ...|islamabad: daily ...|
|RT @CrabbBrendan:...|rt : brilliant pi...|
|RT @MissStixy: #J...|rt : #janenehosko...|
|I’ve reached that...|i’ve reached that...|
|#Coronavirus: #Th...|#coronavirus: #th...|
|RT @XRPisOurFutur...|rt : "the #covid1...|
|RT @FreedomIsrael...|rt : this is how ...|
|Wednesday 8/9/21 ...|wednesday 8/9/21 ...|
|#Hospitals are no...|#hospitals are no...|
|RT @LindaOsborne6...|rt : #hospitals a...|
|RT @DataDrivenMD:...|rt : more than a ...|
|RT @SatyendarJain...|rt : delhi govt h...|
|RT @rameshlaus: N...|rt : new #covid c...|
|RT @kr3at: 🚨REPO...|rt : 🚨repor

In [12]:
df_final_words.show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|       words_stemmed|               words|
+--------------------+--------------------+
|[rt, @drericding:...|[@drericding:, wo...|
|[cross, 14, milli...|[cross, million, ...|
|[⭕️, #georgia, re...|[#georgia, report...|
|[https://t.co/wqo...|[https://t.co/wqo...|
|[rt, @newindianxp...|[@newindianxpress...|
|[urg, #worker, fa...|[urg, #worker, fa...|
|[islamabad:, dail...|[islamabad:, dail...|
|[rt, @crabbbrenda...|[@crabbbrendan:, ...|
|[rt, @missstixy:,...|[@missstixy:, #ja...|
|[i'v, reach, poin...|[i'v, reach, poin...|
|[#coronavirus:, #...|[#coronavirus:, #...|
|[rt, @xrpisourfut...|[@xrpisourfuture:...|
|[rt, @freedomisra...|[@freedomisrael_:...|
|[wednesday, 8/9/2...|[wednesday, 8/9/2...|
|[#hospit, forc, r...|[#hospit, forc, r...|
|[rt, @lindaosborn...|[@lindaosborne60:...|
|[rt, @datadrivenm...|[@datadrivenmd:, ...|
|[rt, @satyendarja...|[@satyendarjain:,...|
|[rt, @rameshlaus:...|[@rameshlaus:, ne...|
|[rt, @kr3at:, 🚨r...|[@kr3at:, 🚨

                                                                                

# Statistic top words

In [13]:
df_final_words.select(F.explode(df_final_words.words).alias('words'))\
    .groupBy(F.col('words').alias('key'))\
    .count()\
    .orderBy('count', ascending=False)\
    .show()



+--------------------+-----+
|                 key|count|
+--------------------+-----+
|            #covid19|   33|
|               &amp;|   13|
|                 new|   10|
|              vaccin|   10|
|        #coronavirus|    9|
|              43,263|    8|
|                last|    8|
|              #covid|    7|
|              report|    6|
|               death|    6|
|               reach|    5|
|                case|    5|
|             #vaccin|    5|
|              cases,|    5|
|#africansarenotla...|    4|
|              #india|    4|
|               total|    4|
|             african|    4|
|                live|    4|
|              infect|    4|
+--------------------+-----+
only showing top 20 rows



                                                                                