# LOAD LIBRARY

In [1]:
!pip install swifter



In [0]:
import json
import time
import os
import re
import swifter
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm.autonotebook import tqdm

pd.set_option('display.max_colwidth', -1)
pd.options.display.max_columns = None


In [3]:
!ls -al

total 16
drwxr-xr-x 1 root root 4096 Mar  3 18:11 .
drwxr-xr-x 1 root root 4096 Mar 14 09:34 ..
drwxr-xr-x 1 root root 4096 Mar  3 18:11 .config
drwxr-xr-x 1 root root 4096 Mar  3 18:11 sample_data


# LOAD DATA

In [4]:
df = pd.read_csv('TwitterData.csv')
df = df[df['lang'] == 'en']

print('Data count :', len(df))
display(df.head(3))

Data count : 203812


Unnamed: 0,created_at,twitter_id,text,user_id,user_name,screen_name,followers_count,friends_count,profile_image,retweet_count,favorite_count,lang,latitude,longitude,place_type,place_name,place_full_name,country_code,country,disaster_type,date
1,2019-08-01 09:04:50,1156853283950141444,"RT @lj1105: I’m getting static on my posting of the Bible stance on Homosexuality. Make no mistake,,Homosexuals will NOT inherit the Kingdom. They WILL burn forever in the Lake Of Fire. They CAN ,however,be forgiven and have their sins blotted out. It’s your choice , life or death ?",742203452,@Ray mond,realraymondp,506,676,http://pbs.twimg.com/profile_images/1147117231404457990/iuAjZLdb_normal.jpg,2,0,en,,,,,,,,fire,2019-08-01
2,2019-08-01 09:04:50,1156853282012356608,"a night in my life consists of\n1. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is BROKEN i guess\n2. me waking up at 3am on fire, because my dog has decided to snuggle in so now i have a duvet, 2 wool blankets, and a HEATER",2161529938,jenny 🌻,zebraflavoured,57,53,http://pbs.twimg.com/profile_images/1135299216052445184/74Yi0d2Y_normal.png,0,0,en,,,,,,,,fire,2019-08-01
3,2019-08-01 09:04:50,1156853281915908096,RT @NWSTopeka: More rain tonight and Friday. Additional rainfall of 2 to 4 inches is possible. Be ready to take action if in a flood prone area. #kswx https://t.co/WQiwaLuRDz,817325871560347649,KS Weather Updates,weather_ks,322,290,http://pbs.twimg.com/profile_images/817330554853085184/N1CYTsHw_normal.jpg,1,0,en,,,,,,,,flood,2019-08-01


# PREPROCESS

## CORRECT DATA TYPE

In [0]:
df['created_at'] = df['created_at'].astype('datetime64[ns]')
df['date'] = df['date'].astype('datetime64[ns]').dt.date

In [6]:
df.dtypes

created_at         datetime64[ns]
twitter_id         int64         
text               object        
user_id            int64         
user_name          object        
screen_name        object        
followers_count    int64         
friends_count      int64         
profile_image      object        
retweet_count      int64         
favorite_count     int64         
lang               object        
latitude           float64       
longitude          float64       
place_type         object        
place_name         object        
place_full_name    object        
country_code       object        
country            object        
disaster_type      object        
date               object        
dtype: object

## CLEAN TEXT

In [0]:
def remove_url(text):
    urlPattern = "((https?|ftp|gopher|telnet|file|Unsure|http):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+-=\\\\\\.&]*)"
    text = re.sub(urlPattern, '', text)
    return text

def remove_rt(text):
    text = re.sub('^rt @[\\w]*: ', '', text).strip()
    return text

def remove_at(text):
    text = re.sub('@[\\w]*', '', text).strip()
    return text

def clean_text(text):
    text = text.lower().replace('\n', '').replace('\t', '')
    text = remove_url(text)
    text = remove_rt(text)
    text = remove_at(text)
    text = (text
            .replace(':', ' ')
            .replace(',', ' ')
            .replace('!', ' ')
            .replace('#', ' ')
            .replace('(', ' ')
            .replace(')', ' ')
            .replace('"', ' ')
            .replace("'", ' ')
            .replace('?', ' ')
            .replace('”', ' ')
            .replace("’", ' ')
           )
    text = re.sub(' +', ' ', text)
    text = re.sub('\.+', '\.', text)
    text = text.strip()
    
    return text

In [8]:
df['clean_text'] = df['text'].apply(clean_text)
df[['text', 'clean_text']].head(3)

Unnamed: 0,text,clean_text
1,"RT @lj1105: I’m getting static on my posting of the Bible stance on Homosexuality. Make no mistake,,Homosexuals will NOT inherit the Kingdom. They WILL burn forever in the Lake Of Fire. They CAN ,however,be forgiven and have their sins blotted out. It’s your choice , life or death ?",i m getting static on my posting of the bible stance on homosexuality\. make no mistake homosexuals will not inherit the kingdom\. they will burn forever in the lake of fire\. they can however be forgiven and have their sins blotted out\. it s your choice life or death
2,"a night in my life consists of\n1. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is BROKEN i guess\n2. me waking up at 3am on fire, because my dog has decided to snuggle in so now i have a duvet, 2 wool blankets, and a HEATER",a night in my life consists of1\. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is broken i guess2\. me waking up at 3am on fire because my dog has decided to snuggle in so now i have a duvet 2 wool blankets and a heater
3,RT @NWSTopeka: More rain tonight and Friday. Additional rainfall of 2 to 4 inches is possible. Be ready to take action if in a flood prone area. #kswx https://t.co/WQiwaLuRDz,more rain tonight and friday\. additional rainfall of 2 to 4 inches is possible\. be ready to take action if in a flood prone area\. kswx


## TIME TO MINUTE

In [0]:
unix_epoch = np.datetime64(0, 's')
one_second = np.timedelta64(1, 's')

def minute(dt64):
    seconds_since_epoch = (dt64 - unix_epoch) / one_second
    dt = datetime.utcfromtimestamp(seconds_since_epoch)
    dt = dt.replace(second=0, microsecond=0)
    return dt

In [10]:
df['created_at'] = df['created_at'].apply(lambda x : minute(x))
df.head(3)

Unnamed: 0,created_at,twitter_id,text,user_id,user_name,screen_name,followers_count,friends_count,profile_image,retweet_count,favorite_count,lang,latitude,longitude,place_type,place_name,place_full_name,country_code,country,disaster_type,date,clean_text
1,2019-08-01 09:04:00,1156853283950141444,"RT @lj1105: I’m getting static on my posting of the Bible stance on Homosexuality. Make no mistake,,Homosexuals will NOT inherit the Kingdom. They WILL burn forever in the Lake Of Fire. They CAN ,however,be forgiven and have their sins blotted out. It’s your choice , life or death ?",742203452,@Ray mond,realraymondp,506,676,http://pbs.twimg.com/profile_images/1147117231404457990/iuAjZLdb_normal.jpg,2,0,en,,,,,,,,fire,2019-08-01,i m getting static on my posting of the bible stance on homosexuality\. make no mistake homosexuals will not inherit the kingdom\. they will burn forever in the lake of fire\. they can however be forgiven and have their sins blotted out\. it s your choice life or death
2,2019-08-01 09:04:00,1156853282012356608,"a night in my life consists of\n1. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is BROKEN i guess\n2. me waking up at 3am on fire, because my dog has decided to snuggle in so now i have a duvet, 2 wool blankets, and a HEATER",2161529938,jenny 🌻,zebraflavoured,57,53,http://pbs.twimg.com/profile_images/1135299216052445184/74Yi0d2Y_normal.png,0,0,en,,,,,,,,fire,2019-08-01,a night in my life consists of1\. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is broken i guess2\. me waking up at 3am on fire because my dog has decided to snuggle in so now i have a duvet 2 wool blankets and a heater
3,2019-08-01 09:04:00,1156853281915908096,RT @NWSTopeka: More rain tonight and Friday. Additional rainfall of 2 to 4 inches is possible. Be ready to take action if in a flood prone area. #kswx https://t.co/WQiwaLuRDz,817325871560347649,KS Weather Updates,weather_ks,322,290,http://pbs.twimg.com/profile_images/817330554853085184/N1CYTsHw_normal.jpg,1,0,en,,,,,,,,flood,2019-08-01,more rain tonight and friday\. additional rainfall of 2 to 4 inches is possible\. be ready to take action if in a flood prone area\. kswx


## TOKENIZE

In [11]:
df['tokens'] = df['clean_text'].str.split(' ') 
df[['text', 'clean_text', 'tokens']].head(3)

Unnamed: 0,text,clean_text,tokens
1,"RT @lj1105: I’m getting static on my posting of the Bible stance on Homosexuality. Make no mistake,,Homosexuals will NOT inherit the Kingdom. They WILL burn forever in the Lake Of Fire. They CAN ,however,be forgiven and have their sins blotted out. It’s your choice , life or death ?",i m getting static on my posting of the bible stance on homosexuality\. make no mistake homosexuals will not inherit the kingdom\. they will burn forever in the lake of fire\. they can however be forgiven and have their sins blotted out\. it s your choice life or death,"[i, m, getting, static, on, my, posting, of, the, bible, stance, on, homosexuality\., make, no, mistake, homosexuals, will, not, inherit, the, kingdom\., they, will, burn, forever, in, the, lake, of, fire\., they, can, however, be, forgiven, and, have, their, sins, blotted, out\., it, s, your, choice, life, or, death]"
2,"a night in my life consists of\n1. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is BROKEN i guess\n2. me waking up at 3am on fire, because my dog has decided to snuggle in so now i have a duvet, 2 wool blankets, and a HEATER",a night in my life consists of1\. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is broken i guess2\. me waking up at 3am on fire because my dog has decided to snuggle in so now i have a duvet 2 wool blankets and a heater,"[a, night, in, my, life, consists, of1\., me, sleeping, under, a, duvet, and, two, thick, wool, blankets, and, still, being, cold, bc, my, inner, heating, system, is, broken, i, guess2\., me, waking, up, at, 3am, on, fire, because, my, dog, has, decided, to, snuggle, in, so, now, i, have, a, duvet, 2, wool, blankets, and, a, heater]"
3,RT @NWSTopeka: More rain tonight and Friday. Additional rainfall of 2 to 4 inches is possible. Be ready to take action if in a flood prone area. #kswx https://t.co/WQiwaLuRDz,more rain tonight and friday\. additional rainfall of 2 to 4 inches is possible\. be ready to take action if in a flood prone area\. kswx,"[more, rain, tonight, and, friday\., additional, rainfall, of, 2, to, 4, inches, is, possible\., be, ready, to, take, action, if, in, a, flood, prone, area\., kswx]"


## REMOVE STOP WORD

In [12]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# Load stop words
stop_words = stopwords.words('english')

In [0]:
df['clean_tokens'] = df['tokens'].apply(lambda tokens : [word for word in tokens if word not in stop_words and len(word) > 3])

In [16]:
df.head(3)

Unnamed: 0,created_at,twitter_id,text,user_id,user_name,screen_name,followers_count,friends_count,profile_image,retweet_count,favorite_count,lang,latitude,longitude,place_type,place_name,place_full_name,country_code,country,disaster_type,date,clean_text,tokens,clean_tokens
1,2019-08-01 09:04:00,1156853283950141444,"RT @lj1105: I’m getting static on my posting of the Bible stance on Homosexuality. Make no mistake,,Homosexuals will NOT inherit the Kingdom. They WILL burn forever in the Lake Of Fire. They CAN ,however,be forgiven and have their sins blotted out. It’s your choice , life or death ?",742203452,@Ray mond,realraymondp,506,676,http://pbs.twimg.com/profile_images/1147117231404457990/iuAjZLdb_normal.jpg,2,0,en,,,,,,,,fire,2019-08-01,i m getting static on my posting of the bible stance on homosexuality\. make no mistake homosexuals will not inherit the kingdom\. they will burn forever in the lake of fire\. they can however be forgiven and have their sins blotted out\. it s your choice life or death,"[i, m, getting, static, on, my, posting, of, the, bible, stance, on, homosexuality\., make, no, mistake, homosexuals, will, not, inherit, the, kingdom\., they, will, burn, forever, in, the, lake, of, fire\., they, can, however, be, forgiven, and, have, their, sins, blotted, out\., it, s, your, choice, life, or, death]","[getting, static, posting, bible, stance, homosexuality\., make, mistake, homosexuals, inherit, kingdom\., burn, forever, lake, fire\., however, forgiven, sins, blotted, out\., choice, life, death]"
2,2019-08-01 09:04:00,1156853282012356608,"a night in my life consists of\n1. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is BROKEN i guess\n2. me waking up at 3am on fire, because my dog has decided to snuggle in so now i have a duvet, 2 wool blankets, and a HEATER",2161529938,jenny 🌻,zebraflavoured,57,53,http://pbs.twimg.com/profile_images/1135299216052445184/74Yi0d2Y_normal.png,0,0,en,,,,,,,,fire,2019-08-01,a night in my life consists of1\. me sleeping under a duvet and two thick wool blankets and still being cold bc my inner heating system is broken i guess2\. me waking up at 3am on fire because my dog has decided to snuggle in so now i have a duvet 2 wool blankets and a heater,"[a, night, in, my, life, consists, of1\., me, sleeping, under, a, duvet, and, two, thick, wool, blankets, and, still, being, cold, bc, my, inner, heating, system, is, broken, i, guess2\., me, waking, up, at, 3am, on, fire, because, my, dog, has, decided, to, snuggle, in, so, now, i, have, a, duvet, 2, wool, blankets, and, a, heater]","[night, life, consists, of1\., sleeping, duvet, thick, wool, blankets, still, cold, inner, heating, system, broken, guess2\., waking, fire, decided, snuggle, duvet, wool, blankets, heater]"
3,2019-08-01 09:04:00,1156853281915908096,RT @NWSTopeka: More rain tonight and Friday. Additional rainfall of 2 to 4 inches is possible. Be ready to take action if in a flood prone area. #kswx https://t.co/WQiwaLuRDz,817325871560347649,KS Weather Updates,weather_ks,322,290,http://pbs.twimg.com/profile_images/817330554853085184/N1CYTsHw_normal.jpg,1,0,en,,,,,,,,flood,2019-08-01,more rain tonight and friday\. additional rainfall of 2 to 4 inches is possible\. be ready to take action if in a flood prone area\. kswx,"[more, rain, tonight, and, friday\., additional, rainfall, of, 2, to, 4, inches, is, possible\., be, ready, to, take, action, if, in, a, flood, prone, area\., kswx]","[rain, tonight, friday\., additional, rainfall, inches, possible\., ready, take, action, flood, prone, area\., kswx]"


# ANALYZE

In [17]:
!ls

sample_data


In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

## WORD COUNT

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [0]:
spark = (SparkSession
         .builder
         .master("local")
         .appName("AnalyzeTwitterData")
         .getOrCreate()
        )

In [22]:
spark

In [0]:
DF = spark.createDataFrame(df[['created_at','date','clean_tokens','disaster_type']])

In [0]:
#word count output

wordcount_DF = (DF
                .withColumn("word", explode(col("clean_tokens")))
                .groupBy('word')
                .count()
                .filter(col('count') > 20)
                .sort(asc('word'))
               )
wordcount_df = wordcount_DF.toPandas()

In [25]:
wordcount_df.head(3)

Unnamed: 0,word,count
0,$40k\.,23
1,&amp;,9633
2,&gt;,170


In [0]:
#co-occurence output

wordcount2_DF = (DF
                 .select('disaster_type', 'date', 'created_at', 'clean_tokens')
                 .withColumn("word", explode(col("clean_tokens")))
                 .groupBy('disaster_type', 'date', 'created_at', 'word')
                 .count()
                 .filter(col('count') > 2)
                 .sort(asc('disaster_type'), asc('date'), asc('created_at'), asc('word'))
                )
wordcount2_df = wordcount2_DF.toPandas()

In [27]:
wordcount2_df.head(3)

Unnamed: 0,disaster_type,date,created_at,word,count
0,earthquake,2019-08-01,2019-08-01 00:01:00,earthquake,4
1,earthquake,2019-08-01,2019-08-01 00:03:00,earthquake,3
2,earthquake,2019-08-01,2019-08-01 00:08:00,earthquake,4


# EXPORT

In [0]:
df.to_csv(
    'text.csv', 
    mode = 'w', 
    header = True,
    index = False
)


wordcount2_df.to_csv(
    'word_statistic2.csv', 
    mode = 'w', 
    header = True,
    index = False
)