In [203]:
# Import library
import pandas as pd
import numpy as np
import json
import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

In [21]:
# Load json data from file
data = []
with open('dataset/sgcircuitbreaker.json', encoding="utf8") as f:
    for line in f:
        data.append(json.loads(line))

In [163]:
# Import json data into dataframe
df = pd.DataFrame(data)
df.head()

Unnamed: 0,_id,created_at,id,screen_name:,lang,is_quote_status,is_retweet_status,full_text,quote_text
0,{'$oid': '5eae613c65da26ed42446310'},{'$date': {'$numberLong': '1588486453000'}},{'$numberLong': '1256829397778132992'},Ashy_roz,en,True,False,Morning Peepos @MdAnde1 @SyiqinKmz,Good morning...\n#Singapore \n#circuitbreaker ...
1,{'$oid': '5eae614b65da26ed42446311'},{'$date': {'$numberLong': '1588486468000'}},{'$numberLong': '1256829460193689601'},harshdass11,en,False,False,"There is proof in📗 quraan sharif, who has comp...",
2,{'$oid': '5eae616665da26ed42446312'},{'$date': {'$numberLong': '1588486496000'}},{'$numberLong': '1256829574723383296'},harshdass11,en,False,True,Is creator🤴 of all the universe🌍\nAlmighty God...,
3,{'$oid': '5eae616a65da26ed42446313'},{'$date': {'$numberLong': '1588486500000'}},{'$numberLong': '1256829592066801665'},staciechan,en,False,False,Day 26 #Covid19 #circuitbreakersg: unleashing ...,
4,{'$oid': '5eae618465da26ed42446314'},{'$date': {'$numberLong': '1588486525000'}},{'$numberLong': '1256829698048274434'},SStretchsg,en,False,False,#𝘿𝙚𝙡𝙞𝙫𝙚𝙧𝙮𝙍𝙞𝙙𝙚𝙧𝙨 are your shoulder stiff from a...,


### Things to clean:
- Checking any missing/ duplicated value
- Extract data from json object inside cell for columns `_id`, `created_at`, `id`
- Rename columns

### Text cleaning:
- For `full_text`, `quote_text` column, text preprocessing is required for sentiment analysis
- Convert to lowercase, remove noise and stopword, tokenization

#### Checking missing value

In [164]:
# Checking missing value
df.isnull().sum()

_id                  0
created_at           0
id                   0
screen_name:         0
lang                 0
is_quote_status      0
is_retweet_status    0
full_text            0
quote_text           0
dtype: int64

No missing value is found.

#### Extract json data

May Refer these links to see how to extract json data
- [w3school](https://www.w3schools.com/python/python_json.asp)
- [Stack Overflow](https://stackoverflow.com/questions/42354001/python-json-object-must-be-str-bytes-or-bytearray-not-dict/42354033)

In [165]:
# Extract json object from _id column
df['_id'] = df['_id'].apply(lambda x: json.loads(json.dumps(x))['$oid'])

In [166]:
# Extract json object from id column
df['id'] = df['id'].apply(lambda x: json.loads(json.dumps(x))['$numberLong'])

In [167]:
# Extract json object from id column
df['created_at'] = df['created_at'].apply(lambda x: json.loads(json.dumps(x))['$date']['$numberLong'])

In [168]:
# Check value
df.sample(10)

Unnamed: 0,_id,created_at,id,screen_name:,lang,is_quote_status,is_retweet_status,full_text,quote_text
1991,5eaf7a92b194fb305583216d,1588558476000,1257131481358454785,mimicrassy,en,False,True,What were these 7 people thinking?? #Stayhomef...,
4460,5eb0b53cb194fb3055832b12,1588639030000,1257469350069645312,Samira3787,ar,False,False,@AkasdMabdah فوغا مسويه عروض جدا رهيبه واستخدم...,
5202,5eb14ea0b194fb3055832df8,1588678298000,1257634051902418945,ChoLiu92,und,False,True,200505 #CB\n🐶🐶 https://t.co/ZK8j8fcL2U,
3743,5eb03e73b194fb3055832845,1588608620000,1257341802769797121,dassi_aarti,en,False,True,It is very rare to find 🔍an tatv darshi saint....,
666,5eae957f65da26ed424465aa,1588499833000,1256885515439104000,jessie_36milk,ja,False,True,⚽️Ｊリーグ⚽️\n\n【横浜】再開後の過密日程で存在感を発揮できるか？ ニューカマー山本義...,
6498,5eb194a7b194fb3055833308,1588696224000,1257709240241831936,Manohar37840440,en,False,True,#Say_No_To_Alcohol\nAccording to the constitut...,
1890,5eaf68eeb194fb3055832108,1588553960000,1257112541580324864,cafe_rosemary,ko,False,True,"“논문을 미리 읽은 같은 학교에 다니는 지인이 말없이 최씨를 끌어안고 “언니, 여태...",
869,5eaeb617b194fb3055831d0b,1588508176000,1256920510392201220,syfeka,in,True,True,"Sometimes, you genuinely forgot. Mak aku gi pa...",How come the mall security outside the bakery ...
3444,5eb03800b194fb305583271a,1588606969000,1257334877764149251,bhawnamadam,en,False,True,Brahma🤴 Vishnu Mahesh is not immortal. They al...,
8860,5eb40177d16fd09c375d5e5f,1588855153000,1258375836836618240,kevin_0j0,zh,False,True,#凱文的鎖屌計畫 #Day7\n\n統計結果出來啦～\n第一次鎖屌就要被鎖 #123天🔒😰\...,


#### Checking duplicated value

In [169]:
# Checking duplicated value
df.duplicated().sum()

0

No duplicate row is found.

#### Convert Unix timestamp to datetime

In [173]:
# Remove last 3 characters '000' from unix string (Should be 10 digits only)
# Convert created_at values from object to string first before it can convert to timestamp format
df['created_at'] = df['created_at'].apply(lambda x: x[:-3])
df['created_at'] = df['created_at'].astype(np.int64)

In [178]:
# Convert unix timestamp to datetime format
df['created_at'] = df['created_at'].apply(lambda x: datetime.datetime.utcfromtimestamp(x).strftime('%Y-%m-%dT%H:%M:%SZ'))

In [184]:
df['created_at'] = pd.to_datetime(df['created_at'])

#### Drop Columns
`_id` is id from mongo database, thus, it is not required in this project and could be dropped.

In [181]:
df.drop('_id', axis =1, inplace =True)

#### Rename Columns
- Change `id` to `user_id`  
- Change `screen_name:` to `username`
- Change `lang` to `language`

In [183]:
df.rename(columns = {'id':'user_id', 'screen_name:':'username', 'lang':'language'}, inplace = True)

#### Define Data Cleaning Functions
For tweet cleaning, may refer [here](https://towardsdatascience.com/extracting-twitter-data-pre-processing-and-sentiment-analysis-using-python-3-0-7192bd8b47cf).

In [235]:
# Convert all characters to lowercase
def to_lowercase(text):
    text  = text.lower()
    
    return text

In [242]:
#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

In [249]:
# Remove Emoji
def remove_emoji(text):
    text = emoji_pattern.sub(r'', text)
    return text

In [244]:
# Remove Mentions
def remove_mention(text):
    text = re.sub(r':', '', text)
    text = re.sub(r'‚Ä¶', '', text) 
    return text

In [245]:
# Replace consecutive non-ASCII characters with a space
def replace_nonASCII(text):
    text = re.sub(r'[^\x00-\x7F]+',' ', text)
    return text

In [246]:
# Remove punctuation
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

In [247]:
# Remove stopwords
stopwordsList = set(stopwords.words('english')) 
def remove_stopwords(text):
    text = [word for word in text if word not in stopwordsList]
    return text

In [200]:
# Tokenize words
def tokenize(text):
    word_tokens = word_tokenize(text)
    return word_tokens

In [251]:
# Combine all the functions
def datapreprocessing(review):
    
    # Convert the text into lowercase
    review = to_lowercase(review)
    
    # Clean tweet
    review = remove_emoji(review)
    review = remove_mention(review)
    review = replace_nonASCII(review)
        
    # Remove punctuation
    review = remove_punct(review)
    
    # Tokenization
    review = word_tokenize(review)
    
    # Remove stopwords
    review = remove_stopwords(review)
    
    #return review
    return " ".join(review)

- Remove Hasttag
- Remove HTML

In [253]:
cols = ['full_text', 'quote_text']

for c in cols:
    df[c] = df[c].apply(lambda x: datapreprocessing(x))

In [254]:
df

Unnamed: 0,created_at,user_id,username,language,is_quote_status,is_retweet_status,full_text,quote_text
0,2020-05-03 06:14:13+00:00,1256829397778132992,Ashy_roz,en,True,False,morning peepos mdande syiqinkmz,good morning singapore circuitbreaker httpstco...
1,2020-05-03 06:14:28+00:00,1256829460193689601,harshdass11,en,False,False,proof quraan sharif composed shrasti six days ...,
2,2020-05-03 06:14:56+00:00,1256829574723383296,harshdass11,en,False,True,creator universe almighty god kabir informatio...,
3,2020-05-03 06:15:00+00:00,1256829592066801665,staciechan,en,False,False,day covid circuitbreakersg unleashing creative...,
4,2020-05-03 06:15:25+00:00,1256829698048274434,SStretchsg,en,False,False,shoulder stiff carrying riding delivery hands ...,
...,...,...,...,...,...,...,...,...
9467,2020-05-08 08:11:05+00:00,1258670745820880896,mariozaharah,en,True,False,hope sporemoh lawrencewongst govsingapore seei...,jjangelus homemigrantssg yes studies backed vi...
9468,2020-05-08 08:11:10+00:00,1258670767115431936,_j3zd,en,True,False,yes sir dont back idiots,cb one month already still got people dont wan...
9469,2020-05-08 08:12:39+00:00,1258671141511589888,ashley_ow,en,False,True,school holidays start youths already planning ...,
9470,2020-05-08 08:12:42+00:00,1258671152215384065,JR53719650,ja,False,True,bs box cbamprt cb httpstcovalewasmq,
