In [1]:
# Import library
import pandas as pd
import numpy as np
import pymongo
import json
import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

In [2]:
# Load the tweets straight from MongoDB into a dataframe
# Connect to the MongoDB instance on localhost and select the database / collection
mongo_client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
mongo_db = mongo_client['myDB']
mongo_collection = mongo_db['Twitter_Data']

# Fetch every document in the collection, then build the dataframe from the records
records = list(mongo_collection.find())
df = pd.DataFrame(records)

In [3]:
# Size of the raw frame as (rows, columns)
df.shape

(1459, 9)

In [4]:
# Preview the first rows of the raw frame
df.head()

Unnamed: 0,_id,created_at,id,screen_name:,lang,is_quote_status,is_retweet_status,full_text,quote_text
0,5ebe3bd2403dbfa81a78369a,2020-05-15 06:50:51,1261187267852451840,FalconhunterNRA,en,False,True,ICYMI: #FLYNNATTORNEY: Entrapment Plan Orchest...,
1,5ebe3bd6403dbfa81a78369b,2020-05-15 06:50:54,1261187283530788867,RafaelGarciaLAF,es,False,True,"#Trump dice que los médicos y enfermeras ""corr...",
2,5ebe3bdc403dbfa81a78369c,2020-05-15 06:51:00,1261187309204103168,ZA1194,en,False,False,@Neganwillclocku @AngelaBelcamino @realDonaldT...,
3,5ebe3bdd403dbfa81a78369d,2020-05-15 06:51:01,1261187312005926913,gary_burch,en,False,True,"When this pandemic is all over, the four count...",
4,5ebe3bdd403dbfa81a78369e,2020-05-15 06:51:01,1261187312429314048,Praveenkumarur3,hi,False,False,500 अरब डॉलर की बचत होगी.'' ट्रंप ने चीन से सा...,


### Things to clean:
- Check for missing or duplicated values
- Extract data from json object inside cell for columns `_id`, `created_at`, `id`
- Rename columns

### Text cleaning:
- For `full_text`, `quote_text` column, text preprocessing is required for sentiment analysis
- Convert to lowercase, remove noise and stopword, tokenization

Before cleaning the data, create a copy of the dataframe so that the raw data stays untouched.

In [5]:
# Work on an independent (deep) copy so that `df` keeps the raw data
df_clean = df.copy(deep=True)

#### Checking missing value

In [6]:
# Count null entries in each column (isna is the same check as isnull)
df_clean.isna().sum()

_id                  0
created_at           0
id                   0
screen_name:         0
lang                 0
is_quote_status      0
is_retweet_status    0
full_text            0
quote_text           0
dtype: int64

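No missing values are found. Note that `isnull()` does not count empty strings, so blank cells such as those in `quote_text` are not reported here.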

#### Checking duplicated value

In [7]:
# Checking duplicated value
# `_id` is unique for every Mongo document, so leaving it in would make every
# row look distinct and this check could never report a duplicate.
# Compare the remaining columns only (errors='ignore' keeps the cell re-runnable
# after `_id` has been dropped).
df_clean.drop(columns='_id', errors='ignore').duplicated().sum()

0

No duplicate row is found.

In [8]:
# Dataset info: column names, non-null counts, dtypes and memory usage
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   _id                1459 non-null   object        
 1   created_at         1459 non-null   datetime64[ns]
 2   id                 1459 non-null   int64         
 3   screen_name:       1459 non-null   object        
 4   lang               1459 non-null   object        
 5   is_quote_status    1459 non-null   bool          
 6   is_retweet_status  1459 non-null   bool          
 7   full_text          1459 non-null   object        
 8   quote_text         1459 non-null   object        
dtypes: bool(2), datetime64[ns](1), int64(1), object(5)
memory usage: 82.8+ KB


#### Drop Columns
`_id` is the id from the Mongo database; it is not required in this project, so it can be dropped.

In [9]:
# Drop the Mongo `_id` column. Re-binding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run once the column is already gone
# (the in-place version raises KeyError on a second run).
df_clean = df_clean.drop(columns='_id', errors='ignore')

#### Rename Columns
- Change `id` to `user_id`  
- Change `screen_name:` to `username`
- Change `lang` to `language`

In [10]:
# Give the columns clearer names (also removes the stray colon in `screen_name:`).
# NOTE(review): the `id` values rise together with `created_at`, which looks like
# a tweet id rather than a user id -- confirm before relying on `user_id`.
column_renames = {
    'id': 'user_id',
    'screen_name:': 'username',
    'lang': 'language',
}
df_clean = df_clean.rename(columns=column_renames)

In [11]:
# Show 10 random rows; the fixed seed makes the preview identical on every run
df_clean.sample(10, random_state=42)

Unnamed: 0,created_at,user_id,username,language,is_quote_status,is_retweet_status,full_text,quote_text
1262,2020-05-15 07:43:24,1261200494602715136,moonbreeze2,en,False,False,"Senate Majority Leader Mitch McConnell, #Repub...",
1369,2020-05-15 07:48:46,1261201844816281600,KI4FDW,en,False,True,@danielhoffmanDC @GillianHTurner @AngelHe33778...,
1020,2020-05-15 07:33:31,1261198009166999553,fortruthssake2,en,False,True,What scares me most is when @JoeBiden crushes ...,
570,2020-05-15 07:12:07,1261192620107345922,rgoswami2326,en,False,False,The whole world is crying right now\nCan't the...,
990,2020-05-15 07:32:27,1261197738240180224,redirectloop,en,False,False,@NathanJRobinson You clearly have energy to sp...,
705,2020-05-15 07:18:06,1261194125635997696,NoNameCimBom,de,False,False,@tagesschau Und der Caesar #Trump entscheidet ...,
1200,2020-05-15 07:40:18,1261199712784351232,Itsme81977221,nl,False,True,Als #trump geld stopt voor #who gilt heel #lin...,
622,2020-05-15 07:13:57,1261193081220726785,GilbertNutier,fr,False,False,Coronavirus. Trump se dit déçu par la Chine et...,
595,2020-05-15 07:12:55,1261192822209892352,Helle_____,und,False,False,#Trump #obamagate \n\nhttps://t.co/K5RfgcBnh4,
39,2020-05-15 06:52:06,1261187586451746816,bitcoinconnect,en,False,False,Guardian Life Names Margherita L. DiManni Depu...,


#### Define Data Cleaning Functions
For tweet cleaning, refer to [this article](https://towardsdatascience.com/extracting-twitter-data-pre-processing-and-sentiment-analysis-using-python-3-0-7192bd8b47cf).

In [12]:
# Convert all characters to lowercase
def to_lowercase(text):
    """Return `text` with every cased character converted to lowercase."""
    return text.lower()

In [13]:
# Emoji patterns
# Every entry below is a narrow Unicode block. The former catch-all range
# U+24C2-U+1F251 also spanned CJK, Hangul and many other scripts, so ordinary
# non-English text was deleted as if it were emoji. It is replaced by the
# specific code points/blocks it was meant to cover.
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"  # dingbats
         u"\U000024C2"             # circled latin capital letter M
         u"\U00002600-\U000026FF"  # miscellaneous symbols (sun, stars, ...)
         u"\U0000FE0F"             # variation selector-16 (emoji presentation)
         u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
         "]+", flags=re.UNICODE)

In [14]:
# Remove Emoji
def remove_emoji(text):
    """Return `text` with every run of characters matched by `emoji_pattern` deleted."""
    return emoji_pattern.sub('', text)

In [15]:
# Remove Mentions
def remove_mention(text):
    """Return `text` with every @mention removed.

    Handles may contain underscores (e.g. `@BT_India`), so `_` is part of the
    character class; without it only the part before the first underscore was
    stripped and the rest of the handle leaked into the cleaned text.
    """
    return re.sub(r'@[A-Za-z0-9_]+', '', text)

In [16]:
# Remove Hashtag
def remove_hashtag(text):
    """Return `text` with every hashtag (the `#` and the word after it) removed."""
    hashtag_regex = r'#\w+'
    return re.sub(hashtag_regex, '', text)

In [17]:
# Remove URL
def remove_url(text):
    """Replace every URL-like token in `text` with a single space.

    The scheme (http/https/ftp) is optional in the pattern, so any dotted
    token such as a bare domain is matched as well.
    """
    url_regex = r'\b(?:(?:https?|ftp)://)?\w[\w-]*(?:\.[\w-]+)+\S*'
    return re.sub(url_regex, ' ', text)

In [18]:
# Replace consecutive non-ASCII characters with a space
def replace_nonASCII(text):
    """Collapse each run of non-ASCII characters in `text` into one space."""
    non_ascii_run = r'[^\x00-\x7F]+'
    return re.sub(non_ascii_run, ' ', text)

In [19]:
# Remove punctuation
def remove_punct(text):
    """Return `text` without ASCII punctuation and without the digits 0-9.

    Characters are deleted, not replaced, so neighbouring pieces are joined
    together (e.g. "a,b" becomes "ab").
    """
    # One translation table deletes both character sets in a single pass
    drop_table = str.maketrans('', '', string.punctuation + '0123456789')
    return text.translate(drop_table)

In [20]:
# Remove stopwords
stopwordsList = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return a list of the items in `text` that are not English stopwords.

    `text` is iterated item by item, so pass a sequence of tokens; a plain
    string would be filtered character by character.
    """
    kept = []
    for word in text:
        if word not in stopwordsList:
            kept.append(word)
    return kept

In [21]:
# Tokenize words
def tokenize(text):
    """Split `text` into tokens with NLTK's `word_tokenize` and return them."""
    return word_tokenize(text)

In [22]:
# Combine all the functions
def datapreprocessing(review):
    """Clean one tweet and return it as a space-separated string of tokens.

    Steps, in order: lowercase; strip @mentions, URLs, #hashtags and emoji;
    replace remaining non-ASCII runs with a space; drop punctuation and
    digits; tokenize; remove English stopwords.

    Parameters
    ----------
    review : str
        Raw tweet text. Any non-string value (e.g. NaN/None for a field that
        is missing from the source document) is treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if nothing is left).
    """
    # A missing field shows up as NaN/None in the dataframe; .lower() would
    # raise on it and abort the whole column-wide apply.
    if not isinstance(review, str):
        return ""

    # Convert the text into lowercase
    review = to_lowercase(review)

    # Clean tweet. Mentions, URLs and hashtags are recognised by their
    # punctuation (@, ://, ., #), so they must be stripped before remove_punct.
    cleaning_steps = (
        remove_mention,
        remove_url,
        remove_hashtag,
        remove_emoji,
        replace_nonASCII,
        remove_punct,
    )
    for clean_step in cleaning_steps:
        review = clean_step(review)

    # Tokenization (through the notebook's own helper) and stopword removal
    tokens = remove_stopwords(tokenize(review))

    return " ".join(tokens)

Before text preprocessing, keep only the tweets whose language is **English**.

In [23]:
# Keep English tweets only. .copy() makes df_clean an independent frame, so the
# column assignments in the next cell do not write into a slice of the
# unfiltered data (which triggers pandas' SettingWithCopyWarning).
df_clean = df_clean.loc[df_clean['language'] == 'en'].copy()

In [24]:
# Text columns that go through the cleaning pipeline
cols = ['full_text', 'quote_text']

# Overwrite each text column with its cleaned version
for c in cols:
    cleaned = df_clean[c].apply(datapreprocessing)
    df_clean[c] = cleaned

In [25]:
# Show 10 random cleaned rows; the fixed seed makes the preview identical on every run
df_clean.sample(10, random_state=42)

Unnamed: 0,created_at,user_id,username,language,is_quote_status,is_retweet_status,full_text,quote_text
1168,2020-05-15 07:38:36,1261199288069058560,jeeennze,en,False,True,make mistake apologistoperative temporarily ma...,
470,2020-05-15 07:08:06,1261191610777440256,Jamal_Engel,en,True,False,justice coming,
1300,2020-05-15 07:45:23,1261200993171025921,calendarking,en,True,False,huge concern,scares crushes november rd much damage cause d...
163,2020-05-15 06:56:53,1261188788287496192,rich5819,en,False,True,,
10,2020-05-15 06:51:09,1261187344499179521,BT_India,en,False,False,listen india news podcast,
619,2020-05-15 07:13:55,1261193076124651522,chiamaluca,en,False,False,dumb portion us population could convince peop...,
116,2020-05-15 06:54:43,1261188241212026880,PaintTheCityBl1,en,False,True,one president united states guess one,
169,2020-05-15 06:57:00,1261188817517776896,nancy18097436,en,False,True,president calls former president questioned oa...,
820,2020-05-15 07:24:09,1261195648805658624,yojudenz,en,True,True,needs investigated asap people losing lives vi...,kube fkennnedy et rush look
39,2020-05-15 06:52:06,1261187586451746816,bitcoinconnect,en,False,False,guardian life names margherita l dimanni deput...,


In [26]:
# Write the cleaned tweets to CSV, leaving the dataframe index out of the file
output_path = 'dataset/data_clean_0515.csv'
df_clean.to_csv(output_path, index=False)