<h1>Preprocessing</h1>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from bs4 import BeautifulSoup
import re

In [2]:
# Read the tweet dataset from the CSV file next to the notebook and print
# a per-column summary (non-null counts and dtypes).
df = pd.read_csv('./covid19tweets.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179108 entries, 0 to 179107
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_name         179108 non-null  object
 1   user_location     142337 non-null  object
 2   user_description  168822 non-null  object
 3   user_created      179108 non-null  object
 4   user_followers    179108 non-null  int64 
 5   user_friends      179108 non-null  int64 
 6   user_favourites   179108 non-null  int64 
 7   user_verified     179108 non-null  bool  
 8   date              179108 non-null  object
 9   text              179108 non-null  object
 10  hashtags          127774 non-null  object
 11  source            179031 non-null  object
 12  is_retweet        179108 non-null  bool  
 13  sentiments        179108 non-null  object
dtypes: bool(2), int64(3), object(9)
memory usage: 16.7+ MB


In [3]:
# The only column we need for preprocessing is `text`.
df.text

0         If I smelled the scent of hand sanitizers toda...
1         Hey @Yankees @YankeesPR and @MLB - wouldn't it...
2         @diane3443 @wdunlap @realDonaldTrump Trump nev...
3         @brookbanktv The one gift #COVID19 has give me...
4         25 July : Media Bulletin on Novel #CoronaVirus...
                                ...                        
179103    Thanks @IamOhmai for nominating me for the @WH...
179104    2020! The year of insanity! Lol! #COVID19 http...
179105    @CTVNews A powerful painting by Juan Lucena. I...
179106    More than 1,200 students test positive for #CO...
179107    I stop when I see a Stop\n\n@SABCNews\n@Izinda...
Name: text, Length: 179108, dtype: object

## 1. Menghilangkan Links

In [4]:
# Using a lambda: delete every "http" followed by a run of non-whitespace
# characters, then collapse whitespace by splitting and re-joining the
# pieces with single spaces. The result is stored in a new `clean_text` column.
df['clean_text'] = df['text'].apply(lambda s: ' '.join(re.sub(r"http\S+", "", s).split()))

In [5]:
df[['text', 'clean_text']].iloc[94807]

text          Some Unfortunate News\n https://t.co/VpKFmdkDB...
clean_text    Some Unfortunate News Awww well hopefully Meye...
Name: 94807, dtype: object

In [6]:
# Bisa juga menggunakan fungsi biasa
def remove_url(s) :
    """Remove URLs from a string and normalise its whitespace.

    Parameters
    ----------
    s : str
        Raw text, possibly containing links and newlines.

    Returns
    -------
    str
        ``s`` with every ``http`` + non-whitespace run removed and all
        whitespace runs (including newlines) collapsed to single spaces,
        with no leading or trailing whitespace.
    """
    # Plain greedy `\S+` on purpose: the possessive form `\S++` is only
    # accepted by Python 3.11+ (older versions raise re.error "multiple
    # repeat") and matches exactly the same text, since nothing follows it.
    s = re.sub(r"http\S+", " ", s)

    # split() without arguments splits on any whitespace and drops empty
    # pieces, so re-joining yields single-space separated text.
    return ' '.join(s.split())

# Recompute clean_text from the raw `text` column with the function variant.
df['clean_text'] = df['text'].apply(remove_url)
    

In [7]:
df[['text', 'clean_text']].iloc[94807]

text          Some Unfortunate News\n https://t.co/VpKFmdkDB...
clean_text    Some Unfortunate News Awww well hopefully Meye...
Name: 94807, dtype: object

# 2. Menghapus Tanda Baca

In [8]:
# Using a lambda: replace each listed punctuation character with a space,
# then collapse the whitespace.
# The hyphen is escaped so it is matched literally. Unescaped, `;-=` is a
# character range (`;`, `<`, `=`) and "-" itself was never replaced, so
# hyphenated words kept their hyphen in this step.
df['clean_text'] = df['clean_text'].apply(lambda s: ' '.join(re.sub(r"[.,!?:;\-='@#_]", " ", s).split()))

In [9]:
# atau bisa menggunakan fungsi biasa
import string
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed outright (not replaced by a space), so the
    characters on either side of a removed symbol become adjacent.
    """
    # Translation table whose third argument lists the characters to drop.
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

# Run the string.punctuation-based variant over the current clean_text column.
df['clean_text'] = df['clean_text'].apply(remove_punctuation)

In [10]:
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,If I smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,Hey Yankees YankeesPR and MLB wouldn t it hav...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane3443 wdunlap realDonaldTrump Trump never ...
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv The one gift COVID19 has give me i...
4,25 July : Media Bulletin on Novel #CoronaVirus...,25 July Media Bulletin on Novel CoronaVirusUpd...


# 3. Menghapus Angka 

In [11]:
def remove_number(text):
    """Delete every run of decimal digits from ``text``.

    Only the digits are removed; whitespace around a removed number is
    left in place.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Strip digits from every entry of the clean_text column.
df['clean_text'] = df['clean_text'].apply(remove_number)

In [12]:
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,If I smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,Hey Yankees YankeesPR and MLB wouldn t it hav...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane wdunlap realDonaldTrump Trump never once...
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv The one gift COVID has give me is ...
4,25 July : Media Bulletin on Novel #CoronaVirus...,July Media Bulletin on Novel CoronaVirusUpdat...


In [13]:
# Alternative: the same digit removal using pandas' own regex replace.
# The result is assigned back rather than using inplace=True on
# df['clean_text']: that form is chained assignment, which pandas 2.x warns
# about and which leaves df unchanged when Copy-on-Write is enabled.
# The raw string avoids the invalid "\d" escape-sequence warning.
df['clean_text'] = df['clean_text'].replace(r'\d+', '', regex=True)
df[['text','clean_text']].head(20)

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,If I smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,Hey Yankees YankeesPR and MLB wouldn t it hav...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane wdunlap realDonaldTrump Trump never once...
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv The one gift COVID has give me is ...
4,25 July : Media Bulletin on Novel #CoronaVirus...,July Media Bulletin on Novel CoronaVirusUpdat...
5,#coronavirus #covid19 deaths continue to rise....,coronavirus covid deaths continue to rise It s...
6,How #COVID19 Will Change Work in General (and ...,How COVID Will Change Work in General and recr...
7,You now have to wear face coverings when out s...,You now have to wear face coverings when out s...
8,Praying for good health and recovery of @Chouh...,Praying for good health and recovery of Chouha...
9,POPE AS GOD - Prophet Sadhu Sundar Selvaraj. W...,POPE AS GOD Prophet Sadhu Sundar Selvaraj Wat...


# 4. Menghilangkan semua emoji

In [14]:
def remove_emojis(text):
    """Drop every character of ``text`` that cannot be encoded as ASCII.

    This removes emoji, but also any other non-ASCII character (for
    example accented letters); ASCII characters are returned unchanged.
    """
    # errors='ignore' silently skips unencodable characters.
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

# Apply remove_emojis to the clean_text column, then show the first 20 rows
# of the raw and cleaned text side by side.
df['clean_text'] = df['clean_text'].apply(remove_emojis)
df[['text','clean_text']].head(20)

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,If I smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,Hey Yankees YankeesPR and MLB wouldn t it hav...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane wdunlap realDonaldTrump Trump never once...
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv The one gift COVID has give me is ...
4,25 July : Media Bulletin on Novel #CoronaVirus...,July Media Bulletin on Novel CoronaVirusUpdat...
5,#coronavirus #covid19 deaths continue to rise....,coronavirus covid deaths continue to rise It s...
6,How #COVID19 Will Change Work in General (and ...,How COVID Will Change Work in General and recr...
7,You now have to wear face coverings when out s...,You now have to wear face coverings when out s...
8,Praying for good health and recovery of @Chouh...,Praying for good health and recovery of Chouha...
9,POPE AS GOD - Prophet Sadhu Sundar Selvaraj. W...,POPE AS GOD Prophet Sadhu Sundar Selvaraj Wat...


# 5. Case Folding/Mengecilkan huruf

In [15]:
def lower(text):
    """Case-folding step: return ``text`` converted to lowercase."""
    return text.lower()

# Lowercase every entry of clean_text and preview the first rows.
df['clean_text'] = df['clean_text'].apply(lower)
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,if i smelled the scent of hand sanitizers toda...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,hey yankees yankeespr and mlb wouldn t it hav...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane wdunlap realdonaldtrump trump never once...
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv the one gift covid has give me is ...
4,25 July : Media Bulletin on Novel #CoronaVirus...,july media bulletin on novel coronavirusupdat...


# 6. Menghilangkan Stopwords

In [16]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nando\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Build the English stop-word collection as a set (used for membership tests
# in remove_sw) and print it for inspection.
stop = set(stopwords.words('english'))
print(stop)

{'what', "mustn't", 'which', 'aren', 'up', 'at', 'how', 'more', 'herself', 'having', 'only', 'is', 'being', 'she', 'as', 'ain', 'mustn', 'your', 'isn', 'their', 'where', 'ours', 'above', 'be', 'couldn', 've', 'these', 'ma', 'just', 'needn', "shouldn't", 'other', 'now', 'both', 'and', 'shouldn', "needn't", 'between', 't', 'because', "aren't", 'doesn', 'you', 'then', 'here', 'same', 'in', 'was', 'own', 'very', 'can', 'who', 'he', 'been', 'some', 'if', 'has', 'after', "didn't", "couldn't", 'his', 'once', 'y', "you've", 'should', 'down', "isn't", "mightn't", 'the', 'during', 'hadn', "you'd", 'about', 's', 'by', 'our', 'haven', "doesn't", 'weren', 'them', 'from', 'on', 'not', 'any', 'll', 'do', 'are', 'out', 'under', 'does', 'of', 'until', 'a', 'an', 'so', "wasn't", 'each', 'few', 'wouldn', "that'll", "shan't", "don't", "you're", 'have', 'whom', 'further', 'but', 'hasn', 'too', 'were', 'for', 'with', "wouldn't", "haven't", "you'll", 'ourselves', 'when', 'to', 'didn', 'while', 'had', 'all', 

In [18]:
def remove_sw(text, stopwords_set=None):
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    text : str
        Text whose words are separated by whitespace. Matching is exact
        (case-sensitive), so the text should already be in the same case
        as the stop words.
    stopwords_set : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop`` set is
        used, which is the original behaviour.

    Returns
    -------
    str
        The words of ``text`` that are not stop words, joined by single
        spaces (an empty string when nothing is left).
    """
    if stopwords_set is None:
        # Looked up at call time, so this function does not require `stop`
        # to exist when it is defined.
        stopwords_set = stop
    return ' '.join(word for word in text.split() if word not in stopwords_set)

# Drop stop words from every entry of clean_text and preview the first rows.
df['clean_text'] = df['clean_text'].apply(remove_sw)
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,smelled scent hand sanitizers today someone pa...
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,hey yankees yankeespr mlb made sense players p...
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,diane wdunlap realdonaldtrump trump never clai...
3,@brookbanktv The one gift #COVID19 has give me...,brookbanktv one gift covid give appreciation s...
4,25 July : Media Bulletin on Novel #CoronaVirus...,july media bulletin novel coronavirusupdates c...


# 7. Tokenizing / Mengubah kata menjadi bentuk array

In [19]:
#Menggunakan nltk
from nltk.tokenize import RegexpTokenizer
# nltk RegexpTokenizer configured with the pattern \w+.
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize every entry; from here on clean_text holds lists of tokens
# instead of plain strings (see the preview below).
df['clean_text'] = df['clean_text'].apply(lambda x : tokenizer.tokenize(x))
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,"[smelled, scent, hand, sanitizers, today, some..."
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,"[hey, yankees, yankeespr, mlb, made, sense, pl..."
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,"[diane, wdunlap, realdonaldtrump, trump, never..."
3,@brookbanktv The one gift #COVID19 has give me...,"[brookbanktv, one, gift, covid, give, apprecia..."
4,25 July : Media Bulletin on Novel #CoronaVirus...,"[july, media, bulletin, novel, coronavirusupda..."


# 8. Stemming/Lemmatizing (Mengubah menjadi kata dasar)

In [20]:
# Menggunakan Stemming
# from nltk.stem import PorterStemmer
# from nltk.tokenize import word_tokenize

# stemmer = PorterStemmer()

# def stemming(text) :
#     stemmed_word = []
#     for word in text :
#         stemmed_word.append(stemmer.stem(word))
#     text = stemmed_word
#     return text

# df['clean_text'] = df['clean_text'].apply(stemming)
# df[['text', 'clean_text']].head()    

In [21]:
# Menggunakan Lemmatizing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.tag import pos_tag


lemmatizer = WordNetLemmatizer()
def lemmatizing(text):
    """Lemmatize each token of ``text`` with the shared ``lemmatizer``.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize (the token lists stored in ``clean_text``).

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    # pos='v' is passed for every token, whatever its actual part of speech.
    return [lemmatizer.lemmatize(token, pos ='v') for token in text]

# def lemmatizing(text) :
#     pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
#     pos_tags_list = pos_tag(text)
#     lemmatiser = WordNetLemmatizer()
    
#     tokens = [lemmatiser.lemmatize(w.lower(), pos=pos_map.get(p[0], 'v')) for w, p in pos_tags_list]
#     return tokens

# Lemmatize every token list in clean_text and preview the first rows.
df['clean_text'] = df['clean_text'].apply(lemmatizing)
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,If I smelled the scent of hand sanitizers toda...,"[smell, scent, hand, sanitizers, today, someon..."
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,"[hey, yankees, yankeespr, mlb, make, sense, pl..."
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,"[diane, wdunlap, realdonaldtrump, trump, never..."
3,@brookbanktv The one gift #COVID19 has give me...,"[brookbanktv, one, gift, covid, give, apprecia..."
4,25 July : Media Bulletin on Novel #CoronaVirus...,"[july, media, bulletin, novel, coronavirusupda..."


In [22]:
print(df['clean_text'].iloc[7])

['wear', 'face', 'cover', 'shop', 'include', 'visit', 'local', 'community', 'pharmacy']


In [23]:
# Menyimpannya ke file csv

# df[['clean_text', 'sentiments']].to_csv('tweets_clean2.csv', index=False)

In [24]:
# Build a class-balanced subset of 2,000 tweets: 666 positive, 667 neutral
# and 667 negative rows, concatenated in that order with a fresh 0..n-1 index.
# random_state pins the draw so that re-running the notebook selects the same
# rows (and therefore writes the same CSV) every time.
SAMPLE_SEED = 42
df_positif = df[df['sentiments'] == 'positive'].sample(666, random_state=SAMPLE_SEED)
df_netral = df[df['sentiments'] == 'neutral'].sample(667, random_state=SAMPLE_SEED)
df_negatif = df[df['sentiments'] == 'negative'].sample(667, random_state=SAMPLE_SEED)

df = pd.concat([df_positif, df_netral, df_negatif])
df = df.reset_index(drop=True)


In [25]:
df.shape

(2000, 15)

In [26]:
# Write the sampled frame (all of its columns, not just clean_text and
# sentiments) to disk without the index.
# NOTE(review): clean_text holds token lists at this point, so the CSV
# presumably stores their string representation -- confirm that whatever
# reads tweets_clean2.csv parses that column back into lists.
df.to_csv('tweets_clean2.csv', index=False)

In [27]:
# from nltk import word_tokenize, sent_tokenize, pos_tag
# def preprocess_text(text):
#     pos_map = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
    
#     pos_tags_list = pos_tag(text)
    
#     #print(pos_tags)
    
#     # 3. Lowercase and lemmatise
    
#     lemmatiser = WordNetLemmatizer()
    
#     tokens = [lemmatiser.lemmatize(w.lower(), pos=pos_map.get(p[0], 'v')) for w, p in pos_tags_list]
    
#     return tokens

In [28]:
# the_result = preprocess_text(['movies', 'amazing', 'overrated', 'supposed', 'lies','beens', 'taking','while'])

In [1]:
# the_result