## Import Packages

In [None]:
import nltk
import numpy as np
import pandas as pd

import re                                  # library for regular expression operations
import string                              # for string operations

from deep_translator import GoogleTranslator
import stopwordsiso as stopwords           # module for stop words
from nltk.stem import PorterStemmer, WordNetLemmatizer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

## Read Dataset

**Raw Twitter Data March 2020 - December 2020**

In [None]:
# Raw tweets for March-December 2020.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is only a path separator on Windows).
data_1 = pd.read_csv('project/2020Mar-2020Dec_raw-tweets.csv')
data_1.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,Likes,Retweets,Hashtags,Language
0,0,2020-12-30 13:53:43+00:00,clydebaltz,"""ganda talaga pag online class kase makakatipi...",1,0,,tl
1,1,2020-12-30 12:03:29+00:00,Shuwbeeeeee,Online class* Dami pang pending activities htt...,0,0,,en
2,2,2020-12-30 08:29:38+00:00,daksprincess,Mag 2021 na back to school nasad ay bck to onl...,0,0,,tl
3,3,2020-12-30 07:18:21+00:00,caamsahamnida,"Wala dyud ko kaila aning mga ga ""hi maam"" sako...",0,0,,tl
4,4,2020-12-30 06:15:09+00:00,CydLouie_,Sobrang fucked up ng online class. Naiinis na ...,6,0,,tl


**Raw Twitter Data January 2021 - December 2021**

In [None]:
# Raw tweets for January-December 2021.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is only a path separator on Windows).
data_2 = pd.read_csv('project/2021Jan-2021Dec_raw-tweets.csv')
data_2.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,Likes,Hashtags,Language
0,0,2021-12-30 21:01:53+00:00,emeeny,is still gradually increasing. Ayoko na po mat...,0,,tl
1,1,2021-12-30 17:47:20+00:00,la_graciaa,@aimie1109 ako din kaso back to online class n...,1,,tl
2,2,2021-12-29 15:50:25+00:00,saimallow,"For some reasons, this year has been, for me, ...",0,,en
3,3,2021-12-29 11:08:55+00:00,a7dcmanv37,Online class pa tangina walang top performing ...,0,,tl
4,4,2021-12-28 16:19:53+00:00,ArvinOcampo13,"#NahanapNaSiArvin NAHANAP KONA SARILI KO, NEED...",2,['NahanapNaSiArvin'],tl


**Raw Twitter Data January 2022 - December 2022**

In [None]:
# Raw tweets for January-December 2022.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (a backslash is only a path separator on Windows).
data_3 = pd.read_csv('project/2022Jan-2022Dec_raw-tweets.csv')
data_3.head()

Unnamed: 0.1,Unnamed: 0,Date,User,Tweet,Likes,Retweets,Hashtags,Language
0,0,2022-12-07 22:00:00+00:00,ActsReviewCtr,Congratulations to BEATRICE ISABELLE UY from 2...,1,0,,en
1,1,2022-12-07 14:44:32+00:00,ActsReviewCtr,Congratulations to MARIA LOREINA CRUZ from 202...,1,0,,en
2,2,2022-12-07 10:01:56+00:00,AFManille,📢 [TRIAL CLASS IN DECEMBER]\nEager to embark o...,1,0,['AFMTrials'],en
3,3,2022-12-07 09:31:03+00:00,ara_d_here,@_artbacccjip ay huhu ga-hotspot man gud ko ka...,0,0,,tl
4,4,2022-12-07 08:27:34+00:00,LeenLuckyWins,"Antok na antok akooo, idlip muna tas may onlin...",0,0,,tl


## PRE PROCESSING

### Remove Punctuations, digits, hyperlinks, Twitter marks and styles
Some tweets contain hashtags, retweet marks, mentions, and hyperlinks. Regular expressions are used to remove them from a tweet, along with digits and punctuation.

In [None]:
def remove_hyperlinks_marks_styles(tweet):
    """Strip Twitter markup, links, digits and punctuation from a tweet.

    Steps, in order: drop a leading old-style "RT" marker, remove @mentions
    and hyperlinks, drop the "#" sign (the hashtag word itself is kept),
    remove digits and hyphens, remove every remaining character that is
    neither a word character nor whitespace, and collapse runs of spaces.

    Args:
        tweet: Raw tweet text. A non-string value (for example the float NaN
            pandas uses for a missing cell) is treated as an empty tweet.

    Returns:
        The cleaned text. Leading and trailing spaces are not stripped.
    """
    # re.sub raises TypeError on non-strings such as NaN, so bail out early.
    if not isinstance(tweet, str):
        return ''

    # remove old style retweet text "RT"
    new_tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove mentions and hyperlinks; "www" must start a word and be followed
    # by a dot, otherwise ordinary words such as "awwww" would be cut apart
    new_tweet = re.sub(r'@\S+|https?://\S+|\bwww\.\S+', '', new_tweet)

    # remove hashtags-only removing the hash # sign from the word
    new_tweet = re.sub(r'#', '', new_tweet)

    # remove digits (and hyphens)
    new_tweet = re.sub(r'[\d-]', '', new_tweet)

    # remove punctuations (anything that is not a word character or whitespace)
    new_tweet = re.sub(r'[^\w\s]', '', new_tweet)

    # remove extra space
    new_tweet = re.sub(' +', ' ', new_tweet)

    return new_tweet


### Tokenize the string
Translate the tweet to English, then split the translated string into individual words.

In [None]:
# instantiate tokenizer class
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# One translator is built once and reused for every tweet instead of being
# re-created on each call.
translator = GoogleTranslator(source='auto', target='en')


def tokenize_tweet(tweet):
    """Translate a tweet to English and split it into tokens.

    Args:
        tweet: Tweet text (normally already cleaned of links and markup).

    Returns:
        list[str]: Tokens produced by the module-level TweetTokenizer from the
        English translation. An empty list is returned for empty or
        whitespace-only input, without contacting the translation service.
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not tweet or not tweet.strip():
        return []

    tweet_trans = translator.translate(tweet)

    # NOTE(review): translate() has been seen to return None for some inputs
    # (e.g. text with no alphanumeric characters); fall back to the
    # untranslated text so the tokenizer never receives None.
    if tweet_trans is None:
        tweet_trans = tweet

    return tokenizer.tokenize(tweet_trans)

### Remove stop words and punctuations
Remove stop words and punctuation tokens. Stop words are common words that add little meaning to the text, for example 'i' and 'me'.

In [None]:
# English and Tagalog stop words, taken from the stopwordsiso package
stop_words = stopwords.stopwords(["en", "tl"])
punctuations = string.punctuation

def remove_stopwords_punctuations(tweet_tokens):
    """Drop stop words and punctuation-only tokens.

    Args:
        tweet_tokens: Iterable of string tokens.

    Returns:
        list[str]: The tokens that are neither in ``stop_words`` nor made up
        entirely of characters from ``string.punctuation``, in input order.
    """
    tweets_clean = []

    for word in tweet_tokens:
        if word in stop_words:
            continue
        # Check character by character: `word in punctuations` would be a
        # substring test against one long string and lets tokens such as
        # "..." through. all() is True for '' too, so empty tokens are dropped.
        if all(char in punctuations for char in word):
            continue
        tweets_clean.append(word)

    return tweets_clean

### Lemmatize

In [None]:
# Shared WordNet lemmatizer instance used by lemmatize_text
lemmatizer = WordNetLemmatizer()


def lemmatize_text(tweets_clean):
    """Return the lemma of every token, keeping the input order.

    Args:
        tweets_clean: Iterable of string tokens.

    Returns:
        list[str]: One lemmatized token per input token.
    """
    lemmatized_words = []
    for token in tweets_clean:
        lemmatized_words.append(lemmatizer.lemmatize(token))
    return lemmatized_words

### Stemming

In [None]:
# Shared Porter stemmer instance used by get_stem
stemmer = PorterStemmer()

def get_stem(tweets_clean):
    """Return the Porter stem of every token, keeping the input order.

    Args:
        tweets_clean: Iterable of string tokens.

    Returns:
        list[str]: One stemmed token per input token.
    """
    return [stemmer.stem(token) for token in tweets_clean]


### Remove Strings of 2 Characters or Fewer

In [None]:
def remove_to2_Char(tweets_clean):
    """Keep only the tokens that are longer than two characters.

    Args:
        tweets_clean: Iterable of string tokens.

    Returns:
        list[str]: Tokens with ``len(token) > 2``, in input order.
    """
    return [token for token in tweets_clean if len(token) > 2]

In [None]:
# Walk one sample tweet through every preprocessing stage, printing the
# intermediate result after each step.
tweet_example = data_1['Tweet'].iloc[722]
print(tweet_example)

processed_tweet = remove_hyperlinks_marks_styles(tweet_example)
print("\nRemoved hyperlinks, Twitter marks and styles:", processed_tweet, sep="\n")

tweet_tokens = tokenize_tweet(processed_tweet)
print("\nTokenize the string:", tweet_tokens, sep="\n")

tweets_clean = remove_stopwords_punctuations(tweet_tokens)
print("\nRemove stop words", tweets_clean, sep="\n")

tweets_lemma = lemmatize_text(tweets_clean)
print("\nGet lemma of each word:", tweets_lemma, sep="\n")

tweets_stem = get_stem(tweets_lemma)
print("\nGet stem of each word:", tweets_stem, sep="\n")

tweets_char_remove = remove_to2_Char(tweets_stem)
print("\nRemove 1 and 2 character words:", tweets_char_remove, sep="\n")

Online class really got me complaining bout stuffs then still I’d comply 😑😬

Removed hyperlinks, Twitter marks and styles:
Online class really got me complaining bout stuffs then still Id comply 

Tokenize the string:
['online', 'class', 'really', 'got', 'me', 'complaining', 'bout', 'stuffs', 'then', 'still', 'id', 'comply']

Remove stop words
['online', 'class', 'complaining', 'bout', 'stuffs', 'comply']

Get lemma of each word:
['online', 'class', 'complaining', 'bout', 'stuff', 'comply']

Get stem of each word:
['onlin', 'class', 'complain', 'bout', 'stuff', 'compli']

Remove 1 and 2 character words:
['onlin', 'class', 'complain', 'bout', 'stuff', 'compli']


### Pre Process Main Function

In [None]:
def process_tweet(tweet):
    """Run the whole preprocessing pipeline on one tweet.

    The tweet is cleaned with ``remove_hyperlinks_marks_styles``, translated
    and tokenized with ``tokenize_tweet``, then passed through stop-word
    removal, lemmatization, stemming and short-token removal, in that order.

    Args:
        tweet: Raw tweet text.

    Returns:
        str: The surviving tokens joined by single spaces.
    """
    cleaned_text = remove_hyperlinks_marks_styles(tweet)
    tokens = tokenize_tweet(cleaned_text)

    # Token-level stages, applied one after another in pipeline order.
    token_stages = (
        remove_stopwords_punctuations,
        lemmatize_text,
        get_stem,
        remove_to2_Char,
    )
    for stage in token_stages:
        tokens = stage(tokens)

    return ' '.join(tokens)

In [None]:
def remove_dup_empty_rows(tweet_data):
    """Drop duplicate and empty rows based on the 'Processed Tweets' column.

    Args:
        tweet_data: DataFrame with a 'Processed Tweets' column. It is not
            modified.

    Returns:
        A new DataFrame that keeps the first occurrence of each processed
        tweet, without the rows whose processed tweet is '' or missing, and
        with a fresh 0..n-1 index.
    """
    column = 'Processed Tweets'
    deduplicated = tweet_data.drop_duplicates(column)

    # Build a row mask instead of calling replace(..., inplace=True) on a
    # column selection: that chained in-place update does not reach the parent
    # frame under pandas Copy-on-Write, which would leave empty rows in place.
    non_empty = deduplicated[column].replace('', np.nan).notna()

    return deduplicated[non_empty].reset_index(drop=True)

### Year 1 Processed Data

In [None]:
# Seed the 2020 output frame with an 'index' column holding each tweet's
# row label in data_1.
processed_data_1 = pd.DataFrame({'index': data_1.index})

In [None]:
# Run the full pipeline tweet by tweet; process_tweet translates each tweet,
# so this cell is slow and needs network access.
processed_tweets_1 = data_1['Tweet'].apply(process_tweet)
processed_data_1['Processed Tweets'] = processed_tweets_1

In [None]:
# Render the year-1 processed frame for a visual check.
display(processed_data_1)

Unnamed: 0,index,Processed Tweets
0,0,nice onlin class save money ver trsr nct boyz ...
1,1,onlin class pend activ
2,2,school nasad onlin class hahahaha
3,3,guy hard familiar onlin class haha
4,4,onlin class fuck sick
...,...,...
13127,13127,onlin class studi lesson
13128,13128,blackboard onlin class brownout cut class
13129,13129,peopl angri onlin class week week term
13130,13130,karmi iarmi class trend hate onlin attack cons...


### Year 2 Processed Data

In [None]:
# Seed the 2021 output frame with an 'index' column holding each tweet's
# row label in data_2.
processed_data_2 = pd.DataFrame({'index': data_2.index})

In [None]:
# Run the full pipeline tweet by tweet; process_tweet translates each tweet,
# so this cell is slow and needs network access.
processed_tweets_2 = data_2['Tweet'].apply(process_tweet)
processed_data_2['Processed Tweets'] = processed_tweets_2

In [None]:
# Render the year-2 processed frame for a visual check.
display(processed_data_2)

Unnamed: 0,index,Processed Tweets
0,0,gradual increas close border late decemb covid...
1,1,onlin class shet yawqna hahahahahahahaha lol
2,2,reason fastest ata wala masyadong ganap life o...
3,3,onlin class perform school amp
4,4,nanapansiarvin onlin class hhahhaahahh
...,...,...
1791,1791,love onlin class school hahahahahahah
1792,1792,quarter sem memor sched onlin class
1793,1793,badiday recess onlin class start
1794,1794,drink coffe onlin class fall asleep class


### Year 3 Processed Data

In [None]:
# Seed the 2022 output frame with an 'index' column holding each tweet's
# row label in data_3.
processed_data_3 = pd.DataFrame({'index': data_3.index})

In [None]:
# Run the full pipeline tweet by tweet; process_tweet translates each tweet,
# so this cell is slow and needs network access.
processed_tweets_3 = data_3['Tweet'].apply(process_tweet)
processed_data_3['Processed Tweets'] = processed_tweets_3

In [None]:
# Render the year-3 processed frame for a visual check.
display(processed_data_3)

Unnamed: 0,index,Processed Tweets
0,0,congratul beatric isabel onlin class batch pas...
1,1,congratul maria loreina cruz onlin class batch...
2,2,trial class decemb eager embark french adventu...
3,3,hey hotspot laptop onlin class money onlin ftf...
4,4,sleepi nap onlin class tomorrow
...,...,...
2314,2314,onlin class lazi studi hard
2315,2315,onlin class guy
2316,2316,forgot onlin class
2317,2317,smell onlin class season hope


### Save to csv file

In [None]:
# Write each year's processed tweets to its own CSV in the working directory.
# NOTE(review): the 2021 file is named "2021Mar-..." although its input file is
# '2021Jan-2021Dec_raw-tweets.csv' - confirm the intended name before renaming,
# since other code may read this path.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed column.
yearly_outputs = (
    (processed_data_1, '2020Mar-2020Dec_processed-tweets.csv'),
    (processed_data_2, '2021Mar-2021Dec_processed-tweets.csv'),
    (processed_data_3, '2022Jan-2022Dec_processed-tweets.csv'),
)
for yearly_frame, csv_name in yearly_outputs:
    yearly_frame.to_csv(csv_name)

### Combine all processed dataset 2020 - 2022

In [None]:
# Start from an empty frame; the per-year frames are added in the next cells.
combined_processed_data = pd.DataFrame()

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, with ignore_index=True, renumbers the rows the same way.
combined_processed_data = pd.concat(
    [combined_processed_data, processed_data_1], ignore_index=True
)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, with ignore_index=True, renumbers the rows the same way.
combined_processed_data = pd.concat(
    [combined_processed_data, processed_data_2], ignore_index=True
)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, with ignore_index=True, renumbers the rows the same way.
combined_processed_data = pd.concat(
    [combined_processed_data, processed_data_3], ignore_index=True
)

In [None]:
# Render the combined frame; its 'index' column still holds each tweet's
# row label from its own yearly frame.
display(combined_processed_data)

Unnamed: 0,index,Processed Tweets
0,0,nice onlin class save money ver trsr nct boyz ...
1,1,onlin class pend activ
2,2,school nasad onlin class hahahaha
3,3,guy hard familiar onlin class haha
4,4,onlin class fuck sick
...,...,...
17242,2314,onlin class lazi studi hard
17243,2315,onlin class guy
17244,2316,forgot onlin class
17245,2317,smell onlin class season hope


In [None]:
# Drop duplicate and empty processed tweets from the combined frame, then
# preview the result.
clean_data = remove_dup_empty_rows(combined_processed_data)
display(clean_data)

Unnamed: 0,index,Processed Tweets
0,0,nice onlin class save money ver trsr nct boyz ...
1,1,onlin class pend activ
2,2,school nasad onlin class hahahaha
3,3,guy hard familiar onlin class haha
4,4,onlin class fuck sick
...,...,...
13514,2244,clr playlist onlin class
13515,2245,onlin class tomorrow bye socm feel send flower...
13516,2265,tbh prefer onlin class situat safe jusko schoo...
13517,2266,putek troubl studi return onlin class rant cou...


### Check for Duplicate and Empty Rows

In [None]:
# Count missing values per column.
clean_data.isnull().sum()

index               0
Processed Tweets    0
dtype: int64

In [None]:
def dup_rows_index(df):
    """Print the index labels of fully duplicated rows and return those rows.

    A row counts as duplicated when all of its column values repeat an
    earlier row (the default behaviour of ``DataFrame.duplicated``).

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame containing only the duplicated rows of ``df``.
    """
    duplicated_rows = df.loc[df.duplicated()]
    print('Duplicated index loc:', duplicated_rows.index.tolist())
    return duplicated_rows

In [None]:
# Print and show any fully duplicated rows left in the cleaned data.
dup_rows_index(clean_data)

Duplicated index loc: []


Unnamed: 0,index,Processed Tweets


### Save combined cleaned processed dataset to csv file

In [None]:
# Write the cleaned, combined data to a CSV in the working directory.
# NOTE(review): without index=False the row index is written as an extra unnamed column.
clean_data.to_csv('combined_processed-tweets(translated).csv')