In [1]:
import pandas as pd
import numpy as np

#* open file csv
df = pd.read_csv('TRANSLATED-covid-sentiment.csv')

#* ilangin kolom2 berikut
columns = ['mentions', 'conversation_id', 'user_id', 'hashtags']
df.drop(columns, inplace=True, axis=1)

In [2]:
# Just a list of imports :D

import re
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
import dask.dataframe as dd
from dask.multiprocessing import get
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# DATAPREPROCESSING: Data preprocessing and cleaning the tweets dataframe into a tokenized version :D

# dd is dask dataframe which utilized multiprocessing, cuz we have the NEED FOR SPEED :O
ddf = dd.from_pandas(df, npartitions=30)

def preProcess(row):
    tweet = row['tweet'].lower() # convert text to lower-case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
    tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
    
    listStopword =  set(stopwords.words('indonesian'))
    
    removed = []
    for t in tweet:
        if t not in listStopword:
            removed.append(t)

    tweet = stemmer.stem(tweet) #remove kata2 imbuhan
    tweet = word_tokenize(tweet) #transform the string to an array form
    print(tweet)
    return tweet

res = ddf.map_partitions(lambda df: df.apply(preProcess,axis=1)).compute(scheduler='processes')
df["tokenized"] = res
df.to_csv('TOKENIZED-covid-sentiment.csv')

In [None]:
# Calculate the amount of words in the whole dataframe :O

import ast
import dask.dataframe as dd
import nltk 
import itertools

df = pd.read_csv('TOKENIZED-covid-sentiment.csv')

# dd is dask dataframe which utilized multiprocessing, cuz we wanna know, HOW THEY LIVE IN TOKYO :O
ddf = dd.from_pandas(df, npartitions=30)

def removeQuotes(row):
    def perfectEval(anonstring):
        try:
            ev = ast.literal_eval(anonstring)
            return ev
        except ValueError:
            corrected = "\'" + anonstring + "\'"
            ev = ast.literal_eval(corrected)
            return ev
    
    x = perfectEval(row["tokenized"])
    type(x)
    return x

#NUMERO UNO: because the array is saved as a string, we need to make sure THERE ARE NO STRINGS ATTACHED >:O
res = ddf.map_partitions(lambda df: df.apply(removeQuotes, axis=1)).compute(scheduler='processes')
df["tokenized"] = res

all_words = []

#NUMERO DOS: my method is to list every word in an array above, so we can count how many is similar from that giant array
for index, row in df.iterrows():
    for val in row["tokenized"]:
        x=''.join(c[0] for c in itertools.groupby(val))
        all_words.append(x)

#NUMERO TRES: we list the wordlists so we get LIT >:D
wordlist = nltk.FreqDist(all_words)
word_features = wordlist.keys()

data = [] 

#NUMERO QUATRO: we count them words baby <3
for word in word_features:
    data.append([str(word), list(all_words).count(word)])

countdf = pd.DataFrame(data, columns = ['Word', 'Count']) 

countdf.to_csv('WORD-COUNT-covid-sentiment.csv')

In [None]:
# This is to simply sort the previous results :D

pd.set_option('display.max_rows', 50)
df = pd.read_csv('WORD-COUNT-covid-sentiment.csv')
df.sort_values(by="Count", ascending=False).to_csv('WORD-COUNT-SORTED-covid-sentiment.csv')

In [3]:
df = pd.read_csv('WORD-COUNT-SORTED-covid-sentiment.csv')
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Word,Count
0,11,11,covid,41098
1,0,0,perintah,40876
2,12,12,19,33379
3,93,93,indonesia,7473
4,27,27,tangan,6872
...,...,...,...,...
32093,17076,17076,aisokong,1
32094,17075,17075,bisar,1
32095,17074,17074,tuhdipakai,1
32096,17073,17073,jewajiban,1


In [7]:
# DATAPREPROCESSING: The positive & negative vocabulary are actin' dirty on me yo. So we need to clean em up boi :O !

posdf = pd.read_csv('positive.txt', names=["text"])
negdf = pd.read_csv('negative.txt', names=["text"])

posddf = dd.from_pandas(posdf, npartitions=30)
negddf = dd.from_pandas(negdf, npartitions=30)

def preProcess(row):
    return stemmer.stem(row["text"])

posres = posddf.map_partitions(lambda posdf: posdf.apply(preProcess,axis=1)).compute(scheduler='processes')
negres = negddf.map_partitions(lambda negdf: negdf.apply(preProcess,axis=1)).compute(scheduler='processes')

posdf.drop(["text"], inplace=True, axis=1)
posdf["cleaned"] = posres
posdf.drop_duplicates(subset ="cleaned",keep = False, inplace = True) 

negdf.drop(["text"], inplace=True, axis=1)
negdf["cleaned"] = negres
negdf.drop_duplicates(subset ="cleaned",keep = False, inplace = True) 

posdf.to_csv('positive-cleaned.csv')
negdf.to_csv('negative-cleaned.csv')

In [4]:
# FINALE: we find the sentiment simply by comparing the total amount of bad words and good words

posdf = pd.read_csv('positive-cleaned.csv')
negdf = pd.read_csv('negative-cleaned.csv')
df = pd.read_csv('WORD-COUNT-SORTED-covid-sentiment.csv')

posarray = posdf["cleaned"].to_numpy()
poscondition = df["Word"].isin(posarray)
postotal = np.where(poscondition, df['Count'],0).sum()
print("postotal: ", postotal)

negarray = negdf["cleaned"].to_numpy()
negcondition = df["Word"].isin(negarray)
negtotal = np.where(negcondition, df['Count'],0).sum()
print("negtotal: ", negtotal)

if(negtotal>postotal):
    print("overall sentiment is negative by", int((postotal-negtotal)*100/min(postotal, negtotal)),"%" )
else: 
    print("overall sentiment is positive by", int((postotal-negtotal)*100/min(postotal, negtotal)),"%" )

postotal:  63260
negtotal:  88512
overall sentiment is negative by -39 %
