In [1]:
import csv
import re
import unicodedata
from collections import defaultdict

In [2]:
# Load data
def read_csv(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        return list(reader)

data = read_csv('gabung_all.csv')

In [3]:
# Check if 'text' is in data columns
if 'text' in data[0]:
    # Extract 'text' column from data
    kolom_case_folding = [row['text'] for row in data]

    # Function to separate hashtags
    def pisahkan_hashtag(kalimat):
        pola = r'#[A-Za-z0-9_]+'
        hasil = re.findall(pola, kalimat)
        for hashtag in hasil:
            kata_terpisah = re.sub(r'([a-z])([A-Z])', r'\1 \2', hashtag)
            kalimat = kalimat.replace(hashtag, kata_terpisah)
        return kalimat

    # Separate hashtags in each sentence in 'text' column
    for row in data:
        row['pisah_hashtag&kata'] = pisahkan_hashtag(row['text'])

else:
    print("Kolom 'text' tidak ada dalam dataframe.")

In [4]:
# Save results to CSV
def write_csv(file_path, data):
    with open(file_path, 'w', encoding='utf-8', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

write_csv("no_library/handling_hashtag.csv", data)

In [5]:
# Function to remove non-ASCII characters
def remove_non_ascii(text):
    return ''.join(c for c in unicodedata.normalize('NFKD', text) if unicodedata.category(c) != 'Mn')

# Function to preprocess text
def preprocess_text(text):
    text = remove_non_ascii(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    kata_kata = text.split()
    kata_kata_tanpa_tanda_baca = [re.sub(r'[^\w\s-]', '', kata) for kata in kata_kata]
    kalimat_tanpa_tanda_baca = ' '.join(kata_kata_tanpa_tanda_baca)
    kalimat_final = re.sub(r'(?<=[^\w\s-])', ' ', kalimat_tanpa_tanda_baca)
    kalimat_tanpa_spasi_ganda = re.sub(r'\s+', ' ', kalimat_final)
    kalimat_tanpa_tanda_baca_dan_digit = re.sub(r'[^\w\s-]', '', kalimat_tanpa_spasi_ganda)
    kalimat_tanpa_digit = re.sub(r'\d+', '', kalimat_tanpa_tanda_baca_dan_digit)
    kalimat_final = kalimat_tanpa_digit.lower()
    kalimat_final = kalimat_final.strip()
    kalimat_final = re.sub(r"\b[a-zA-Z]\b", "", kalimat_final)
    return kalimat_final

# Preprocess text column
for row in data:
    row['text_cleaning'] = preprocess_text(row['pisah_hashtag&kata'])

In [6]:
# Normalisasi teks menggunakan kamus slang dari file
filename = "Final_1.txt"

with open(filename, "r", encoding='utf-8') as file:
    contents = file.read()
    slangs_dict = eval(contents)

def normalisasi_text(text, slangs):
    for slang, normal in slangs.items():
        pattern = r"\b{}\b".format(re.escape(slang))
        text = re.sub(pattern, normal, text)
    return text

# Apply normalization to the text_cleaning column
for row in data:
    row['text_normalisasi'] = normalisasi_text(row['text_cleaning'], slangs_dict)

In [7]:
# Tokenize words
def word_tokenize(text):
    return text.split()

for row in data:
    row['tweet_tokens'] = word_tokenize(row['text_normalisasi'])
    
print('Tokenizing Result : \n') 
for row in data[:5]:
    print(row['tweet_tokens'])

Tokenizing Result : 

['lho', 'dia', 'yang', 'minta', 'ibu', 'kota', 'negara', 'kok', 'otoritas', 'di', 'beban', 'iya', 'modarlah']
['belum', 'jadi', 'kok', 'sudah', 'mau', 'memindahkan', 'asn', 'pak', 'yakin', 'kalau', 'ibu', 'kota', 'negara', 'sudah', 'jadi', 'bapak', 'masih', 'jadi', 'menteri']
['sampai', 'di', 'ibu', 'kota', 'negara', 'asn', 'nya', 'mengeluhoala', 'disini', 'baru', 'seminggu', 'kerja', 'kena', 'demam', 'berdarah', 'dengue', 'malaria', 'tidak', 'biasa', 'dengar', 'suara', 'monyet', 'ular', 'dan', 'kadang', 'anak', 'tidak', 'setia', 'didepan', 'rumahsabar', 'iya', 'asn']
['emang', 'ibu', 'kota', 'negara', 'jadi', 'kapan', 'kayak', 'dia', 'masih', 'menjabat', 'saja', 'boro-boro', 'masih', 'ada']
['tidak', 'semudah', 'itu', 'asn', 'wajib', 'pindah', 'ke', 'ibu', 'kota', 'negara', 'tahun', 'berapa', 'investor', 'nya', 'juga', 'kabur', 'belum', 'jaminan', 'kantor', 'siap', 'sombong', 'banget']


In [8]:
write_csv("no_library/Tweet_all.csv", data)

In [9]:
# Load lexicon
def load_lexicon(file_path):
    lexicon = dict()
    with open(file_path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # Skip header row
        for row in reader:
            lexicon[row[0]] = int(row[1])
    return lexicon

lexicon_positive = load_lexicon('lexicon_positive.csv')
lexicon_negative = load_lexicon('lexicon_negative.csv')

In [10]:
# Function to determine sentiment polarity of tweets
def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    positive_found = []
    negative_found = []
    for word in text:
        if word in lexicon_positive:
            score += lexicon_positive[word]
            positive_found.append((word, lexicon_positive[word]))
        if word in lexicon_negative:
            score += lexicon_negative[word]
            negative_found.append((word, lexicon_negative[word]))
    polarity = ''
    if score > 0:
        polarity = 'positive'
    elif score < 0:
        polarity = 'negative'
    else:
        polarity = 'neutral'
    return score, polarity, positive_found, negative_found

# Determine sentiment polarity of tweets
results = [sentiment_analysis_lexicon_indonesia(row['tweet_tokens']) for row in data]
for i, result in enumerate(results):
    data[i]['polarity_score'] = result[0]
    data[i]['polarity'] = result[1]
    data[i]['positive_words'] = result[2]
    data[i]['negative_words'] = result[3]

In [11]:
data

[{'text': 'Lho!!!! Dia Yg Minta IKN , Kok Otorita. Di Bebani!! Ya Modarlah?',
  'label': 'negative',
  'pisah_hashtag&kata': 'Lho!!!! Dia Yg Minta IKN , Kok Otorita. Di Bebani!! Ya Modarlah?',
  'text_cleaning': 'lho dia yg minta ikn kok otorita di bebani ya modarlah',
  'text_normalisasi': 'lho dia yang minta ibu kota negara kok otoritas di beban iya modarlah',
  'tweet_tokens': ['lho',
   'dia',
   'yang',
   'minta',
   'ibu',
   'kota',
   'negara',
   'kok',
   'otoritas',
   'di',
   'beban',
   'iya',
   'modarlah'],
  'polarity_score': -15,
  'polarity': 'negative',
  'positive_words': [('minta', 2)],
  'negative_words': [('dia', -3),
   ('yang', -5),
   ('minta', -3),
   ('kota', -1),
   ('beban', -5)]},
 {'text': 'Belum jadi kok sudah mau memindahkan ASN Pak? Yakin kalau IKN sdh jadi bapak masih jadi Menteri?',
  'label': 'negative',
  'pisah_hashtag&kata': 'Belum jadi kok sudah mau memindahkan ASN Pak? Yakin kalau IKN sdh jadi bapak masih jadi Menteri?',
  'text_cleaning': '

In [12]:
write_csv('no_library/Hasil_tweets_data.csv', data)

In [13]:
# Print polarity counts
polarity_counts = defaultdict(int)
for row in data:
    polarity_counts[row['polarity']] += 1

print(polarity_counts)

defaultdict(<class 'int'>, {'negative': 10633, 'neutral': 402, 'positive': 1485})


In [14]:
# Visualization (without matplotlib)
def simple_pie_chart(counts, title):
    total = sum(counts.values())
    print(f"{title}\n")
    for label, count in counts.items():
        print(f"{label}: {count / total:.1%} ({count})")

simple_pie_chart(polarity_counts, 'Sentiment Polarity on Tweets Data')

Sentiment Polarity on Tweets Data

negative: 84.9% (10633)
neutral: 3.2% (402)
positive: 11.9% (1485)


In [15]:
# Save positive and negative tweets
positive_tweets = [row for row in data if row['polarity'] == 'positive']
positive_tweets.sort(key=lambda x: x['polarity_score'], reverse=True)

In [16]:
negative_tweets = [row for row in data if row['polarity'] == 'negative']
negative_tweets.sort(key=lambda x: x['polarity_score'])

In [17]:
# Visualize word cloud (without wordcloud library)
from collections import Counter

def generate_word_cloud(words):
    word_counts = Counter(words)
    for word, count in word_counts.most_common(20):
        print(f"{word}: {count}")

all_words = [word for row in data for word in row['tweet_tokens']]
generate_word_cloud(all_words)

def words_with_sentiment(text):
    positive_words = []
    negative_words = []
    for word in text:
        if word in lexicon_positive:
            positive_words.append(word)
        if word in lexicon_negative:
            negative_words.append(word)
    return positive_words, negative_words

sentiment_words = [words_with_sentiment(row['tweet_tokens']) for row in data]
positive_words = [word for sublist in sentiment_words for word in sublist[0]]
negative_words = [word for sublist in sentiment_words for word in sublist[1]]


negara: 14264
kota: 13950
ibu: 13805
yang: 5442
tidak: 4587
di: 4536
dan: 3480
ada: 2044
ini: 1881
itu: 1860
saja: 1757
jadi: 1722
dari: 1698
kalau: 1657
ke: 1513
sudah: 1502
untuk: 1489
rakyat: 1474
dengan: 1396
iya: 1282


In [18]:
print("Positive Words Word Cloud")
generate_word_cloud(positive_words)

print("Negative Words Word Cloud")
generate_word_cloud(negative_words)

Positive Words Word Cloud
ada: 2044
jadi: 1722
sudah: 1502
mau: 1275
presiden: 928
buat: 892
goreng: 890
sama: 811
lebih: 780
pindah: 671
banyak: 668
dalam: 496
sampai: 433
baik: 421
tahu: 417
bawa: 378
para: 362
punya: 358
besar: 356
dana: 350
Negative Words Word Cloud
kota: 13950
yang: 5442
tidak: 4587
ada: 2044
itu: 1860
saja: 1757
jadi: 1722
dari: 1698
kalau: 1657
sudah: 1502
mau: 1275
apa: 1120
lagi: 1075
goreng: 890
saya: 873
karena: 842
bukan: 708
pindah: 671
banyak: 668
air: 617
