In [None]:
import ast
import math
from collections import Counter

import pandas as pd

In [None]:
# Load the dataset from a CSV file; the relative path is resolved against the current working directory
dataset = pd.read_csv('synonym_replaced_dataset.csv')

In [None]:
# Convert the 'Tweet' column from its string representation to Python lists.
# ast.literal_eval only accepts Python literals, so text coming from the CSV
# cannot execute arbitrary code the way the built-in eval() would.
# Values that are already lists are passed through unchanged so that
# re-running this cell on the already-parsed column does not fail.
dataset['Tweet'] = dataset['Tweet'].apply(
    lambda value: value if isinstance(value, list) else ast.literal_eval(value)
)

In [None]:
# Step I: compute the Term Frequency (TF)
def compute_tf(dataset):
    """Count how often each token occurs within each tweet.

    Args:
        dataset: Mapping-like object (e.g. a DataFrame) whose 'Tweet' entry
            iterates over the tweets, each an iterable of hashable tokens.

    Returns:
        A list with one Counter per tweet, in iteration order, mapping
        token -> raw (unnormalised) occurrence count in that tweet.
    """
    return [Counter(tokens) for tokens in dataset['Tweet']]

In [None]:
# Step II: compute the Document Frequency (DF)
def compute_df(dataset):
    """Count in how many tweets each token appears.

    Args:
        dataset: Mapping-like object (e.g. a DataFrame) whose 'Tweet' entry
            iterates over the tweets, each an iterable of hashable tokens.

    Returns:
        A Counter mapping token -> number of tweets containing that token.
    """
    # set(tokens) collapses repeats, so a tweet contributes at most 1 per token.
    return Counter(
        token
        for tokens in dataset['Tweet']
        for token in set(tokens)
    )

In [None]:
# Step III: compute the Inverse Document Frequency (IDF)
def compute_idf(df, n):
    """Compute the IDF weight log(n / (df + 1)) for every term.

    Args:
        df: Mapping of term -> document frequency.
        n: Total number of documents.

    Returns:
        A dict mapping term -> natural log of n / (document frequency + 1).

    Note:
        Because of the +1 in the denominator, a term found in n - 1 documents
        gets weight 0 and a term found in all n documents gets a negative
        weight.
    """
    return {term: math.log(n / (doc_count + 1)) for term, doc_count in df.items()}

In [None]:
# Step IV: compute TF-IDF
def compute_tfidf(tf_list, idf):
    """Weight every per-tweet term count by the term's IDF.

    Args:
        tf_list: List of mappings (one per tweet) of term -> term frequency.
        idf: Mapping of term -> IDF weight; must contain every term that
            occurs in tf_list, otherwise a KeyError is raised.

    Returns:
        A list of dicts (one per tweet, same order as tf_list) mapping
        term -> term frequency * IDF.
    """
    return [
        {term: count * idf[term] for term, count in tf.items()}
        for tf in tf_list
    ]

In [None]:
# Run the pipeline: TF and DF are derived directly from the tweets, IDF needs
# DF plus the number of documents, and TF-IDF combines TF with IDF.
tf_list = compute_tf(dataset)
df = compute_df(dataset)
n = len(dataset)  # number of documents (rows) in the dataset
idf = compute_idf(df=df, n=n)
tfidf_list = compute_tfidf(tf_list=tf_list, idf=idf)

In [None]:
# Build one DataFrame per intermediate result.
# DF and IDF are term -> value mappings, so each term becomes a row label.
df_df = pd.DataFrame.from_dict(df, orient='index', columns=['DF'])
idf_df = pd.DataFrame.from_dict(idf, orient='index', columns=['IDF'])
# TF and TF-IDF are lists with one mapping per tweet: one row per tweet,
# one column per term.
tf_df = pd.DataFrame(tf_list)
tfidf_df = pd.DataFrame(tfidf_list)

# Save the results to CSV files (the term index is kept for DF/IDF only).
df_df.to_csv('df_results.csv')
idf_df.to_csv('idf_results.csv')
tf_df.to_csv('tf_results.csv', index=False)
tfidf_df.to_csv('tfidf_results.csv', index=False)

# Display the TF-IDF weights of every tweet (tweets are numbered from 1).
for i, tfidf in enumerate(tfidf_list):
    print(f"TF-IDF untuk Tweet {i+1}: {tfidf}")


TF-IDF untuk Tweet 1: {'kampus': 1.0956227498197437, 'keren': 2.735366011749173, 'banget': 1.2929821839782387, 'fasilitas': 2.6330871626287546, 'lengkap': 2.7508701982851385}
TF-IDF untuk Tweet 2: {'duh': 3.0596056799347515, 'tugas': 0.818443399567112, 'kuliah': 2.3045830956567186, 'numpuk': 3.172083663361442, 'banget': 1.2929821839782387, 'pusing': 3.125563647726549}
TF-IDF untuk Tweet 3: {'tugas': 0.818443399567112, 'kerja': 2.958509563063383, 'minggu': 2.1057322369115536, 'seru': 1.9824995964876055}
TF-IDF untuk Tweet 4: {'kanan': 2.3045830956567186, 'dosen': 1.4630159099785, 'asih': 5.523458920524919, 'nilai': 3.865230843921387, 'sulit': 4.9638431325894965, 'mahasiswa': 3.6908774567766094, 'kalo': 3.865230843921387, 'cari': 4.712528704308591, 'kuliah': 2.3045830956567186}
TF-IDF untuk Tweet 5: {'tau': 6.2166061010848646, 'anjyr': 6.2166061010848646, 'muak': 6.2166061010848646, 'gue': 7.730461687842774, 'ajar': 3.7742590657156603, 'bisnis': 4.712528704308591, 'internasional': 16.570