In [12]:
import os
import pickle

import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer

# Load preprocessed training and testing data (semicolon-separated CSV files)
df_train = pd.read_csv('../data/processed/preprocessed_training.csv', delimiter=";")
df_test = pd.read_csv('../data/processed/preprocessed_testing.csv', delimiter=";")

# pandas reads empty CSV fields back as NaN, and TfidfVectorizer raises on NaN
# documents, so treat missing texts as empty strings instead of crashing.
train_docs = df_train['stemming'].fillna('')
test_docs = df_test['stemming'].fillna('')

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training data only, then reuse the fitted vocabulary and IDF
# weights to transform the testing data
X_train_tfidf = tfidf_vectorizer.fit_transform(train_docs)
X_test_tfidf = tfidf_vectorizer.transform(test_docs)

# Make sure the output directory exists before writing the pickles
os.makedirs('../data/tfidf', exist_ok=True)

# Save TF-IDF Vectorizer
with open('../data/tfidf/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

# Save transformed training data
with open('../data/tfidf/tfidf_training.pkl', 'wb') as f:
    pickle.dump(X_train_tfidf, f)

# Save transformed testing data
with open('../data/tfidf/tfidf_testing.pkl', 'wb') as f:
    pickle.dump(X_test_tfidf, f)

def display_nonzero_tfidf(tfidf_matrix, feature_names, title, num_samples=5):
    """Show the TF-IDF weights of the first rows, hiding all-zero columns.

    Args:
        tfidf_matrix: Matrix supporting row slicing and ``toarray()``
            (e.g. the sparse output of ``TfidfVectorizer.transform``).
        feature_names: Column labels, one per matrix column.
        title: Heading printed above the table.
        num_samples: Number of leading rows to show.
    """
    # Only the sampled rows are converted to a dense array.
    df_tfidf = pd.DataFrame(tfidf_matrix[:num_samples].toarray(), columns=feature_names)
    # Keep only the columns that have at least one non-zero value in the sample.
    df_nonzero = df_tfidf.loc[:, (df_tfidf != 0).any(axis=0)]
    # The title used to be accepted but never shown, leaving the tables unlabeled.
    print(title)
    display(df_nonzero.head(num_samples))

# Preview the first rows of each TF-IDF matrix; both share the fitted vocabulary
feature_names = tfidf_vectorizer.get_feature_names_out()
for matrix, heading in (
    (X_train_tfidf, "📌 TF-IDF Features - Data Training"),
    (X_test_tfidf, "📌 TF-IDF Features - Data Testing"),
):
    display_nonzero_tfidf(matrix, feature_names, heading)

Unnamed: 0,abar,abbas,abdul,abdullah,abdurrahman,abu,ada,agama,ahli,aisyah,...,uzza,wahai,wahyu,wajib,waroqoh,yahudi,yakin,yunus,yusuf,zuhri
0,0.0,0.0,0.0,0.0,0.0,0.026585,0.031915,0.024356,0.055778,0.0,...,0.0,0.015234,0.0,0.0,0.0,0.077208,0.0,0.026765,0.0,0.026833
1,0.031359,0.0,0.038171,0.051611,0.037262,0.039513,0.023718,0.0362,0.0,0.0,...,0.056538,0.067927,0.194664,0.0,0.401501,0.0,0.045487,0.039781,0.040853,0.079764
2,0.0,0.107059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.058138,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,...,0.0,0.0,0.146626,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,aku,allah,amru,anjing,baligh,benar,beri,bin,darah,diri,...,saibah,sembah,suami,sungguh,tarik,tato,tengah,tiga,tsulutsul,wanita
0,0.0,0.19002,0.0,0.0,0.0,0.342923,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.237092,0.0,0.0,0.0,0.0,0.0,0.0
1,0.117692,0.0,0.302729,0.0,0.0,0.0,0.0,0.17087,0.0,0.0,...,0.531444,0.320811,0.0,0.0,0.366108,0.0,0.0,0.0,0.0,0.0
2,0.097235,0.0,0.0,0.0,0.380565,0.0,0.0,0.0,0.0,0.163941,...,0.0,0.0,0.0,0.135893,0.0,0.0,0.0,0.189231,0.454393,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.559866,0.0,0.0,0.0,0.253037,0.0,0.0,0.207127
4,0.0,0.0,0.0,0.23824,0.0,0.0,0.115818,0.0,0.207469,0.0,...,0.0,0.0,0.0,0.0,0.0,0.584541,0.0,0.0,0.0,0.0
