In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer  # Pastikan ini diimpor

# Unduh stopwords jika belum ada
nltk.download('stopwords')

# Baca file CSV
df = pd.read_csv('data/google-play-rev-gen-2.csv')

# Cek kolom yang ada dalam dataset
print(df.columns)

# Tampilkan beberapa baris pertama data untuk memastikan data ter-load dengan benar
print(df.head(10))

# Inisialisasi stopwords dan stemmer
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Preprocessing function
def preprocess_text(text):
    # Menghapus karakter khusus, angka, dan tanda baca
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenisasi: Pisahkan teks menjadi list kata
    tokens = text.lower().split()
    
    # Menghapus stopwords dan stemming
    processed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    # Gabungkan kembali tokens yang telah diproses menjadi teks yang bersih
    return ' '.join(processed_tokens)

# Pastikan nama kolom 'snippet' sesuai dengan data yang ada
# Terapkan preprocessing pada kolom 'snippet' yang berisi ulasan
df['cleaned_snippet'] = df['snippet'].apply(preprocess_text)

# Tampilkan contoh hasil setelah preprocessing
print(df[['snippet', 'cleaned_snippet']].head())

# Inisialisasi TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stop_words)

# Fit dan transform data ulasan yang telah diproses menggunakan TF-IDF
X_tfidf = vectorizer.fit_transform(df['cleaned_snippet'])  # Gunakan 'cleaned_snippet' yang sudah diproses

# Ambil fitur kata kunci dari hasil TF-IDF
tfidf_features = vectorizer.get_feature_names_out()

# Konversi hasil TF-IDF menjadi DataFrame agar lebih mudah dibaca
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf_features)

# Menampilkan hasil TF-IDF
print(tfidf_df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rebecca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['id', 'title', 'avatar', 'rating', 'snippet', 'likes', 'date',
       'iso_date', 'response'],
      dtype='object')
                                     id                       title  \
0  e5384431-56f9-43fa-a32a-53296afc7f66                    Seraphim   
1  6a73081f-3490-47ba-89fa-83744cb20940                  TWOSTORE !   
2  3a3d4c90-0b6e-45dc-b1e6-014659055bbf                         A G   
3  99015538-1d26-4bd9-a02f-37bc2a361d1a                       Astra   
4  541b3b4d-97f6-42e0-9c68-059a63e1e67f             Angela Williams   
5  ad484b6a-1b9c-42ab-9cee-9df6e28f12d4                     Valerie   
6  fb25cdf6-40d8-44e8-b1b9-d439ebd88565  Daniel “Chotara” Ricciardi   
7  36e821d3-9441-4eaa-94a7-9c7b9b7463b5                         Amy   
8  4b1e6dcb-d251-450e-9be6-358b4bb8e9d6                 Feitan Desy   
9  27525772-1c0a-40e4-8321-4c5f0a0f7c64            Olivia Staringer   

                                              avatar  rating  \
0  https://play-lh.googleuserc

InvalidParameterError: The 'stop_words' parameter of TfidfVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'to', 'doing', "weren't", 'ours', 'between', 'once', 't', 'under', 'until', 'couldn', 'most', 'having', 'there', 'shan', 'theirs', 'which', 'now', 'my', 'more', 'him', "hasn't", 'very', 'all', 'further', 'because', 'will', 'these', "needn't", 's', 'is', 'has', 'be', 'he', 'won', 'during', 'above', 'at', 'have', 'each', 'with', 'can', 'hadn', 'up', 'those', 'do', 'ain', 'been', 'needn', "you're", 'if', 'through', 'ma', 'into', 'just', "it's", "isn't", 'her', 'other', 'no', 'did', 'below', 'wasn', 'such', "hadn't", 'them', 'does', 'll', "wasn't", "that'll", 'who', 'nor', 're', 'their', 'yourself', 'had', 've', 'of', 'again', 'after', 'haven', "shouldn't", 'mightn', "couldn't", 'itself', 'why', 'were', 'or', 'me', 'how', 'don', "should've", 'it', 'being', 'this', 'where', 'aren', 'for', 'few', 'too', 'a', 'out', 'but', 'an', 'same', 'should', 'in', 'its', "didn't", 'doesn', 'over', 'then', "mightn't", "she's", 'mustn', 'both', "haven't", 'your', 'about', 'she', "wouldn't", 'when', 'while', 'they', 'weren', 'down', 'not', 'hasn', 'here', 'we', 'that', "you'll", 'by', 'myself', 'whom', 'ourselves', 'themselves', 'the', 'what', "doesn't", 'yours', 'yourselves', 'before', 'didn', "mustn't", 'are', 'his', 'himself', 'so', 'from', "you've", 'm', 'against', 'i', 'was', 'am', 'shouldn', 'hers', 'some', "shan't", 'and', 'you', 'd', 'o', 'our', "aren't", "won't", 'isn', 'as', 'only', 'any', 'herself', 'wouldn', "you'd", "don't", 'off', 'y', 'than', 'on', 'own'} instead.