<a href="https://colab.research.google.com/github/Luseat/Sentimen-aplikasi-gojek/blob/main/Analisis_Sentimen_apk_gojek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
import nltk

nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud, STOPWORDS
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:

# Load the Gojek app review dataset (collected from the Play Store) from its raw GitHub URL.
url = "https://raw.githubusercontent.com/Luseat/Sentimen-aplikasi-gojek/refs/heads/main/GojekAppReviewV4.0.0-V4.9.3_Cleaned.csv"
df = pd.read_csv(url)
df

Unnamed: 0,userName,content,score,at,appVersion
0,Yuga Edit,akun gopay saya di blok,1,1/21/2022 10:52,4.9.3
1,ff burik,Lambat sekali sekarang ini bosssku apk gojek g...,3,11/30/2021 15:40,4.9.3
2,Anisa Suci Rahmayuliani,Kenapa sih dari kemarin sy buka aplikasi gojek...,4,11/29/2021 22:58,4.9.3
3,naoki yakuza,Baru download gojek dan hape baru trus ditop u...,1,9/3/2022 15:21,4.9.3
4,Trio Sugianto,Mantap,5,1/15/2022 10:05,4.9.3
...,...,...,...,...,...
224997,Sad Gamer,Gofood Biaya lain2ya gak ngotak mending hujan2...,1,2/15/2023 9:37,4.0.0
224998,fadhil fadil,Yok lah,5,12/11/2021 12:28,4.0.0
224999,g sugiarto,Sempurna,5,3/14/2022 1:45,4.0.0
225000,J i H A D 'B E,GOJEK LAMA LAMA GAK JELAS LAGI PESEN MASA MAP...,1,12/24/2021 8:48,4.0.0


In [3]:

# Keep only the review text (content) and the star rating (score).
df = df[['content', 'score']]
df

Unnamed: 0,content,score
0,akun gopay saya di blok,1
1,Lambat sekali sekarang ini bosssku apk gojek g...,3
2,Kenapa sih dari kemarin sy buka aplikasi gojek...,4
3,Baru download gojek dan hape baru trus ditop u...,1
4,Mantap,5
...,...,...
224997,Gofood Biaya lain2ya gak ngotak mending hujan2...,1
224998,Yok lah,5
224999,Sempurna,5
225000,GOJEK LAMA LAMA GAK JELAS LAGI PESEN MASA MAP...,1


In [4]:
df.shape # (rows, columns) of the raw selection; the 2 columns are content and score

(225002, 2)

In [5]:
# Drop rows whose review text repeats an earlier row, comparing on the content column only.
df = df.drop_duplicates(subset=['content'])

In [6]:
df = df.dropna() # drop rows that contain a missing value

In [7]:

# Shape after removing duplicate and missing reviews.
df.shape

(138812, 2)

In [8]:

def clean_gojek_data(text):
  """Strip social-media artefacts and punctuation from one review.

  Removes @mentions, #hashtags, standalone "RT" retweet markers and links,
  then drops every character that is not an ASCII letter, digit or
  whitespace, and finally collapses whitespace runs to a single space.

  Args:
    text: the raw review as a str.

  Returns:
    The cleaned review with no leading/trailing whitespace. Case is preserved.
  """
  text = re.sub(r'@[A-Za-z0-9_]+', '', text) # remove mentions
  text = re.sub(r'#\w+', '', text) # remove hashtags
  # \b keeps the match to a standalone "RT" marker; without it the tail of
  # words such as "SMART " was eaten together with the following space.
  text = re.sub(r'\bRT\s+', '', text) # remove retweet markers
  text = re.sub(r'http\S+', '', text) # remove links

  # Whitespace is kept here on purpose: deleting newlines/tabs at this step
  # glued neighbouring words together before the collapse below could turn
  # them into a single space.
  text = re.sub(r'[^A-Za-z0-9\s]', '', text) # remove special characters
  text = re.sub(r'\s+', ' ', text).strip() # collapse repeated whitespace

  return text

# Run the cleaner over every review text.
df['content'] = df['content'].apply(clean_gojek_data)

In [9]:

# Lowercase all review text.
df['content'] = df['content'].str.lower()

In [10]:
# Preview the first rows after cleaning and lowercasing.
df.head()

Unnamed: 0,content,score
0,akun gopay saya di blok,1
1,lambat sekali sekarang ini bosssku apk gojek g...,3
2,kenapa sih dari kemarin sy buka aplikasi gojek...,4
3,baru download gojek dan hape baru trus ditop u...,1
4,mantap,5


In [11]:
# Set the ratings aside as their own Series.
# NOTE(review): this is taken from df as it is at this point; rows removed from df in
# later cells are still present here, so any later re-join must align on the row index.
score = df['score']
score

Unnamed: 0,score
0,1
1,3
2,4
3,1
4,5
...,...
224989,5
224991,1
224997,1
224998,5


In [12]:
def filter_tokens_by_length(dataframe, column, min_words, max_words):
  """Keep the rows whose text has between min_words and max_words words.

  Words are counted by splitting the string form of `column` on whitespace;
  both bounds are inclusive.

  Args:
    dataframe: the DataFrame to filter.
    column: name of the text column to measure.
    min_words: smallest word count that is kept.
    max_words: largest word count that is kept.

  Returns:
    The matching rows of `dataframe`, with their original index labels.
  """
  lengths = dataframe[column].astype(str).str.split().str.len()
  return dataframe[lengths.between(min_words, max_words)]


min_words = 3 # minimum of 3 words; shorter reviews are dropped
max_words = 50 # maximum of 50 words; longer reviews are dropped
df = filter_tokens_by_length(df, 'content', min_words, max_words)

In [13]:

# Normalisation table: informal spelling -> standard word.
# Keys and values are written with a space on each side; ' wkwk ' maps to blanks (i.e. it is removed).
norm = {' gk ' : ' tidak ', ' profisional ' : ' profesional ', ' skrng ' : ' sekarang ',' uwang ' : ' uang ', ' tiktokan ' : ' tiktok ', ' yg ' : ' yang ', ' udh ' : ' udah ', ' wkwk ' : '  ', ' min ' : ' kak ', ' malem ' : ' malam ', ' malem2 ' : ' malam ', ' sma ' : ' sama ', ' dgn ' : ' dengan ', ' muter ' : ' putar ' }

def normalisasi(str_text, mapping=None):
  """Replace informal spellings in a review with their standard form.

  The previous implementation ran `str.replace` with space-padded keys, so a
  slang word at the very start or end of the text was never replaced, and of
  two consecutive slang words only the first was (the shared space had
  already been consumed). Matching is now done on whole whitespace-delimited
  tokens with lookarounds, which consume no characters.

  Args:
    str_text: the review text.
    mapping: optional dict of {slang: replacement}; surrounding spaces in
      keys and values are ignored. Defaults to the notebook-level `norm`.

  Returns:
    The text with every whole-token occurrence of a key replaced. All other
    characters, including spacing, are left as they were.
  """
  table = norm if mapping is None else mapping
  lookup = {key.strip(): value.strip() for key, value in table.items() if key.strip()}
  if not lookup:
    return str_text
  # Longest keys first so a multi-word key wins over a shorter one that
  # starts at the same position.
  alternatives = '|'.join(re.escape(token) for token in sorted(lookup, key=len, reverse=True))
  pattern = r'(?<!\S)(?:' + alternatives + r')(?!\S)'
  return re.sub(pattern, lambda match: lookup[match.group(0)], str_text)

# Normalise informal spellings in every review.
df['content'] = df['content'].apply(lambda x: normalisasi(x))

In [14]:
# Stopword
!pip install Sastrawi
import Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
# Extra stop words to add on top of the Sastrawi defaults (none at the moment).
more_stop_word = []

# The word list has its own name: it used to be called `stop_words` too and was
# silently overwritten by the function of the same name defined below.
# NOTE(review): confirm whether the default list contains negations such as
# "tidak"; removing those would hide information the sentiment labels depend on.
stop_word_list = StopWordRemoverFactory().get_stop_words()
stop_word_list.extend(more_stop_word)

new_array = ArrayDictionary(stop_word_list)
stop_words_remover_new = StopWordRemover(new_array)

def stop_words(str_text):
  """Return `str_text` after passing it through the Sastrawi stop word remover."""
  return stop_words_remover_new.remove(str_text)

# Remove stop words from every review.
df['content'] = df['content'].apply(stop_words)


Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [15]:
# Tokenize: split every review on whitespace into a list of words.
tokenized = df['content'].apply(lambda x: x.split())
tokenized

Unnamed: 0,content
0,"[akun, gopay, di, blok]"
1,"[lambat, sekali, sekarang, bosssku, apk, gojek..."
2,"[sih, kemarin, sy, buka, aplikasi, gojek, mala..."
3,"[baru, download, gojek, hape, baru, trus, dito..."
8,"[gimana, kak, pin, salah, terus, padahal, udah..."
...,...
224986,"[sekarang, promo, voucher, bisa, dipakai, bare..."
224988,"[bapak, gojeknya, baik, mengantar, sabar, menc..."
224989,"[makin, kesini, makin, mahal, voucher2nya, mak..."
224991,"[harus, perbaruan, mulu, hedeh, payah]"


In [None]:
# stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

def stemming(text_cleaning):
  """Stem every token of one review and join the stems with single spaces.

  The stemmer is created on the first call and reused afterwards; building a
  new StemmerFactory and stemmer for every row repeated the setup work once
  per review.

  Args:
    text_cleaning: iterable of word strings (one tokenized review).

  Returns:
    A single str with the stemmed words separated by spaces ('' when the
    input has no tokens).
  """
  stemmer = getattr(stemming, '_stemmer', None)
  if stemmer is None:
    stemmer = StemmerFactory().create_stemmer()
    stemming._stemmer = stemmer  # cached on the function for later calls
  return " ".join(stemmer.stem(word) for word in text_cleaning)

tokenized = tokenized.apply(stemming) # stem every review (strips affixes); only the text is processed here, the ratings were set aside earlier in `score`


In [None]:
tokenized # the score and the content are joined back together in a later cell

In [None]:
# Display the ratings Series that was set aside earlier.
score

In [None]:
# Join the stemmed text and the ratings back together on the row index.
# `score` was captured before the word-count filter, so it still holds rows that
# `tokenized` no longer has. The default outer join re-created those rows with
# NaN content; join='inner' keeps only rows present in both.
# The former `score.drop(columns=[...], inplace=True)` call is gone: `score` is a
# Series, for which `columns=` changes nothing, and its label was misspelled anyway.
df = pd.concat([tokenized, score], axis=1, join='inner')
df.head()

In [None]:
def label_sentimen(score):
  """Turn a star rating into a binary sentiment label.

  Args:
    score: the numeric rating.

  Returns:
    'negatif' when score <= 2, otherwise 'positif'.
  """
  return 'negatif' if score <= 2 else 'positif'

# Derive the sentiment label of every review from its rating.
df['sentimen'] = df['score'].apply(label_sentimen)

In [None]:

# Preview the frame with the new sentimen column.
df.head()

In [None]:
# Number of reviews per sentiment label.
sentiment_counts = df.sentimen.value_counts()
sentiment_counts

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Visualization: bar chart of how many reviews carry each sentiment label.

sns.set_palette("pastel")
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='sentimen')  # one bar per label
plt.title('Jumlah Distribusi Sentimen')
plt.xlabel("Sentimen")
plt.ylabel("Jumlah Data")
plt.show()

In [None]:
# Split the reviews by label for the two word clouds.
data_negative = df[df['sentimen'] == 'negatif']
data_positive = df[df['sentimen'] == 'positif']

In [None]:
# Show the dominant words of the positive reviews, e.g. gojek, suka, baik, bikin.
# All positive reviews are concatenated into one string; str() is applied to each
# entry, so a missing (NaN) entry would contribute the literal word "nan".
all_text_s1 = ' '.join(str(word) for word in data_positive["content"])
wordcloud = WordCloud(colormap = 'Greens', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_s1)
plt.figure(figsize=(6, 4))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image needs no axes
plt.title('Kata Positive')
plt.margins(x=0, y=0)
plt.show()

In [None]:
# Show the dominant words of the negative reviews, e.g. gojek, gak, kecewa, padahal.
# All negative reviews are concatenated into one string; str() is applied to each
# entry, so a missing (NaN) entry would contribute the literal word "nan".
all_text_s1 = ' '.join(str(word) for word in data_negative["content"])
wordcloud = WordCloud(colormap = 'Reds', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_s1)
plt.figure(figsize=(6, 4))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image needs no axes
plt.title('Kata Negative')
plt.margins(x=0, y=0)
plt.show()

In [None]:
# Model input (review text) and target (sentiment label).
x = df.content
y = df.sentimen

In [None]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

# The review text is bound to lowercase `x`; the uppercase `X` used here before
# is never defined in this notebook and raised a NameError.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Candidate text vectorizers, all with default settings.
# Only tvec is referenced by the pipelines built further down.
cvec=CountVectorizer()
tvec=TfidfVectorizer()
hvec=HashingVectorizer()

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# The three classifiers under comparison, bound to clf1, clf2 and clf3.
# NOTE(review): a kernel SVC can be slow on a corpus of this size; confirm the
# training time is acceptable or consider a linear-only implementation.
clf1 = SVC(kernel = "linear") # SVM
clf2 = MultinomialNB()  # naive Bayes
# Seeded so the forest, and therefore its reported scores, is the same on every run.
clf3 = RandomForestClassifier(random_state=42) # random forest

In [None]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline

# One TF-IDF + classifier pipeline per algorithm.
# Each pipeline gets its own unfitted copy of `tvec`. Passing the same object to
# all three made them share one fitted vocabulary, so fitting any pipeline
# silently changed the vectorizer the other two predict with.
# NOTE: the step name 'vevtorizer' is a typo for "vectorizer"; it is kept because
# step names are part of the pipeline's parameter keys.
model1 = Pipeline([('vevtorizer', clone(tvec)), ('classifier', clf1)])
model2 = Pipeline([('vevtorizer', clone(tvec)), ('classifier', clf2)])
model3 = Pipeline([('vevtorizer', clone(tvec)), ('classifier', clf3)])


In [None]:

x_train = x_train.fillna('')  # replace NaN in the training text with an empty string

# Train the SVM pipeline.
model1.fit(x_train, y_train)


In [None]:
# Replace NaN in the test text with an empty string.
x_test = x_test.fillna('')

# Predictions of the SVM pipeline on the held-out reviews.
hasil1 = model1.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the SVM pipeline (a text report, despite the variable name).
matrix = classification_report(y_test, hasil1)
print('Classification Report:\n', matrix)

In [None]:
# Train the multinomial naive Bayes pipeline.
model2.fit(x_train, y_train)

In [None]:
# Predictions of the naive Bayes pipeline on the held-out reviews.
hasil2=model2.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the naive Bayes pipeline.
matrix = classification_report(y_test, hasil2)
print('Classification Report:\n', matrix)

In [None]:
# Train the random forest pipeline.
model3.fit(x_train, y_train)

In [None]:

# Predictions of the random forest pipeline on the held-out reviews.
hasil3=model3.predict(x_test)

In [None]:
# Per-class precision / recall / F1 for the random forest pipeline.
matrix = classification_report(y_test, hasil3)
print('Classification Report:\n', matrix)

In [None]:
# Compare the three algorithms on the same held-out set.
# Every metric is called as (y_true, y_pred). accuracy_score used to receive the
# arguments the other way round, which gives the same number for accuracy but
# was inconsistent with the calls next to it.
# Precision, recall and F1 are averaged over the classes weighted by class support.
hasilSVC = accuracy_score(y_test, hasil1)
precision1 = precision_score(y_test, hasil1, average='weighted')
recall1 = recall_score(y_test, hasil1, average='weighted')
f1_1 = f1_score(y_test, hasil1, average='weighted')

hasilMultinomialNB = accuracy_score(y_test, hasil2)
precision2 = precision_score(y_test, hasil2, average='weighted')
recall2 = recall_score(y_test, hasil2, average='weighted')
f1_2 = f1_score(y_test, hasil2, average='weighted')

hasilRandomForestClassifier = accuracy_score(y_test, hasil3)
precision3 = precision_score(y_test, hasil3, average='weighted')
recall3 = recall_score(y_test, hasil3, average='weighted')
f1_3 = f1_score(y_test, hasil3, average='weighted')

In [None]:
# Summary table: one row per algorithm, one column per metric.
model = {'Model':['SVC',
                  'Multinomial Naive Bayes',
                  'Random Forest Classifier'
                  ],
         # Header spelling fixed (was 'AccracyScore'); it is shown to the reader as-is.
         'AccuracyScore':[hasilSVC, hasilMultinomialNB, hasilRandomForestClassifier],
         'Precision':[precision1, precision2, precision3],
         'Recall':[recall1, recall2, recall3],
         'F1-score':[f1_1, f1_2, f1_3]
         }
model_df = pd.DataFrame(model)
model_df

In [None]:
# def classify_text(input_text):
#   models = {
#       'DecisionTreeClassifier': model_decisiontree,
#       'MultinomialNB': model_multinomialNaiveBayes,
#       'RandomForestClassifier': model_randomForest
#   }

#   results = {}
#   for name, model in models:
#     prediction = model.predict([input_text])
#     results[name] = prediction[0]

#   return results

def classify_text(input_text):
  """Classify one text with each of the three fitted models.

  Args:
    input_text: the text to classify; it is passed to every model as a
      one-element list.

  Returns:
    A dict mapping 'svc', 'MultinomialNB' and 'RandomForestClassifier' (in that
    order) to the first prediction each model returns.
  """
  fitted = (
      ('svc', model_SVC),
      ('MultinomialNB', model_multinomialNaiveBayes),
      ('RandomForestClassifier', model_randomForest),
  )
  return {label: estimator.predict([input_text])[0] for label, estimator in fitted}

In [None]:
# Bind the names classify_text looks up to the pipelines trained and evaluated
# in the cells above. Calling .fit() again here repeated all three training runs
# and, for the random forest, could yield a different model from the one whose
# scores were reported.
model_SVC = model1
model_multinomialNaiveBayes = model2
model_randomForest = model3

In [None]:
input_text = input("Masukan kata yang mau diklasifikasi")

# Put the input through the same steps as the training text (clean, lowercase,
# normalise, remove stop words, stem) so the models see the kind of text they
# were trained on; the raw string used to be classified directly.
prepared_text = clean_gojek_data(input_text).lower()
prepared_text = normalisasi(prepared_text)
prepared_text = stop_words(prepared_text)
prepared_text = stemming(prepared_text.split())

results = classify_text(prepared_text)

print("Input kata :", input_text)
print("\nHasil Klasifikasi :")
# `model_name` instead of `model`, so the metrics dict named `model` is not overwritten.
for model_name, prediction in results.items():
  print(f"{model_name}: {prediction}")