In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import swifter
df = pd.read_csv("dataset-baru.csv")
text = df["Abstrak"]
text

0      Sistem  informasi  akademik  (SIAKAD) merupaka...
1      Berjalannya koneksi jaringan komputer dengan l...
2      Web server adalah sebuah perangkat lunak serve...
3      Penjadwalan  kuliah  di  Perguruan  Tinggi  me...
4      Seiring perkembangan teknologi yang ada diduni...
                             ...                        
800    Investasi saham selama ini memiliki resiko ker...
801    Information Retrieval (IR) merupakan pengambil...
802    Klasifikasi citra merupakan proses pengelompok...
803    Identifikasi atribut pejalan kaki merupakan sa...
804    Topik deteksi objek telah menarik perhatian ya...
Name: Abstrak, Length: 805, dtype: object

In [2]:
def case_folding(comment):
    comment = comment.lower()
    return comment
text = text.apply(case_folding)
text

0      sistem  informasi  akademik  (siakad) merupaka...
1      berjalannya koneksi jaringan komputer dengan l...
2      web server adalah sebuah perangkat lunak serve...
3      penjadwalan  kuliah  di  perguruan  tinggi  me...
4      seiring perkembangan teknologi yang ada diduni...
                             ...                        
800    investasi saham selama ini memiliki resiko ker...
801    information retrieval (ir) merupakan pengambil...
802    klasifikasi citra merupakan proses pengelompok...
803    identifikasi atribut pejalan kaki merupakan sa...
804    topik deteksi objek telah menarik perhatian ya...
Name: Abstrak, Length: 805, dtype: object

In [3]:
def linkNormalize(text):
    text = re.sub(r"\s—\s", "", text)
    text = re.sub(r"http\S+", "", text)
    return text
text = text.apply(linkNormalize)
text

0      sistem  informasi  akademik  (siakad) merupaka...
1      berjalannya koneksi jaringan komputer dengan l...
2      web server adalah sebuah perangkat lunak serve...
3      penjadwalan  kuliah  di  perguruan  tinggi  me...
4      seiring perkembangan teknologi yang ada diduni...
                             ...                        
800    investasi saham selama ini memiliki resiko ker...
801    information retrieval (ir) merupakan pengambil...
802    klasifikasi citra merupakan proses pengelompok...
803    identifikasi atribut pejalan kaki merupakan sa...
804    topik deteksi objek telah menarik perhatian ya...
Name: Abstrak, Length: 805, dtype: object

In [4]:
symbols = ",!\"#$%&()*+-.…/:;<=>?@[\]^_`{|}~\n0987654321"
def cleaning_text(comment):
    for i in symbols:
        comment = np.char.replace(comment, i, ' ')
    return comment
text = text.apply(cleaning_text)
text

0      sistem  informasi  akademik   siakad  merupaka...
1      berjalannya koneksi jaringan komputer dengan l...
2      web server adalah sebuah perangkat lunak serve...
3      penjadwalan  kuliah  di  perguruan  tinggi  me...
4      seiring perkembangan teknologi yang ada diduni...
                             ...                        
800    investasi saham selama ini memiliki resiko ker...
801    information retrieval  ir  merupakan pengambil...
802    klasifikasi citra merupakan proses pengelompok...
803    identifikasi atribut pejalan kaki merupakan sa...
804    topik deteksi objek telah menarik perhatian ya...
Name: Abstrak, Length: 805, dtype: object

In [5]:
nltk.download('stopwords')
list_stopwords = stopwords.words('indonesian')
def stopword_removal(comment):
    comment = comment.split()
    data = [word for word in comment if word not in list_stopwords]
    kalimat = ""
    for i in data:
      kalimat += i
      kalimat += " "
    return kalimat


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\THORIQ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stemmingNormalize(comment):
    comment = comment.split()
    data = []
    for term in comment:
        term = stemmer.stem(term)
        data.append(term)
    kalimat = ""
    for i in data:
      kalimat += i
      kalimat += " "
    return kalimat
text = text.swifter.apply(stemmingNormalize)
text

Pandas Apply:   0%|          | 0/805 [00:00<?, ?it/s]

0      sistem informasi akademik siakad rupa sistem i...
1      jalan koneksi jaring komputer dengan lancar da...
2      web server adalah buah perangkat lunak server ...
3      jadwal kuliah di guru tinggi rupa masalah yang...
4      iring kembang teknologi yang ada dunia muncul ...
                             ...                        
800    investasi saham lama ini milik resiko rugi yan...
801    information retrieval ir rupa ambil informasi ...
802    klasifikasi citra rupa proses kelompok piksel ...
803    identifikasi atribut pejal kaki rupa salah sat...
804    topik deteksi objek telah tarik perhati yang b...
Name: Abstrak, Length: 805, dtype: object

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_separate = tfidf_vectorizer.fit_transform(text)

df_tfidf = pd.DataFrame(
    tfidf_separate.toarray(), columns=tfidf_vectorizer.get_feature_names(), index=df.index
)
X = df_tfidf.values
df_tfidf

Unnamed: 0,aalysis,aam,abad,abadi,abai,abdi,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
from sklearn.decomposition import PCA

pca = PCA(n_components =10)

X = pca.fit_transform(X)
# X_test = pca.transform(X_test)

pca.explained_variance_ratio_

array([0.02069984, 0.01628373, 0.01330499, 0.01141524, 0.00918442,
       0.00853463, 0.00796232, 0.00748987, 0.00728992, 0.0068521 ])

In [9]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X)

KMeans(n_clusters=2, random_state=42)

In [10]:
center = kmeans.cluster_centers_
center

array([[ 0.31184525,  0.08175944,  0.00830296,  0.05511599, -0.00720064,
        -0.00666104,  0.01277789,  0.02219742, -0.00493274,  0.00885927],
       [-0.04473916, -0.01172969, -0.00119119, -0.00790727,  0.00103305,
         0.00095563, -0.00183319, -0.00318457,  0.00070768, -0.001271  ]])

In [53]:
y_pred = kmeans.labels_


data = kmeans.labels_
res = []
for i in data:
    if i == 0:
        res.append("RPL")
    else:
        res.append("KOMPUTASI")
print(res)

['KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMP

In [54]:
# df['Kategori'].replace("RPL",1, inplace=True)
# df['Kategori'].replace("KOMPUTASI",0, inplace=True)
y_true = df['Kategori'].values.tolist()
print(df['Kategori'].values.tolist())

['RPL', 'RPL', 'RPL', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'RPL', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'RPL', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'KOMPUTASI', 'K

In [42]:
# from sklearn.metrics import accuracy_score
# print ("Accuracy : ",
#     accuracy_score(y_true, y_pred)*100)

In [41]:
# df = pd.read_csv("dataset-baru.csv")
# df['Kategori'].replace("RPL",0, inplace=True)
# df['Kategori'].replace("KOMPUTASI",1, inplace=True)
# y_true = df['Kategori'].values.tolist()
# print(df['Kategori'].values.tolist())

In [44]:
# from sklearn.metrics import accuracy_score
# print ("Accuracy : ",
#     accuracy_score(y_true, y_pred)*100)

In [55]:
n = len(y_true)
for i in range(n):
    print(res[i],"\t : ",y_true[i])

KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUT

KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
RPL 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  RPL
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOMPUTASI 	 :  KOMPUTASI
KOM

In [56]:
from sklearn.metrics import accuracy_score
print ("Accuracy : ",
    accuracy_score(y_true, res)*100)

Accuracy :  53.7888198757764
