In [6]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [7]:
# Read the scraped journal records from CSV into a DataFrame, then show it
# as the cell's output.
data = pd.read_csv(filepath_or_buffer='journal_scrape.csv')
data

Unnamed: 0,pdf_article_link,article_name,abstract
0,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PENGARUH TIPOLOGI KELUARGA DAN LINGKUNGAN RAMA...,Keluarga dan lingkungan keluarga yang mendukun...
1,https://jurnal.ipb.ac.id/index.php/jikk/articl...,"HARAPAN ORANG TUA, EFIKASI DIRI AKADEMIK, DAN ...",Fenomena tingginya angka putus sekolah pada je...
2,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PERAN KELUARGA DAN TOLERANSI AMBIGUITAS PADA E...,Pengaruh keluarga serta toleransi ambiguitas b...
3,https://jurnal.ipb.ac.id/index.php/jikk/articl...,THE ROLE OF MORALLY RELEVANT THEORY OF MIND AN...,Prosocial lying refers to deceptive behavior p...
4,https://jurnal.ipb.ac.id/index.php/jikk/articl...,ANALISIS RIWAYAT PENGASUHAN PADA TINGKAT POSTT...,Pandemi Covid-19 memberikan ancaman serius bag...
...,...,...,...
313,https://jurnal.ipb.ac.id/index.php/jikk/articl...,ANALISIS BEBAN KERJA IBU DAN PENGASUHAN ANAK U...,The objective of study was to study mother wor...
314,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PERAN STIMULASI ORANGTUA TERHADAP PERKEMBANGAN...,Parents have an important role in childcare. N...
315,https://jurnal.ipb.ac.id/index.php/jikk/articl...,ANALISIS MANFAAT DAN KEPUASAN PESERTA WANITA P...,The objectives of the study were: to identify ...
316,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PERILAKU KONSUMSI SUSU PADA IBU HAMIL,Pregnant woman with her physiological state is...


# TF-IDF

In [8]:
# Fetch every NLTK resource this notebook relies on, not only the stopword
# corpus: nltk.word_tokenize and sent_tokenize (used by the helpers below) need
# the Punkt sentence-tokenizer models and raise LookupError when they are
# missing. Recent NLTK releases look the models up as 'punkt_tab', older ones
# as 'punkt', so both are requested to keep a fresh-environment run working.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
factory = StopWordRemoverFactory()

# Load Stopwords
# English list from NLTK plus the list returned by Sastrawi's factory,
# concatenated into one list that remove_stopwords_and_numbers checks against.
en_stop_words = stopwords.words('english')
id_stop_words = factory.get_stop_words()
stop_words = en_stop_words + id_stop_words

def remove_stopwords_and_numbers(text):
    """Strip digit runs and stopwords from ``text``.

    Digits are deleted first, the remainder is split with
    ``nltk.word_tokenize``, and every token whose lowercase form appears in
    the module-level ``stop_words`` list is dropped.

    Returns:
        The surviving tokens joined by single spaces, as one string.
    """
    digit_free = re.sub(r'\d+', '', text)
    kept_tokens = [
        token
        for token in nltk.word_tokenize(digit_free)
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

def paragraph_to_sentences(cleaned_text):
    """Split ``cleaned_text`` into a list of sentences via NLTK's ``sent_tokenize``."""
    return sent_tokenize(cleaned_text)

def get_tfidf(sentences):
    """Fit a new ``TfidfVectorizer`` on ``sentences`` and return its output.

    Each entry of ``sentences`` is treated as one document by the vectorizer.

    Returns:
        A ``(feature_names, tfidf_values)`` pair: the vocabulary from
        ``get_feature_names_out()`` and the fitted TF-IDF matrix converted to
        a dense array with ``toarray()``.
    """
    vectorizer = TfidfVectorizer()
    dense_scores = vectorizer.fit_transform(sentences).toarray()
    vocabulary = vectorizer.get_feature_names_out()
    return vocabulary, dense_scores

def agg_tfidf(feature_names,tfidf_values) :
    """Average each term's TF-IDF score over all rows of ``tfidf_values``.

    Args:
        feature_names: Sequence of terms; entry ``i`` labels column ``i`` of
            ``tfidf_values``.
        tfidf_values: 2-D array-like of scores with one column per term.

    Returns:
        A dict mapping each term to the mean of its column. An empty dict is
        returned when ``tfidf_values`` holds no scores at all (no rows or no
        columns) instead of failing on ``tfidf_values[0]``.
    """
    matrix = np.asarray(tfidf_values, dtype=float)
    if matrix.size == 0:
        return {}

    # Transpose into a contiguous array so every row is one term's column of
    # scores; each mean is then taken over a plain 1-D array, exactly as the
    # original per-column loop did, without the nested Python-level indexing.
    columns = np.ascontiguousarray(matrix.T)
    return {name: column.mean() for name, column in zip(feature_names, columns)}

def top_keywords_and_score(dicti, top_n=3) :
    """Return the highest-scoring keywords and their scores.

    Args:
        dicti: Mapping of keyword -> score.
        top_n: How many entries to keep; defaults to 3, the value that was
            previously hard-coded.

    Returns:
        A ``(keys, values)`` pair of lists, both ordered by descending score.
        Entries with equal scores keep their order from ``dicti``. Fewer than
        ``top_n`` items are returned when ``dicti`` is smaller.
    """
    # Sort the (keyword, score) pairs once so keys and values come from the
    # same ranking instead of two independent sorts.
    ranked = sorted(dicti.items(), key=lambda item: item[1], reverse=True)[:top_n]
    top_keys = [keyword for keyword, _ in ranked]
    top_values = [score for _, score in ranked]
    return top_keys, top_values

# def get_keywords_score(dicti) :
#     top_three_values = sorted(dicti.values(), reverse=True)[:3]
#     return top_three_values

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dirak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def run(text):
    """Extract the top keywords of ``text`` together with their scores.

    Chains the helpers defined above: stopword/number removal, sentence
    splitting, TF-IDF vectorisation, per-term averaging, and top-keyword
    selection.

    Returns:
        The ``(keywords, scores)`` pair produced by ``top_keywords_and_score``.
    """
    sentences = paragraph_to_sentences(remove_stopwords_and_numbers(text))
    names, scores = get_tfidf(sentences)
    return top_keywords_and_score(agg_tfidf(names, scores))

# def run_keywords_score(text) :
#     cleaned_text = remove_stopwords_and_numbers(text)
#     sentences = paragraph_to_sentences(cleaned_text)
#     feature_names,tfidf_values = get_tfidf(sentences)
#     dicti = agg_tfidf(feature_names,tfidf_values)
#     top_three_keys = get_keywords_score(dicti)
#     return top_three_keys

In [10]:
# Run the keyword pipeline once per abstract and reuse the result for both
# columns; previously run() was executed twice per row, doubling the work.
# str() is kept so non-string cells are still coerced to text before tokenizing.
keyword_results = data['abstract'].map(lambda abstract: run(str(abstract)))
data['keywords'] = keyword_results.map(lambda result: result[0])
data['keywords_score'] = keyword_results.map(lambda result: result[1])

In [11]:
# Show the frame again, now including the 'keywords' and 'keywords_score' columns.
data

Unnamed: 0,pdf_article_link,article_name,abstract,keywords,keywords_score
0,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PENGARUH TIPOLOGI KELUARGA DAN LINGKUNGAN RAMA...,Keluarga dan lingkungan keluarga yang mendukun...,"[keluarga, tipologi, meningkatkan]","[0.26354655131309174, 0.1340543982301991, 0.11..."
1,https://jurnal.ipb.ac.id/index.php/jikk/articl...,"HARAPAN ORANG TUA, EFIKASI DIRI AKADEMIK, DAN ...",Fenomena tingginya angka putus sekolah pada je...,"[akademik, siswa, harapan]","[0.18207556319360202, 0.1500129552644624, 0.10..."
2,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PERAN KELUARGA DAN TOLERANSI AMBIGUITAS PADA E...,Pengaruh keluarga serta toleransi ambiguitas b...,"[mahasiswa, penelitian, diri]","[0.12073149312443121, 0.11764117507755378, 0.1..."
3,https://jurnal.ipb.ac.id/index.php/jikk/articl...,THE ROLE OF MORALLY RELEVANT THEORY OF MIND AN...,Prosocial lying refers to deceptive behavior p...,"[lying, prosocial, motom]","[0.09782351503317968, 0.09782351503317968, 0.0..."
4,https://jurnal.ipb.ac.id/index.php/jikk/articl...,ANALISIS RIWAYAT PENGASUHAN PADA TINGKAT POSTT...,Pandemi Covid-19 memberikan ancaman serius bag...,"[responden, ptg, pengasuhan]","[0.1026234984901215, 0.09602219189404047, 0.09..."
...,...,...,...,...,...
313,https://jurnal.ipb.ac.id/index.php/jikk/articl...,ANALISIS BEBAN KERJA IBU DAN PENGASUHAN ANAK U...,The objective of study was to study mother wor...,"[mothers, spent, hours]","[0.10543128492081938, 0.10401512217468553, 0.0..."
314,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PERAN STIMULASI ORANGTUA TERHADAP PERKEMBANGAN...,Parents have an important role in childcare. N...,"[stimulation, children, parents]","[0.12122953309527922, 0.10685496767686722, 0.0..."
315,https://jurnal.ipb.ac.id/index.php/jikk/articl...,ANALISIS MANFAAT DAN KEPUASAN PESERTA WANITA P...,The objectives of the study were: to identify ...,"[center, fitness, joining]","[0.11710853251190759, 0.11710853251190759, 0.1..."
316,https://jurnal.ipb.ac.id/index.php/jikk/articl...,PERILAKU KONSUMSI SUSU PADA IBU HAMIL,Pregnant woman with her physiological state is...,"[milk, samples, correlation]","[0.1662498252861236, 0.11842394405443306, 0.07..."


In [12]:
# Persist the frame, including the keyword columns, to a new CSV file.
# NOTE(review): to_csv also writes the row index as an unnamed leading column
# by default; pass index=False if downstream readers should not receive it.
data.to_csv(path_or_buf='journal_scrape_with_keywords.csv')