In [1]:
pip install sastrawi

/bin/bash: /home/anggapark/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import random
# from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [3]:
# Load the course catalogue; the path is relative to the notebook's working directory.
course = pd.read_csv('course.csv')
# problem = pd.read_csv('problemstate.csv')

In [4]:
# Preview the first rows to see which columns the CSV provides.
course.head()

Unnamed: 0,courseID,judul,deskripsi,dampak,modul
0,1,Fundamental Course (1),Fundamental Attitude toward sustainability,mengubah paradigma mengenai sustainability; Me...,Perubahan menuju sustainable mindset; 5 Prinsi...
1,2,Fundamental Course (2),Innovation method as problem solving,Memahami konsep Growth Mindset; Memahami konse...,Growth mindset; Design thinking; Lean Startup;...
2,3,Fundamental Course (3),The way to build sustainable business,Mengenal Sustainable Startup; Memahami SDGs po...,Mengenal Sustainable Startup; SDGs 12 Responsi...
3,4,Indonesia Sustainability Coral Reef University...,Terumbu karang menutupi kurang dari 1% wilayah...,pelindungan habibat; mengurangi polusi; mengur...,Sustainability Leadership; Coral Reef Ecology;...
4,5,Ecotourism,Seiring dengan pelonggaran perjalanan dan stab...,Pelestarian ekosistem terhadap biodiversitas; ...,Ecoturism Introduction; Sustainable Tourism Pr...


In [5]:
# Drop the identifier column; only the title and text columns are used below.
# `columns=` already selects the axis, so the redundant `axis=1` is removed, and
# errors='ignore' keeps this cell safe to re-run after the column is gone.
course = course.drop(columns='courseID', errors='ignore')

In [6]:
# check sample texts
def sample_text(df, idx):
    """
    Print the content of every column except the first one for a single row.

    input:
      df: dataframe to inspect
      idx: positional row selector passed to ``.iloc``
    output:
      None (each column name is printed, followed by its value and blank lines)
    """
    for position, name in enumerate(df.columns):
        if position == 0:
            # the first column is skipped, exactly like slicing columns[1:]
            continue
        print(f'{name}:')
        # value, then '\n' on its own line -> same text as two separate prints
        print(df[name].iloc[idx], '\n', sep='\n')
        
# Inspect the text columns of the row at position 3 before any cleaning.
sample_text(course, 3)

deskripsi:
Terumbu karang menutupi kurang dari 1% wilayah lautan, tetapi mereka secara langsung mendukung jutaan orang dengan
menyediakan makanan, pekerjaan, dan sumber daya lainnya. Bahkan lebih banyak orang yang mendapat manfaat dari
terumbu karang secara tidak langsung; ekosistem ini membantu memberi makan 1 miliar orang di Asia saja.
Populasi manusia hampir 7 miliar orang, dan kemungkinan akan tumbuh menjadi 9 miliar pada tahun 2040. Kita
membutuhkan terumbu karang (dan ekosistem lainnya) untuk memasok lebih banyak sumber daya untuk mendukung
jumlah kita yang terus bertambah, tetapi mereka semakin terancam dengan kehancuran .


dampak:
pelindungan habibat; mengurangi polusi; mengurangi dampak erosi; promosi pariwisata; meningkatkan komunitas lokal; pemeliharaan agen perubahan berkelanjutan dan perlindungan terumbu karang


modul:
Sustainability Leadership; Coral Reef Ecology; Human threats & Challenges; Integrating Sustainability in Coral Reef; Improving reef management by sustaina

In [7]:
# Check column dtypes and non-null counts before concatenating the text columns.
course.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   judul      11 non-null     object
 1   deskripsi  11 non-null     object
 2   dampak     11 non-null     object
 3   modul      11 non-null     object
dtypes: object(4)
memory usage: 480.0+ bytes


In [8]:
# Join the three text columns into a single 'summary' column (separated by ', ')
# and then remove the original text columns.
course2 = (
    course
    .assign(summary=lambda frame: frame['deskripsi'] + ', ' + frame['dampak'] + ', ' + frame['modul'])
    .drop(columns=['deskripsi', 'dampak', 'modul'])
)

## Stemming & Remove Stopwords

Stemming = mengubah kata berimbuhan menjadi kata dasar <br>
Stopwords = Stop list ini berisi daftar kata umum yang mempunyai fungsi tetapi tidak mempunyai arti

In [9]:
# Work on a copy so course2 keeps the unprocessed summary text.
course_clean = course2.copy()

In [10]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Patterns used by clean_text. Raw strings keep the backslash escapes
# (\[ \] \|) from being parsed as invalid Python string escapes.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')   # characters replaced by a space
clean_symbol = re.compile(r'[^0-9a-z #+_]')    # everything else that gets removed

# Stopwords: the first whitespace-separated token of every non-blank line.
# `with` guarantees the file is closed even if reading fails, and blank lines
# are skipped instead of raising IndexError.
with open("tala_stopword.txt", "r", encoding="utf-8") as stopword_file:
    stopword_list = [
        line.split()[0]
        for line in stopword_file
        if line.strip()
    ]

len(stopword_list)

705

In [11]:
def clean_text(text):
    """
    Stem and normalise one text value.

    Steps, in order: stem with the module-level ``stemmer``, lowercase,
    replace characters matched by ``clean_spcl`` with a space, delete
    characters matched by ``clean_symbol``, then drop every token that
    appears in ``stopword_list``.

    input:
      text: text value
    output:
      the remaining tokens joined by single spaces
    """
    stemmed = stemmer.stem(text)
    lowered = stemmed.lower()
    spaced = clean_spcl.sub(' ', lowered)
    stripped = clean_symbol.sub('', spaced)
    # remove stopwords from the text
    kept_tokens = [token for token in stripped.split() if token not in stopword_list]
    return ' '.join(kept_tokens)

In [12]:
# Clean every summary with clean_text; this overwrites the 'summary' column of course_clean.
course_clean['summary'] = course_clean['summary'].apply(clean_text)

In [13]:
# Re-inspect the row at position 3 to see the effect of the cleaning step.
sample_text(course_clean, 3)

summary:
terumbu karang tutup 1 wilayah laut langsung dukung juta orang sedia makan kerja sumber daya bahkan banyak orang manfaat terumbu karang langsung ekosistem bantu makan 1 miliar orang asia populasi manusia 7 miliar orang akan tumbuh 9 miliar 2040 butuh terumbu karang ekosistem pasok banyak sumber daya dukung ancam hancur lindung habibat polusi dampak erosi promosi pariwisata tingkat komunitas lokal pelihara agen ubah lindung terumbu karang sustainability leadership coral reef ecology human threats challenges integrating sustainability in coral reef improving reef management by sustainable financing for communities benefit




## TF-IDF dan Cosine Similarity

In [14]:
def conv_to_tf_idf(df, set_to_idx, col_transform):
    """
    Convert text values into a TF-IDF matrix.

    Side effect: ``df`` is modified in place so that ``set_to_idx`` becomes
    its index (later cells read the titles from ``df.index``). The call is
    safe to repeat: when ``set_to_idx`` is a column name that has already been
    moved into the index, the index is left as it is instead of raising
    KeyError.

    input:
      df: dataset (mutated in place, see above)
      set_to_idx: column to use as the index, e.g. 'judul'
      col_transform: iterable of documents (e.g. a text column) to vectorise
    output:
      tfidf_matrix: matrix with tf-idf values, one row per document
    raises:
      KeyError: if ``set_to_idx`` is neither a column nor the current index name
    """
    already_indexed = (
        isinstance(set_to_idx, str)
        and df.index.name == set_to_idx
        and set_to_idx not in df.columns
    )
    if not already_indexed:
        df.set_index(set_to_idx, inplace=True)

    # min_df=1 keeps every term, which is what the former integer min_df=0
    # meant; integer 0 is rejected by scikit-learn's parameter validation.
    vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1)
    tfidf_matrix = vectorizer.fit_transform(col_transform)
    return tfidf_matrix


def conv_to_cosim(tfidf_matrix):
    """
    Compare every row of the TF-IDF matrix with every other row.

    input:
      tfidf_matrix: matrix with tf-idf value
    output:
      the result of ``cosine_similarity`` applied to the matrix against itself
    """
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

In [15]:
# Vectorise the cleaned summaries, then compute the course-by-course cosine similarity.
# Side effect: conv_to_tf_idf calls set_index in place, so 'judul' becomes course_clean's index.
tfidf_mat = conv_to_tf_idf(course_clean, 'judul', course_clean['summary'])
cos_sim_course = conv_to_cosim(tfidf_mat)

In [16]:
# Display the similarity matrix computed from the TF-IDF rows.
cos_sim_course

array([[1.        , 0.1159454 , 0.04595966, 0.02483439, 0.00500842,
        0.01230051, 0.01183024, 0.053896  , 0.00620651, 0.0330908 ,
        0.03570838],
       [0.1159454 , 1.        , 0.06809778, 0.        , 0.        ,
        0.00651309, 0.        , 0.00399927, 0.        , 0.01433929,
        0.10656933],
       [0.04595966, 0.06809778, 1.        , 0.00540203, 0.03302793,
        0.01310899, 0.02915767, 0.00556084, 0.00806958, 0.04039211,
        0.312342  ],
       [0.02483439, 0.        , 0.00540203, 1.        , 0.03026998,
        0.02630298, 0.00864063, 0.00914667, 0.01229069, 0.00366474,
        0.01404477],
       [0.00500842, 0.        , 0.03302793, 0.03026998, 1.        ,
        0.01928672, 0.01859025, 0.01037204, 0.00901776, 0.01600512,
        0.03063227],
       [0.01230051, 0.00651309, 0.01310899, 0.02630298, 0.01928672,
        1.        , 0.05950171, 0.01562522, 0.04364514, 0.02404054,
        0.03012742],
       [0.01183024, 0.        , 0.02915767, 0.00864063, 0.

In [17]:
# Position -> title lookup; course_clean.index holds 'judul' once conv_to_tf_idf has set it as the index.
indices = pd.Series(course_clean.index)
indices

0                                Fundamental Course (1)
1                                Fundamental Course (2)
2                                Fundamental Course (3)
3     Indonesia Sustainability Coral Reef University...
4                                            Ecotourism
5                                       Moringa Academy
6     Indonesia Sustainable Social Forestry Educatio...
7                                      Waste Management
8                                    Integrated Farming
9                                         Solar Academy
10                                      Program Startup
Name: judul, dtype: object

In [18]:
def recommendations(name, cos_sim=None, top_n=10):
    """
    Generate recommendations based on content similarity.

    input:
        name: course title; must appear in the module-level ``indices`` series
        cos_sim: cosine similarity matrix whose rows/columns follow the order
            of ``indices``; defaults to the current ``cos_sim_course``
        top_n: maximum number of titles to return (default 10)
    output:
        course_rec: list of titles sorted from most to least similar,
            never containing ``name`` itself
    raises:
        IndexError: if ``name`` is not found in ``indices``
    """
    if cos_sim is None:
        # Looked up at call time so a recomputed cos_sim_course is picked up
        # without having to re-run this definition.
        cos_sim = cos_sim_course

    # position of the requested title
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError(f"course title not found in indices: {name!r}")
    idx = matches[0]

    # Remove the course itself by position rather than assuming it sorts
    # first: another course with similarity 1.0 could otherwise displace it.
    scores = pd.Series(cos_sim[idx]).drop(idx)
    # mergesort is stable, so tied scores keep their original order
    ranked = scores.sort_values(ascending=False, kind='mergesort')

    top_positions = ranked.iloc[:top_n].index
    course_rec = [indices.iloc[position] for position in top_positions]
    return course_rec

In [19]:
# Example query: other courses ranked by similarity to the chosen title.
course_title = 'Moringa Academy'
recommendations(course_title)

['Indonesia Sustainable Social Forestry Education Program (IS-FREE)',
 'Integrated Farming',
 'Program Startup',
 'Indonesia Sustainability Coral Reef University Network (ISCORE)',
 'Solar Academy',
 'Ecotourism',
 'Waste Management',
 'Fundamental Course (3)',
 'Fundamental Course (1)',
 'Fundamental Course (2)']