## Import Libraries

In [31]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
import re
import random
# from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

2023-06-02 18:46:22.331815: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-02 18:46:22.546086: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
course = pd.read_csv('course.csv')
prob = pd.read_csv('problemstate.csv')

In [4]:
course.head()

Unnamed: 0,courseID,judul,deskripsi,dampak,modul
0,1,Fundamental Course (1),Fundamental Attitude toward sustainability,mengubah paradigma mengenai sustainability; Me...,Perubahan menuju sustainable mindset; 5 Prinsi...
1,2,Fundamental Course (2),Innovation method as problem solving,Memahami konsep Growth Mindset; Memahami konse...,Growth mindset; Design thinking; Lean Startup;...
2,3,Fundamental Course (3),The way to build sustainable business,Mengenal Sustainable Startup; Memahami SDGs po...,Mengenal Sustainable Startup; SDGs 12 Responsi...
3,4,Indonesia Sustainability Coral Reef University...,Terumbu karang menutupi kurang dari 1% wilayah...,pelindungan habibat; mengurangi polusi; mengur...,Sustainability Leadership; Coral Reef Ecology;...
4,5,Ecotourism,Seiring dengan pelonggaran perjalanan dan stab...,Pelestarian ekosistem terhadap biodiversitas; ...,Ecoturism Introduction; Sustainable Tourism Pr...


In [6]:
course.drop(columns='courseID', axis=1, inplace=True)

In [5]:
prob.head()

Unnamed: 0,desa,problem,lokasi
0,desa pinogu,Budidaya dan pengolahan kopi,gorontalo
1,desa pinogu,Pengolahan produk dari sapi,gorontalo
2,desa pinogu,Akses jalan yang buruk,gorontalo
3,desa pinogu,Listrik yang kurang menyeluruh,gorontalo
4,desa pinogu,Jaringan internet yang kurang lancar,gorontalo


## Preprocessing

In [7]:
# periksa sampel teks
def sample_text(df, idx):
    for col in df.columns:
        print(f'{col}:')
        print(df[col].iloc[idx])
        print('\n')
        
sample_text(course, 3)

judul:
Indonesia Sustainability Coral Reef University Network (ISCORE)


deskripsi:
Terumbu karang menutupi kurang dari 1% wilayah lautan, tetapi mereka secara langsung mendukung jutaan orang dengan
menyediakan makanan, pekerjaan, dan sumber daya lainnya. Bahkan lebih banyak orang yang mendapat manfaat dari
terumbu karang secara tidak langsung; ekosistem ini membantu memberi makan 1 miliar orang di Asia saja.
Populasi manusia hampir 7 miliar orang, dan kemungkinan akan tumbuh menjadi 9 miliar pada tahun 2040. Kita
membutuhkan terumbu karang (dan ekosistem lainnya) untuk memasok lebih banyak sumber daya untuk mendukung
jumlah kita yang terus bertambah, tetapi mereka semakin terancam dengan kehancuran .


dampak:
pelindungan habibat; mengurangi polusi; mengurangi dampak erosi; promosi pariwisata; meningkatkan komunitas lokal; pemeliharaan agen perubahan berkelanjutan dan perlindungan terumbu karang


modul:
Sustainability Leadership; Coral Reef Ecology; Human threats & Challenges; Integr

In [9]:
# agregat tiap masalah pada 1 desa
problem_agg = (prob.groupby(['desa','lokasi'])
          .agg({'problem': lambda x: ", ".join(x)})
          .reset_index())

problem_agg = problem_agg.assign(loc_desa = problem_agg['desa'] + ', ' + problem_agg['lokasi'])
problem_agg = problem_agg.drop(columns=['desa','lokasi'], axis=1)

In [10]:
sample_text(problem_agg, 0)

problem:
Fasilitas penunjang wisata yang masih minim, Wisata kuliner yang terbatas, Akses informasi wisata yang masih terbatas, Faslitas penunjang yang masih terbatas, Belum ada inovasi pengelolaan SDA, seperti makanan sebagai wisata kuliner., Sumber daya manusia yang rendah untuk mengembangkan potensia yang ada.


loc_desa:
desa aimoli, alor




In [11]:
# menggabungkan deskripsi dan dampak pada course menjadi 1 kolom
course2 = course.assign(summary = course['deskripsi'] + ', ' + course['dampak'] + ', ' + course['modul'])
course2 = course2.drop(columns=['deskripsi','dampak','modul'], axis=1)

sample_text(course2, 3)

judul:
Indonesia Sustainability Coral Reef University Network (ISCORE)


summary:
Terumbu karang menutupi kurang dari 1% wilayah lautan, tetapi mereka secara langsung mendukung jutaan orang dengan
menyediakan makanan, pekerjaan, dan sumber daya lainnya. Bahkan lebih banyak orang yang mendapat manfaat dari
terumbu karang secara tidak langsung; ekosistem ini membantu memberi makan 1 miliar orang di Asia saja.
Populasi manusia hampir 7 miliar orang, dan kemungkinan akan tumbuh menjadi 9 miliar pada tahun 2040. Kita
membutuhkan terumbu karang (dan ekosistem lainnya) untuk memasok lebih banyak sumber daya untuk mendukung
jumlah kita yang terus bertambah, tetapi mereka semakin terancam dengan kehancuran ., pelindungan habibat; mengurangi polusi; mengurangi dampak erosi; promosi pariwisata; meningkatkan komunitas lokal; pemeliharaan agen perubahan berkelanjutan dan perlindungan terumbu karang, Sustainability Leadership; Coral Reef Ecology; Human threats & Challenges; Integrating Sustainabilit

### Stemming and Stopword Removal

In [12]:
course_clean = course2.copy()
prob_clean = problem_agg.copy()

In [13]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# menghilangkan simbol
clean_spcl = re.compile('[/(){}\[\]\|@,;]')
clean_symbol = re.compile('[^0-9a-z #+_]')

# upload stopword indonesia
f = open("tala_stopword.txt", "r")
stopword_list = []
for line in f:
    stripped_line = line.strip()
    line_list = stripped_line.split()
    stopword_list.append(line_list[0])
f.close()

len(stopword_list)

705

In [14]:
def clean_text(text):
    text = stemmer.stem(text)
    text = text.lower() # lowercase teks
    text = clean_spcl.sub(' ', text)
    text = clean_symbol.sub('', text)
    text = ' '.join(word for word in text.split() if word not in stopword_list) # hapus stopword dari kolom deskripsi
    return text

In [15]:
course_clean['summary'] = course_clean['summary'].apply(clean_text)
# course_clean['modul'] = course_clean['modul'].apply(clean_text)
prob_clean['problem'] = problem_agg['problem'].apply(clean_text)

In [16]:
sample_text(course_clean, 3)

judul:
Indonesia Sustainability Coral Reef University Network (ISCORE)


summary:
terumbu karang tutup 1 wilayah laut langsung dukung juta orang sedia makan kerja sumber daya bahkan banyak orang manfaat terumbu karang langsung ekosistem bantu makan 1 miliar orang asia populasi manusia 7 miliar orang akan tumbuh 9 miliar 2040 butuh terumbu karang ekosistem pasok banyak sumber daya dukung ancam hancur lindung habibat polusi dampak erosi promosi pariwisata tingkat komunitas lokal pelihara agen ubah lindung terumbu karang sustainability leadership coral reef ecology human threats challenges integrating sustainability in coral reef improving reef management by sustainable financing for communities benefit




In [17]:
sample_text(prob_clean, 0)

problem:
fasilitas tunjang wisata minim wisata kuliner batas akses informasi wisata batas faslitas tunjang batas ada inovasi kelola sda makan bagai wisata kuliner sumber daya manusia rendah kembang potensia ada


loc_desa:
desa aimoli, alor




In [22]:
# set judul course sebagai index
course_clean.set_index('judul', inplace=True)
# set lokasi desa sebagai index
prob_clean.set_index('loc_desa', inplace=True)

In [23]:
def preprocess_text_data(df_col):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(df_col)
    return vectors.toarray()

In [27]:
coba = preprocess_text_data(course_clean['summary'])

In [28]:
coba

array([[0.        , 0.09363028, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.09644434, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.0485924 ,
        0.        ],
       [0.        , 0.05866575, 0.        , ..., 0.        , 0.05242787,
        0.        ]])

In [29]:
coba2 = preprocess_text_data(prob_clean['problem'])

In [30]:
coba2

array([[0.        , 0.15833227, 0.        , 0.        , 0.        ,
        0.        , 0.09359983, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.16613903, 0.        , 0.        , 0.        ,
        0.49841708, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.11126524, 0.        , 0.        , 0.        ,
        0.        , 0.13403992, 0.16613903, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.13403992, 0.        , 0.16613903, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.07916614, 0.11126524, 0.        , 0.        ,
        0.        , 0.33227805, 0.        , 0.  

## Modelling

In [24]:
def create_siamese_network(input_shape, input_shape2):
    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape2)

    embedding = Embedding(input_dim=10000, output_dim=128)  # Modify input_dim based on vocabulary size

    lstm = LSTM(64)

    encoded_a = lstm(embedding(input_a))
    encoded_b = lstm(embedding(input_b))

    # Calculate cosine similarity
    cosine_similarity = Lambda(lambda tensors: K.sum(tensors[0] * tensors[1], axis=1) /
                                       (K.sqrt(K.sum(K.square(tensors[0]), axis=1)) *
                                        K.sqrt(K.sum(K.square(tensors[1]), axis=1))),
                              output_shape=(1,))

    similarity = cosine_similarity([encoded_a, encoded_b])

    siamese_network = Model(inputs=[input_a, input_b], outputs=similarity)

    return siamese_network

def find_nearest_similarity(df1, df2):
    # Preprocess text data
    vectors1 = preprocess_text_data(df1['summary'])
    vectors2 = preprocess_text_data(df2['problem'])

    # Create Siamese network
    input_shape = vectors1.shape[1:]
    input_shape2 = vectors2.shape[1:]
    siamese_network = create_siamese_network(input_shape, input_shape2)

    # Calculate cosine similarity using Siamese network
    similarity_pairs = []

    for i, (index1, row1) in enumerate(df1.iterrows()):
        similarities = []

        for j, (index2, row2) in enumerate(df2.iterrows()):
            pair = (vectors1[i].reshape(1, -1), vectors2[j].reshape(1, -1))
            similarity = siamese_network.predict(pair)
            similarities.append(similarity)

        max_similarity_index = np.argmax(similarities)
        max_similarity = similarities[max_similarity_index]
        similarity_pairs.append((index1, df2.index[max_similarity_index], max_similarity))

    return similarity_pairs


In [26]:
similarity_pairs = find_nearest_similarity(course_clean, prob_clean)

for pair in similarity_pairs:
    index1, index2, similarity = pair
    print("Similarity pair: [{}], [{}], Similarity: {}".format(index1, index2, similarity))

2023-06-02 18:49:02.626589: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-02 18:49:02.627388: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-02 18:49:02.628778: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-06-02 18:49:02.869241: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-02 18:49:02.870718: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-02 18:49:02.871632: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Similarity pair: [Fundamental Course (1)], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Fundamental Course (2)], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Fundamental Course (3)], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Indonesia Sustainability Coral Reef University Network (ISCORE)], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Ecotourism], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Moringa Academy], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Indonesia Sustainable Social Forestry Education Program (IS-FREE)], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Waste Management], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Integrated Farming], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Solar Academy], [desa aimoli, alor], Similarity: [1.]
Similarity pair: [Program Startup], [desa aimoli, alor], Similarity: [1.]
