# Recommendation System Item Based Collaborative Filtering

In [1]:
! pip install surprise
! pip install nlp-id

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 5.7 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633975 sha256=5d73c535f7eb4988086ab179c6e33d5da82595dc4156076ec83adbfb4304a68a
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlp-id
  D

Library Import

In [4]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from nlp_id.tokenizer import Tokenizer
from nlp_id.lemmatizer import Lemmatizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords

import nltk
import surprise

# Fetch the NLTK stop-word corpus; the Indonesian list from it is used when
# building the TF-IDF vectorizer further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Data Loading

Tourism Description dataset.

In [5]:
dataset_path = "tourism_with_id.csv"

# Read the place catalogue and discard the two auto-named "Unnamed" columns in
# one chained expression, so the frame is never mutated in place.
data_df = (
  pd.read_csv(dataset_path, sep = ',')
  .drop(columns = ["Unnamed: 11", "Unnamed: 12"])
)
data_df.head(5)

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134


In [6]:
# Summary statistics for the numeric columns of the place catalogue.
data_df.describe()

Unnamed: 0,Place_Id,Price,Rating,Time_Minutes,Lat,Long
count,437.0,437.0,437.0,205.0,437.0,437.0
mean,219.0,24652.173913,4.442792,82.609756,-7.095438,109.160142
std,126.295289,66446.374709,0.208587,52.872339,0.727241,1.962848
min,1.0,0.0,3.4,10.0,-8.197894,103.931398
25%,110.0,0.0,4.3,45.0,-7.74959,107.578369
50%,219.0,5000.0,4.5,60.0,-7.020524,110.237468
75%,328.0,20000.0,4.6,120.0,-6.829411,110.431869
max,437.0,900000.0,5.0,360.0,1.07888,112.821662


Rating Dataset

In [7]:
# Load the user ratings table (one row per user/place rating).
rating_dataset_path = "tourism_rating.csv"
rating_data_df = pd.read_csv(rating_dataset_path)

# Preview the first rows.
rating_data_df.head(5)

Unnamed: 0,User_Id,Place_Id,Place_Ratings
0,1,179,3
1,1,344,2
2,1,5,5
3,1,373,3
4,1,101,4


Attribute Variables

In [8]:
# Column names of `data_df` referenced by the cells below.
description_field = "Description"  # free-text description of a place
category_field = "Category"  # category label of a place
tokenized_description_field = "Tokenized_Description"  # lemmatized description, added during preprocessing

## Data Preprocessing

Lemmatize Text.

In [9]:
indo_lemmatizer = Lemmatizer()

# Lemmatize every place description. Only the description column is needed,
# so iterate over that Series directly instead of materialising each row with
# `iterrows()`.
tokenized_descriptions = [
  indo_lemmatizer.lemmatize(description)
  for description in data_df[description_field]
]

# Store the lemmatized text next to the original description.
data_df[tokenized_description_field] = tokenized_descriptions
data_df.head(5)

Unnamed: 0,Place_Id,Place_Name,Description,Category,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Tokenized_Description
0,1,Monumen Nasional,Monumen Nasional atau yang populer disingkat d...,Budaya,Jakarta,20000,4.6,15.0,"{'lat': -6.1753924, 'lng': 106.8271528}",-6.175392,106.827153,monumen nasional atau yang populer singkat den...
1,2,Kota Tua,"Kota tua di Jakarta, yang juga bernama Kota Tu...",Budaya,Jakarta,0,4.6,90.0,"{'lat': -6.137644799999999, 'lng': 106.8171245}",-6.137645,106.817125,kota tua di jakarta yang juga nama kota tua pu...
2,3,Dunia Fantasi,Dunia Fantasi atau disebut juga Dufan adalah t...,Taman Hiburan,Jakarta,270000,4.6,360.0,"{'lat': -6.125312399999999, 'lng': 106.8335377}",-6.125312,106.833538,dunia fantasi atau sebut juga dufan adalah tem...
3,4,Taman Mini Indonesia Indah (TMII),Taman Mini Indonesia Indah merupakan suatu kaw...,Taman Hiburan,Jakarta,10000,4.5,,"{'lat': -6.302445899999999, 'lng': 106.8951559}",-6.302446,106.895156,taman mini indonesia indah rupa suatu kawasan ...
4,5,Atlantis Water Adventure,Atlantis Water Adventure atau dikenal dengan A...,Taman Hiburan,Jakarta,94000,4.5,60.0,"{'lat': -6.12419, 'lng': 106.839134}",-6.12419,106.839134,atlantis water adventure atau kenal dengan atl...


## Data Modelling

Create sample data.

In [10]:
# Example query values for trying out the recommender.
sample_description = "Pantai Ancol"
sample_categories = "Taman Hiburan"

Build the machine learning model (KNN recommendation system).

In [11]:
def get_categories(sample_category):
  """Vectorize all place categories and one query category as token counts.

  A `CountVectorizer` is fitted on the `Category` column of `data_df` and
  then reused to transform `sample_category`, so both share one vocabulary.

  Args:
    sample_category: Query category (a single string) to vectorize.

  Returns:
    A tuple `(vector_components, sample_category_vector_component,
    index_to_word_mapping)`: the count matrix with one row per place, the
    one-row count matrix for the query, and the list of vocabulary terms
    ordered by feature index.
  """
  categories = data_df[category_field]

  count_vectorizer = CountVectorizer()
  vector_components = count_vectorizer.fit_transform(categories)
  sample_category_vector_component = count_vectorizer.transform([sample_category])

  # `get_feature_names()` was removed in scikit-learn 1.2. Prefer the
  # replacement when it exists and convert to a plain list so callers keep
  # receiving the same type as before.
  if hasattr(count_vectorizer, "get_feature_names_out"):
    index_to_word_mapping = count_vectorizer.get_feature_names_out().tolist()
  else:
    index_to_word_mapping = count_vectorizer.get_feature_names()

  return vector_components, sample_category_vector_component, index_to_word_mapping

def get_descriptions(sample_description):
  """Vectorize all place descriptions and one query text with TF-IDF.

  A `TfidfVectorizer` with NLTK's Indonesian stop words is fitted on the
  `Description` column of `data_df` and then reused to transform
  `sample_description`, so both share one feature space.

  Args:
    sample_description: Query text (a single string) to vectorize.

  Returns:
    A tuple `(vector_components, sample_description_vector_component,
    index_to_word_mapping)`: the TF-IDF matrix with one row per place, the
    one-row TF-IDF matrix for the query, and the list of vocabulary terms
    ordered by feature index.
  """
  # NOTE(review): this fits on the raw descriptions; the lemmatized
  # `Tokenized_Description` column built during preprocessing is not used
  # here. Confirm that this is intended.
  descriptions = data_df[description_field]

  indonesian_stopwords = stopwords.words("indonesian")
  tf_idf_vectorizer = TfidfVectorizer(stop_words = indonesian_stopwords)
  vector_components = tf_idf_vectorizer.fit_transform(descriptions)
  sample_description_vector_component = tf_idf_vectorizer.transform([sample_description])

  # `get_feature_names()` was removed in scikit-learn 1.2. Prefer the
  # replacement when it exists and convert to a plain list so callers keep
  # receiving the same type as before.
  if hasattr(tf_idf_vectorizer, "get_feature_names_out"):
    index_to_word_mapping = tf_idf_vectorizer.get_feature_names_out().tolist()
  else:
    index_to_word_mapping = tf_idf_vectorizer.get_feature_names()

  return vector_components, sample_description_vector_component, index_to_word_mapping

# Try both vectorizers with hard-coded sample queries and keep the resulting
# matrices and vocabularies as notebook-level variables.
# NOTE(review): "museum" is not among the category tokens printed below; if it
# is absent from the vocabulary the sample category vector is all zeros. Confirm.
category_vector_components, sample_category_vector_component, category_index_to_word_mapping = get_categories("Museum")
description_vector_components, sample_description_vector_component, description_index_to_word_mapping = get_descriptions("Replika")

# Peek at the first ten vocabulary entries of each vectorizer.
print(category_index_to_word_mapping[:10])
print(description_index_to_word_mapping[:10])

['alam', 'bahari', 'budaya', 'cagar', 'hiburan', 'ibadah', 'perbelanjaan', 'pusat', 'taman', 'tempat']
['00', '000', '01', '016', '02', '02223', '023', '039', '04', '05']


  'stop_words.' % sorted(inconsistent))


Use k-nearest neighbors to make content-based recommendations based on descriptions and categories.

In [24]:
def recommend_travelling_places_using_knn(sample_description, n_neighbors = 10):
  """Find the places whose descriptions are closest to a query text.

  The query and all place descriptions are vectorized by `get_descriptions`,
  then a `NearestNeighbors` model (default distance metric) is fitted on the
  place vectors and queried with the sample vector.

  Args:
    sample_description: Query text (a single string).
    n_neighbors: Maximum number of neighbors to return. Defaults to 10, the
      value that was previously hard-coded.

  Returns:
    The `(distances, indices)` tuple produced by `NearestNeighbors.kneighbors`:
    two arrays with one row for the query, where `indices` holds row positions
    in the fitted description matrix and `distances` the matching distances.
  """
  # The vocabulary returned by get_descriptions is not needed here.
  description_vector_components, sample_description_vector_component, _ = get_descriptions(sample_description)

  # Never ask for more neighbors than there are fitted rows; kneighbors would
  # raise a ValueError otherwise.
  available_places = description_vector_components.shape[0]
  nearest_neighbors = NearestNeighbors(n_neighbors = min(n_neighbors, available_places))
  nearest_neighbors.fit(description_vector_components)
  k_nearest_neighbors_scores = nearest_neighbors.kneighbors(sample_description_vector_component)

  return k_nearest_neighbors_scores

# def get_top_n_recommendations_based_on_similarity_scores():

# recommend_travelling_places_using_knn(sample_description)

  'stop_words.' % sorted(inconsistent))


(array([[1.07617559, 1.09721061, 1.11356706, 1.11740187, 1.13003354,
         1.136962  , 1.14169569, 1.14606522, 1.14845088, 1.16706879]]),
 array([[  8, 152, 167,   5, 190, 153, 187, 201, 199,  55]]))