In [1]:
# Import libraries

import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
import re

In [2]:
df = pd.read_csv("./processed_country.csv", encoding='latin-1')

In [3]:
df.head()

Unnamed: 0,name,continent,review,terrain,Zones
0,armenia,europe,"It might be a small nation, but Armenia is bi...",Armenian Highland with mountains; little fores...,Non listed third country
1,czechrepublic,europe,"A historic jewel at the heart of Europe, the ...",Bohemia in the west consists of rolling plains...,EU
2,chad,africa,From the natural wonders of the Sahara Desert ...,"broad, arid plains in center, desert in north,...",Non listed third country
3,cyprus,europe,"The legendary birthplace of Aphrodite, Cyprus ...",central plain with mountains to north and sout...,EU
4,ecuador,south-america,"Tiny by South American standards, Ecuador is ...","coastal plain (costa), inter-Andean central hi...",Non listed third country


In [4]:
df["name"] = df["name"].apply(lambda x: x.lower())
df["text"] = df["review"] + " " + df["terrain"] + " " + df["continent"]
df["text"].apply(lambda x: " ".join(re.findall("[a-zA-Z]*", x)).lower())

0       it  might  be  a  small  nation   but  armeni...
1       a  historic  jewel  at  the  heart  of  europ...
2      from  the  natural  wonders  of  the  sahara  ...
3      the  legendary  birthplace  of  aphrodite   cy...
4       tiny  by  south  american  standards   ecuado...
                             ...                        
192     if  it  were  an  authentic  taste  of  arabi...
193    vast  lakes  and  wetlands   long  and  life  ...
194    after  a  difficult  decade  or  so   stabilit...
195    northern  ireland  distils  the  best  of  bri...
196    guam  is  a  vibrant   tropical  paradise   co...
Name: text, Length: 197, dtype: object

In [5]:
# tokenize
tokenizer = RegexpTokenizer(r'\w+')
df["tokenized_text"] = df["text"].apply(lambda row: tokenizer.tokenize(row))

In [6]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Harry\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
# lemmatize 
wnl = nltk.WordNetLemmatizer()
def lem(lst):
    list1=list()
    for i in lst : 
        list1.append(wnl.lemmatize(i))
    return list1

df["lemmatized_text"]=df["tokenized_text"].apply(lambda x : lem(x))

In [8]:
# preparation for stopwords to be used in TfidfVectorizer

my_additional_stop_words = ['acute', 'good', 'great', 'really', 'just', 'nice', 
                            'like', 'day', 'beautiful', 'visit', 'time', 'don',
                            'did', 'place', 'didn', 'did', 'tour', 'sydney','pm', 'the',
                            'lot', '00', 'inside', 'doesn','going','mostly', 'origin',
                            'right', '15']
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, strip_accents='unicode', norm='l2',lowercase=True)

In [9]:
X=[" ".join(text) for text in df["lemmatized_text"].values]
tfidf_matrix=tfidf_vectorizer.fit_transform(X)
tfidf_matrix.shape

(197, 8326)

In [10]:
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [11]:
# Create a pandas series with countries as indices and indices as series values 
indices = pd.Series(df.index, index=df['name']).drop_duplicates()
indices

name
armenia              0
czechrepublic        1
chad                 2
cyprus               3
ecuador              4
                  ... 
yemen              192
zambia             193
zimbabwe           194
northernireland    195
guam               196
Length: 197, dtype: int64

In [12]:
title='india'

# Get the index corresponding to country name
index = indices[title]

# Get the cosine similarity scores 
similarity_scores = list(enumerate(similarity_matrix[index]))

# Sort the similarity scores in descending order
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

# Top-10 most similar country scores
top_10_country_scores = sorted_similarity_scores[1:11]

In [13]:
# Get movie indices
top_10_country_indices=[]
for i in top_10_country_scores:
    top_10_country_indices.append(i[0])
    
# Top 10 recommende movie
df['name'].iloc[top_10_country_indices]

167       srilanka
124          nepal
163    southafrica
121        myanmar
133       pakistan
39          bhutan
164     southkorea
68           ghana
44           china
16          belize
Name: name, dtype: object