In [1]:
# Import libraries

import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import PunktSentenceTokenizer,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
import warnings
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
import re

In [2]:
#to ignore deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [3]:
df = pd.read_csv("./processed_country.csv", encoding='latin-1')

In [4]:
df.head()

Unnamed: 0,name,continent,review,terrain,Zones
0,armenia,europe,"It might be a small nation, but Armenia is bi...",Armenian Highland with mountains; little fores...,Non listed third country
1,czechrepublic,europe,"A historic jewel at the heart of Europe, the ...",Bohemia in the west consists of rolling plains...,EU
2,chad,africa,From the natural wonders of the Sahara Desert ...,"broad, arid plains in center, desert in north,...",Non listed third country
3,cyprus,europe,"The legendary birthplace of Aphrodite, Cyprus ...",central plain with mountains to north and sout...,EU
4,ecuador,south-america,"Tiny by South American standards, Ecuador is ...","coastal plain (costa), inter-Andean central hi...",Non listed third country


In [5]:
df["terrain"].apply(lambda x: " ".join(re.findall("[a-zA-Z]*", x)).lower())

0      armenian  highland  with  mountains   little  ...
1      bohemia  in  the  west  consists  of  rolling ...
2      broad   arid  plains  in  center   desert  in ...
3      central  plain  with  mountains  to  north  an...
4      coastal  plain   costa    inter  andean  centr...
                             ...                        
192    narrow  coastal  plain  backed  by  flat  topp...
193    mostly  high  plateau  with  some  hills  and ...
194    mostly  high  plateau  with  higher  central  ...
195    mostly  level  to  rolling  interior  plain  s...
196    volcanic  origin   surrounded  by  coral  reef...
Name: terrain, Length: 197, dtype: object

In [6]:
# tokenize
tokenizer = RegexpTokenizer(r'\w+')
df["tokenized_text"] = df["terrain"].apply(lambda row: tokenizer.tokenize(row))

In [7]:
# lemmatize 
wnl = nltk.WordNetLemmatizer()

In [8]:
def lem(lst):
    list1=list()
    for i in lst : 
        list1.append(wnl.lemmatize(i))
    return list1

In [9]:
df["lemmatized_text"]=df["tokenized_text"].apply(lambda x : lem(x))

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

my_additional_stop_words = ['acute', 'good', 'great', 'really', 'just', 'nice', 
                            'like', 'day', 'beautiful', 'visit', 'time', 'don',
                            'did', 'place', 'didn', 'did', 'tour', 'sydney','pm', 'the',
                            'lot', '00', 'inside', 'doesn','going','mostly', 'origin',
                            'right', '15']
stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)


tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, strip_accents='unicode', norm='l2',lowercase=True)

In [11]:
X=[" ".join(text) for text in df["lemmatized_text"].values]
tfidf_matrix=tfidf_vectorizer.fit_transform(X)

In [12]:
tfidf_matrix.shape

(197, 413)

In [13]:
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [14]:
# Create a pandas series with movie titles as indices and indices as series values 
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

In [15]:
indices

name
armenia              0
czechrepublic        1
chad                 2
cyprus               3
ecuador              4
                  ... 
yemen              192
zambia             193
zimbabwe           194
northernireland    195
guam               196
Length: 197, dtype: int64

In [16]:
title='germany'

# Get the index corresponding to movie title
index = indices[title]

# Get the cosine similarity scores 
similarity_scores = list(enumerate(similarity_matrix[index]))


# Sort the similarity scores in descending order
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)


# Top-10 most similar movie scores
top_10_movies_scores = sorted_similarity_scores[1:11]

In [17]:
# Get movie indices
top_10_movie_indices=[]
for i in top_10_movies_scores:
    top_10_movie_indices.append(i[0])
    
# Top 10 recommende movie
df['name'].iloc[top_10_movie_indices]

120     mozambique
2             chad
42        bulgaria
26         estonia
79           India
159       slovakia
132           oman
170    switzerland
27          france
103     luxembourg
Name: name, dtype: object