In [1]:
# Import libraries

import pandas as pd
import nltk
import numpy as np
from nltk.tokenize import PunktSentenceTokenizer,RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
import warnings
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

In [2]:
# Suppress every DeprecationWarning for the rest of the session
warnings.simplefilter("ignore", DeprecationWarning)

In [3]:
# Load the processed country table; the file is decoded as Latin-1
df = pd.read_csv(filepath_or_buffer="./processed_country.csv", encoding="latin-1")

In [4]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0,name,continent,review,terrain,Zones
0,armenia,europe,"It might be a small nation, but Armenia is bi...",Armenian Highland with mountains; little fores...,Non listed third country
1,czechrepublic,europe,"A historic jewel at the heart of Europe, the ...",Bohemia in the west consists of rolling plains...,EU
2,chad,africa,From the natural wonders of the Sahara Desert ...,"broad, arid plains in center, desert in north,...",Non listed third country
3,cyprus,europe,"The legendary birthplace of Aphrodite, Cyprus ...",central plain with mountains to north and sout...,EU
4,ecuador,south-america,"Tiny by South American standards, Ecuador is ...","coastal plain (costa), inter-Andean central hi...",Non listed third country


In [5]:
# Expose the review column under the additional name "text"
df = df.assign(text=df["review"])

In [11]:
# Keep only the name, continent, terrain and Zones columns. The explicit
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not trigger pandas' SettingWithCopyWarning.
df = df[["name", "continent", "terrain", "Zones"]].copy()

In [12]:
# Display the reduced table
df

Unnamed: 0,name,continent,terrain,Zones
0,armenia,europe,Armenian Highland with mountains; little fores...,Non listed third country
1,czechrepublic,europe,Bohemia in the west consists of rolling plains...,EU
2,chad,africa,"broad, arid plains in center, desert in north,...",Non listed third country
3,cyprus,europe,central plain with mountains to north and sout...,EU
4,ecuador,south-america,"coastal plain (costa), inter-Andean central hi...",Non listed third country
...,...,...,...,...
194,yemen,middle-east,narrow coastal plain backed by flat-topped hil...,Non listed third country
195,zambia,africa,mostly high plateau with some hills and mountains,Non listed third country
196,zimbabwe,africa,mostly high plateau with higher central platea...,Non listed third country
197,northernireland,europe,mostly level to rolling interior plain surroun...,EU with tighter requirements


In [13]:
# Lower-case every terrain description
df["lower_text"] = df["terrain"].map(str.lower)

In [14]:
# Split each lower-cased description into runs of word characters
tokenizer = RegexpTokenizer(r'\w+')
df["tokenized_text"] = df["lower_text"].apply(tokenizer.tokenize)

In [15]:
# WordNet lemmatizer instance shared by the lemmatization helper
wnl = WordNetLemmatizer()

In [16]:
def lem(lst):
    """Return a new list with ``wnl.lemmatize`` applied to every item of ``lst``, in order."""
    return [wnl.lemmatize(token) for token in lst]

In [17]:
# Lemmatize every token list
df["lemmatized_text"] = df["tokenized_text"].apply(lem)

In [18]:
# First row's lower-cased description
df["lower_text"].iloc[0]

'armenian highland with mountains; little forest land; fast flowing rivers; good soil in aras river valley'

In [19]:
# First row's token list
df["tokenized_text"].iloc[0]

['armenian',
 'highland',
 'with',
 'mountains',
 'little',
 'forest',
 'land',
 'fast',
 'flowing',
 'rivers',
 'good',
 'soil',
 'in',
 'aras',
 'river',
 'valley']

In [20]:
# First row's lemmatized token list
df["lemmatized_text"].iloc[0]

['armenian',
 'highland',
 'with',
 'mountain',
 'little',
 'forest',
 'land',
 'fast',
 'flowing',
 'river',
 'good',
 'soil',
 'in',
 'ara',
 'river',
 'valley']

In [25]:
# TfidfVectorizer and the `text` module are already imported in the first cell.

# Extra words to exclude on top of scikit-learn's built-in English stop-word list
my_additional_stop_words = [
    'acute', 'good', 'great', 'really', 'just', 'nice',
    'like', 'day', 'beautiful', 'visit', 'time', 'don',
    'did', 'place', 'didn', 'tour', 'sydney', 'pm', 'the',
    'lot', '00', 'inside', 'doesn', 'going', 'mostly', 'origin',
    'right', '15',
]

# ENGLISH_STOP_WORDS.union(...) yields a frozenset, but the vectorizer's
# stop_words parameter is documented as 'english', a list or None (and recent
# scikit-learn releases reject anything else), so convert it to a list.
# Sorting keeps the list order deterministic between runs.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(my_additional_stop_words))

tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    strip_accents='unicode',
    norm='l2',
    lowercase=True,
)

In [30]:
# Re-join each lemma list into one space-separated string, then fit TF-IDF on them
X = [" ".join(tokens) for tokens in df["lemmatized_text"]]
tfidf_matrix = tfidf_vectorizer.fit_transform(X)

In [31]:
# (number of documents, number of vocabulary terms)
tfidf_matrix.shape

(199, 409)

In [34]:
# Pairwise cosine similarity between all rows (Y defaults to X when omitted)
similarity_matrix = cosine_similarity(tfidf_matrix)

In [35]:
# Map each country name to its row label in df. Duplicates are removed by
# *name* (first occurrence wins) so that indices[name] always yields a single
# scalar; Series.drop_duplicates() would compare the row-label values instead
# and leave repeated names in place.
indices = pd.Series(df.index, index=df['name'])
indices = indices[~indices.index.duplicated(keep='first')]

In [36]:
# Display the name -> row label lookup
indices

name
armenia              0
czechrepublic        1
chad                 2
cyprus               3
ecuador              4
                  ... 
yemen              194
zambia             195
zimbabwe           196
northernireland    197
guam               198
Length: 199, dtype: int64

In [48]:
title='zambia'

# Row of the similarity matrix that belongs to the query country
index = indices[title]

# Pair every row position with its cosine similarity to the query country
similarity_scores = list(enumerate(similarity_matrix[index]))

# Most similar first
sorted_similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

# Ten most similar *other* countries. The query row is filtered out by its
# position instead of being assumed to sit at rank 0: with tied scores (e.g.
# two identical descriptions) or an all-zero TF-IDF vector the query is not
# guaranteed to sort first, and a [1:11] slice would then return the query
# itself and drop a genuine match.
top_10_movies_scores = [
    pair for pair in sorted_similarity_scores if pair[0] != index
][:10]

In [54]:
# Row positions of the ten selected countries
top_10_movie_indices = [position for position, _score in top_10_movies_scores]

# Names of the ten recommended countries
df['name'].iloc[top_10_movie_indices]

114               mexico
196             zimbabwe
44                 china
105           madagascar
98               lesotho
33               algeria
120           mozambique
179    trinidadandtobago
181               turkey
95                  laos
Name: name, dtype: object

In [60]:
from sklearn.metrics.pairwise import linear_kernel

# Similarity of the last row of the TF-IDF matrix to every row. The vectorizer
# was created with norm='l2', so the plain dot product computed by
# linear_kernel equals the cosine similarity.
# The original cell read from `tf_idf`, a name no cell in this notebook
# defines (NameError on a fresh kernel), and hard-coded the row count 199.
# NOTE(review): the recorded outputs below may have been produced from a
# different matrix than tfidf_matrix — confirm the values after re-running.
cosine_similarities = linear_kernel(tfidf_matrix[-1], tfidf_matrix).flatten()

In [61]:
# Attach the similarity scores as the column "similarity2"
df = df.assign(similarity2=cosine_similarities)

In [62]:
# Rows ordered from highest to lowest similarity2
df.sort_values(by=["similarity2"], ascending=[False])

Unnamed: 0,name,continent,terrain,Zones,lower_text,tokenized_text,lemmatized_text,similarity,similarity2
198,guam,oceania,"volcanic origin, surrounded by coral reefs; re...",Listed third country,"volcanic origin, surrounded by coral reefs; re...","[volcanic, origin, surrounded, by, coral, reef...","[volcanic, origin, surrounded, by, coral, reef...",1.000000,1.000000
71,guatemala,north-america,mostly mountains with narrow coastal plains an...,Non listed third country,mostly mountains with narrow coastal plains an...,"[mostly, mountains, with, narrow, coastal, pla...","[mostly, mountain, with, narrow, coastal, plai...",0.219783,0.356362
191,vanuatu,oceania,mostly mountainous islands of volcanic origin;...,Listed third country,mostly mountainous islands of volcanic origin;...,"[mostly, mountainous, islands, of, volcanic, o...","[mostly, mountainous, island, of, volcanic, or...",0.205215,0.245828
41,fiji,oceania,mostly mountains of volcanic origin,Listed third country,mostly mountains of volcanic origin,"[mostly, mountains, of, volcanic, origin]","[mostly, mountain, of, volcanic, origin]",0.189742,0.198160
56,dominica,caribbean,rugged mountains of volcanic origin,Non listed third country,rugged mountains of volcanic origin,"[rugged, mountains, of, volcanic, origin]","[rugged, mountain, of, volcanic, origin]",0.170735,0.160597
...,...,...,...,...,...,...,...,...,...
94,kyrgyzstan,asia,peaks of Tien Shan and associated valleys and ...,Non listed third country,peaks of tien shan and associated valleys and ...,"[peaks, of, tien, shan, and, associated, valle...","[peak, of, tien, shan, and, associated, valley...",0.000000,0.000000
87,japan,asia,mostly rugged and mountainous,Listed third country,mostly rugged and mountainous,"[mostly, rugged, and, mountainous]","[mostly, rugged, and, mountainous]",0.000000,0.000000
75,haiti,caribbean,mostly rough and mountainous,Non listed third country,mostly rough and mountainous,"[mostly, rough, and, mountainous]","[mostly, rough, and, mountainous]",0.000000,0.000000
39,bhutan,asia,mostly mountainous with some fertile valleys a...,Non listed third country,mostly mountainous with some fertile valleys a...,"[mostly, mountainous, with, some, fertile, val...","[mostly, mountainous, with, some, fertile, val...",0.000000,0.000000


In [71]:
# Order by Zones, then by similarity2, both descending
a = df.sort_values(["Zones", "similarity2"], ascending=[False, False])

In [73]:
# First 20 rows of the sorted table whose zone is "Non listed third country"
a.loc[a["Zones"] == "Non listed third country"].head(20)

Unnamed: 0,name,continent,terrain,Zones,lower_text,tokenized_text,lemmatized_text,similarity,similarity2
71,guatemala,north-america,mostly mountains with narrow coastal plains an...,Non listed third country,mostly mountains with narrow coastal plains an...,"[mostly, mountains, with, narrow, coastal, pla...","[mostly, mountain, with, narrow, coastal, plai...",0.219783,0.356362
105,madagascar,africa,"narrow coastal plain, high plateau and mountai...",Non listed third country,"narrow coastal plain, high plateau and mountai...","[narrow, coastal, plain, high, plateau, and, m...","[narrow, coastal, plain, high, plateau, and, m...",0.153128,0.334048
91,kiribati,oceania,mostly low-lying coral atolls surrounded by ex...,Non listed third country,mostly low-lying coral atolls surrounded by ex...,"[mostly, low, lying, coral, atolls, surrounded...","[mostly, low, lying, coral, atoll, surrounded,...",0.104268,0.276476
30,brazil,south-america,mostly flat to rolling lowlands in north; some...,Non listed third country,mostly flat to rolling lowlands in north; some...,"[mostly, flat, to, rolling, lowlands, in, nort...","[mostly, flat, to, rolling, lowland, in, north...",0.10343,0.25598
11,anguilla,caribbean,flat and low-lying island of coral and limestone,Non listed third country,flat and low-lying island of coral and limestone,"[flat, and, low, lying, island, of, coral, and...","[flat, and, low, lying, island, of, coral, and...",0.087336,0.25446
16,belize,north-america,"flat, swampy coastal plain; low mountains in s...",Non listed third country,"flat, swampy coastal plain; low mountains in s...","[flat, swampy, coastal, plain, low, mountains,...","[flat, swampy, coastal, plain, low, mountain, ...",0.139608,0.238954
60,comoros,africa,"volcanic islands, interiors vary from steep mo...",Non listed third country,"volcanic islands, interiors vary from steep mo...","[volcanic, islands, interiors, vary, from, ste...","[volcanic, island, interior, vary, from, steep...",0.115062,0.236615
76,honduras,north-america,"mostly mountains in interior, narrow coastal p...",Non listed third country,"mostly mountains in interior, narrow coastal p...","[mostly, mountains, in, interior, narrow, coas...","[mostly, mountain, in, interior, narrow, coast...",0.126543,0.231223
111,marshallislands,oceania,low coral limestone and sand islands,Non listed third country,low coral limestone and sand islands,"[low, coral, limestone, and, sand, islands]","[low, coral, limestone, and, sand, island]",0.083605,0.231055
156,seychelles,africa,"Mahe Group is granitic, narrow coastal strip, ...",Non listed third country,"mahe group is granitic, narrow coastal strip, ...","[mahe, group, is, granitic, narrow, coastal, s...","[mahe, group, is, granitic, narrow, coastal, s...",0.090945,0.230914


In [None]:

def generate_recomendations(df,film_name,input_films_rated,top_results=5,cat=categories):
    """Print and display three recommendation lists for ``film_name``.

    Parameters
    ----------
    df : passed unchanged to ``item_based_recom`` and ``user_based_recom``.
    film_name : title the item-based and genre-based lists are built around.
    input_films_rated : passed unchanged to ``user_based_recom``.
    top_results : number of rows shown in the first two lists (default 5).
    cat : forwarded to ``item_and_genre_based_recom``; its default is the
        value of the global ``categories`` at definition time.

    Returns
    -------
    None. All results are emitted through ``print`` and ``display``.

    NOTE(review): this function relies on ``item_based_recom``,
    ``item_and_genre_based_recom``, ``user_based_recom``, ``movies``,
    ``user_id`` and ``categories``, none of which are defined in the visible
    part of this notebook — confirm they exist before running the cell.
    """
    print("Movie Recommender by B.Kurka:")
    print("User name: " + "Favorite Movie:", film_name+'\n\n')
    print("Films you might enjooy based that you watched", film_name)
    # Item-rating-based cosine similarity; the first row is skipped when displaying
    cos_sim = item_based_recom(df,film_name)
    display(cos_sim[1:top_results+1])

    print("***********************************************************************************************\n")
    print("Films you might enjooy with similar genre then", film_name)
    # Reuse cos_sim rather than recomputing it, and honour the `cat` argument
    # (the original body read the global `categories` and ignored `cat`).
    # Assumes item_based_recom returns the same result for the same arguments.
    genre_ranked = item_and_genre_based_recom(cos_sim, movies, cat)
    display(
        genre_ranked
        .sort_values('cosine_sim', ascending=False)[top_results:]
        .sort_values('genre_similarity', ascending=False)[:top_results]
    )

    print("***********************************************************************************************\n")
    print("Flims reccomended for you:")
    # Single call: the original also computed an unused 25-row slice first
    display(user_based_recom(df,input_films_rated,user_id)[0:5])
    
# NOTE(review): film_name and films_rated are not defined in any visible cell
# of this notebook — confirm where they come from before running this call.
generate_recomendations(df,film_name,films_rated,5)