In [1]:
import pandas as pd
import numpy as np

#Import data from the clean file 
df = pd.read_csv('group.csv')

#Print the head of the cleaned DataFrame
df.head()

Unnamed: 0,grp_id,grp_name,grp_description,grp_type,city,state
0,1,Ahmadnagar hill station,A,hill station,Ahmadnagar,Maharashtra
1,2,Ahmadnagar temple,B,temple,Ahmadnagar,Maharashtra
2,3,Birbhum natural beuty,C,"hill station , waterfall",Birbhum,West Bengal
3,4,Latur island,D,"island , beach",Latur,Maharashtra
4,5,beautiful beaches,E,"island , beach",Ahmadabad,Gujarat


In [2]:
#Import the original file
orig_df = pd.read_csv('group.csv', low_memory=False)

#df["combcol"]= df["grp_name"].astype(str)+ df["grp_type"].astype(str)+df["state"].astype(str)
df['combcol'] = df[['grp_name', 'grp_type','state']].agg(' '.join, axis=1)
df.head()

Unnamed: 0,grp_id,grp_name,grp_description,grp_type,city,state,combcol
0,1,Ahmadnagar hill station,A,hill station,Ahmadnagar,Maharashtra,Ahmadnagar hill station hill station Maharashtra
1,2,Ahmadnagar temple,B,temple,Ahmadnagar,Maharashtra,Ahmadnagar temple temple Maharashtra
2,3,Birbhum natural beuty,C,"hill station , waterfall",Birbhum,West Bengal,"Birbhum natural beuty hill station , waterfall..."
3,4,Latur island,D,"island , beach",Latur,Maharashtra,"Latur island island , beach Maharashtra"
4,5,beautiful beaches,E,"island , beach",Ahmadabad,Gujarat,"beautiful beaches island , beach Gujarat"


In [3]:
#Import TfIdfVectorizer from the scikit-learn library
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stopwords
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df['combcol'] = df['combcol'].fillna('')

#Construct the required TF-IDF matrix by applying the fit_transform method on the overview feature
tfidf_matrix = tfidf.fit_transform(df['combcol'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(20, 43)

In [4]:
# Import linear_kernel to compute the dot product
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [5]:
#Construct a reverse mapping of indices and movie titles, and drop duplicate titles, if any
indices = pd.Series(df.index, index=df['grp_name']).drop_duplicates()

In [6]:
# Function that takes in movie title as input and gives recommendations 
def content_recommender(title, cosine_sim=cosine_sim, df=df, indices=indices):
    # Obtain the index of the movie that matches the title
    #idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    # And convert it into a list of tuples as described above
    sim_scores = list(enumerate(cosine_sim[title]))

    # Sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies. Ignore the first movie.
    sim_scores = sim_scores[1:11]

     # Get the movie indices
    group_indices = [i[0] for i in sim_scores]
    similarity_indices = [i[1] for i in sim_scores]
    #print(type(df['grp_name'].iloc[group_indices]))
    # Return the top 10 most similar movies
    newdf = pd.DataFrame() 
    newdf['grp_name'] = df['grp_name'].iloc[group_indices]
    newdf['score']= similarity_indices
    return newdf

In [7]:
#Get recommendations for The Lion King
content_recommender(10)

Unnamed: 0,grp_name,score
13,Panna beach,0.562516
5,Ahmadabad beach,0.490847
7,beautiful lakes,0.242856
4,beautiful beaches,0.239165
9,Muzaffarnagar hill station,0.231489
14,Kaushambi hill station,0.231489
3,Latur island,0.206223
0,Ahmadnagar hill station,0.0
1,Ahmadnagar temple,0.0
2,Birbhum natural beuty,0.0


In [8]:
l = [2,4]
data = pd.DataFrame()
for item in l:
    data = data.append(content_recommender(item))
data

data.sort_values(by=['score'],ascending=False)  
data2 = data.drop_duplicates(subset=['grp_name'], keep="first", inplace=False) 
data2

Unnamed: 0,grp_name,score
11,Nadia hill station,0.492068
0,Ahmadnagar hill station,0.285725
17,maya hill station,0.27803
16,cold hill station,0.274799
9,Muzaffarnagar hill station,0.264418
14,Kaushambi hill station,0.264418
8,Maldah temple,0.240796
7,beautiful lakes,0.155858
1,Ahmadnagar temple,0.0
3,Latur island,0.0


In [9]:
data2.sort_values(by=['score'],ascending=False)  

Unnamed: 0,grp_name,score
11,Nadia hill station,0.492068
5,Ahmadabad beach,0.456309
0,Ahmadnagar hill station,0.285725
17,maya hill station,0.27803
16,cold hill station,0.274799
9,Muzaffarnagar hill station,0.264418
14,Kaushambi hill station,0.264418
8,Maldah temple,0.240796
10,Gorakhpur beach,0.239165
13,Panna beach,0.225522
