In [1]:
!python -m pip install --user --upgrade pip

!pip3 install pandas==0.23.4 matplotlib==3.0.3 scipy==1.2.1 scikit-learn==0.22 tensorflow==2.0 keras==1.2.2 --user --quiet

Requirement already up-to-date: pip in /opt/conda/lib/python3.7/site-packages (20.2.3)


In [2]:
!pip3 install kfp --upgrade --user --quiet

In [3]:
# Import Kubeflow SDK
import kfp
import kfp.dsl as dsl
import kfp.components as comp

In [4]:
# where the outputs are stored
out_dir = "https://storage.googleapis.com/micro-access-290111/Hotel_Pipeline"

In [5]:
def train():
    import pandas as pd
    import numpy as np
    from nltk.corpus import stopwords
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import linear_kernel
    from joblib import load,dump
    import sys, subprocess;
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])
    out_dir = "https://storage.googleapis.com/micro-access-290111/Hotel_Pipeline"
    
    df1 = pd.read_csv("https://storage.googleapis.com/micro-access-290111/Hotel_Review_Dataset(1).csv")
    df2 = pd.read_csv("https://storage.googleapis.com/micro-access-290111/Hotel_Review_Dataset(2).csv")
    df3 = pd.read_csv("https://storage.googleapis.com/micro-access-290111/Hotel_Review_Dataset(3).csv")
    
    df = pd.concat([df1,df2,df3])
    
    class Review_recommender():
        def __init__(self,data=df):
            self.data = data
            self.clean_df = self.data.groupby('Hotel_Name').agg({'Negative_Review':', '.join,'Positive_Review':', '.join}).reset_index()
            self.clean_df['Combined_Review'] = self.clean_df['Positive_Review'].astype(str) + self.clean_df['Negative_Review'].astype(str)
            self.clean_df["Combined_Review"] = self.clean_df["Combined_Review"].apply(lambda x: x.replace("No Negative", "").replace("No Positive", ""))
            self.clean_df['Combined_Review'] = self.clean_df['Combined_Review'].apply(lambda x: " ".join(x.lower() for x in x.split()))
            stop = stopwords.words('english')
            self.clean_df['Combined_Review'] = self.clean_df['Combined_Review'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
            tfv = TfidfVectorizer(min_df=3, max_features =None,strip_accents='unicode',analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1,3))
            self.tfv_matrix = tfv.fit_transform(self.clean_df['Combined_Review'])
            self.clean_df.drop(['Combined_Review','Positive_Review','Negative_Review'],1,inplace=True)
            del tfv,self.data,stop
            self.cos_sim = linear_kernel(self.tfv_matrix, self.tfv_matrix)
            self.indices = pd.Series(self.clean_df.index, index=self.clean_df['Hotel_Name']).drop_duplicates()
        
        def recommend(self,index):
            index = self.indices[index]
            sim_scores = list(enumerate(self.cos_sim[index]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[1:11]
            hotel_indices = [i[0] for i in sim_scores]
            for i in self.clean_df['Hotel_Name'].iloc[hotel_indices]:
                print(i)
    
        name = "Hotel Arena"
        recommender = Review_recommender()
        recommender.recommend(name)
        
        dump(recommender,'{}/model/recommender_based_on_reviews.joblib'.format(out_dir))
        print("Recommendation was successful")

In [6]:
def train_b():
    # Importing needed libraries
    import numpy as np
    import pandas as pd
    from ast import literal_eval
    from nltk.corpus import stopwords 
    from nltk.tokenize import word_tokenize
    from nltk.stem.wordnet import WordNetLemmatizer
    
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'pandas==0.23.4'])
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==0.22'])
    out_dir = "https://storage.googleapis.com/micro-access-290111/Hotel_Pipeline"
    
    df1 = pd.read_csv("https://storage.googleapis.com/micro-access-290111/Hotel_Review_Dataset(1).csv")
    df2 = pd.read_csv("https://storage.googleapis.com/micro-access-290111/Hotel_Review_Dataset(2).csv")
    df3 = pd.read_csv("https://storage.googleapis.com/micro-access-290111/Hotel_Review_Dataset(3).csv")
    
    df = pd.concat([df1,df2,df3])
    
    
    # Dropping unneeded columns
    df.drop(['Unnamed: 0', 'Additional_Number_of_Scoring',
       'Review_Date','Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score',
       'days_since_review', 'lat', 'lng', 'City', 'tourist',
       'Trip_type', 'Travelling_Status', 'stay_duration', 'room_small',
       'wi_fi', 'air_conditioning', 'breakfast', 'booking_com', 'room_problem',
       'location', 'staff', 'bed_and_room', 'month', 'year'],1,inplace=True)
    

    class Tags_Country_recommender():
        def __init__(self,data=df):
            self.data = data
            # Seperating the tags components to make it clearer
            self.data['Tags'] = self.data['Tags'].apply(lambda x:"".join(literal_eval(x)))
            # Making these columns lowercase
            self.data['Country']=self.data['Country'].str.lower()
            self.data['Tags']=self.data['Tags'].str.lower()
      
        def recommend(self,location,description):   
            # Dividing the texts into small tokens (sentences into words)
            description = description.lower()
            description_tokens=word_tokenize(description)  
            sw = stopwords.words('english')  # List of predefined english stopwords to be used for computing
            lemm = WordNetLemmatizer() # This groups similar words so that it can be analyzed as a single word
        
            # We now define the functions below connecting these imported packages
            filtered_sen = {w for w in description_tokens if not w in sw}
            f_set=set()
            for fs in filtered_sen:
                f_set.add(lemm.lemmatize(fs))
        
        
        # Defining a new variable that takes in the location inputted and bring out the features defined below
        country_feat = self.data[self.data['Country']==location.lower()]
        country_feat = country_feat.set_index(np.arange(country_feat.shape[0]))
        cos=[]
        for i in range(country_feat.shape[0]):
            country_tokens=word_tokenize(country_feat['Tags'][i])
            filtered_set={w for w in country_tokens if not w in sw}
            t_set=set()
            for te in filtered_set:
                t_set.add(lemm.lemmatize(te))
            rvector = t_set.intersection(f_set)
            cos.append(len(rvector))
        country_feat['similarity']=cos
        country_feat=country_feat.sort_values(by='similarity',ascending=False)
        country_feat.drop_duplicates(subset='Hotel_Name',keep='first',inplace=True)
        country_feat.sort_values('Average_Score',ascending=False,inplace=True)
        country_feat.reset_index(inplace=True)
        # Printing top 10 recommendations based on the country and descriptions given
        # Prints out both the hotel name and its location
        for i in range(10):
            print (f'We recommend {country_feat.iloc[i,3]} located at {country_feat.iloc[i,1]}')   
            
        
        recommender = Tags_Country_recommender()
        recommender.recommend('Netherlands','I am going on a business trip, I need a standard room and i am staying for two nights ')
        # Saving the model as a joblib file
        from joblib import load,dump

        dump(recommender,'https://storage.googleapis.com/micro-access-290111/Hotel_Pipeline/model/recommender_based_tags_and_countries.joblib')

In [7]:
# Create train lightweight components.
train_op = comp.func_to_container_op(train , base_image = "tensorflow/tensorflow:latest-gpu-py3")
train_b_op = comp.func_to_container_op(train_b , base_image = "tensorflow/tensorflow:latest-gpu-py3")

In [8]:
import kfp.dsl as dsl
@dsl.pipeline(
   name='Hotel Recommendation pipeline',
   description='A recommender system pipeline that performs recommendation system algorithms.'
)
def reco_container_pipeline():
    train_task = train_op()
    train_b_task = train_b_op()

In [10]:
#Specify pipeline argument values
arguments = out_dir
pipeline_func = reco_container_pipeline
experiment_name = 'hotel_recommender_kubeflow'
run_name = pipeline_func.__name__ + ' run'



# Compile pipeline to generate compressed YAML definition of the pipeline.
kfp.compiler.Compiler().compile(pipeline_func, '{}.zip'.format(experiment_name))

#Submit a pipeline run
kfp.Client(host='6de007513ae0f025-dot-us-central2.pipelines.googleusercontent.com').create_run_from_pipeline_func(reco_container_pipeline, arguments=arguments)

AttributeError: 'str' object has no attribute 'items'