# Load_dataset_module

# In [4]:
# Import libraries
import glob
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

def load_dataset(dirpath):
    """Load the Book-Crossing CSV files found in *dirpath* and merge them.

    Every ``*.csv`` file in the directory is read as a ``;``-separated,
    latin-1 encoded table (malformed rows are skipped).  ``BX-Users.csv`` is
    used as the user table, ``BX-Books.csv`` as the book table and any other
    CSV file as the rating table.  Users are inner-joined to ratings on
    ``User-ID`` and the result is inner-joined to books on ``ISBN``.

    Args:
        dirpath: Directory that contains the three CSV files.

    Returns:
        A DataFrame with the columns ``User-ID``, ``ISBN``, ``Book-Title``,
        ``Book-Author``, ``Year-Of-Publication`` and ``Book-Rating``.

    Raises:
        FileNotFoundError: If the user, book or rating file is not present.
    """
    import os  # local import: the file header only imports glob and pandas

    roles_by_name = {"BX-Users.csv": "users", "BX-Books.csv": "books"}
    frames = {}

    # sorted() makes the choice deterministic if several "other" CSVs exist.
    for file in sorted(glob.glob(os.path.join(dirpath, "*.csv"))):
        # Compare bare file names: comparing full paths breaks as soon as
        # dirpath carries a trailing separator or glob normalises the path.
        role = roles_by_name.get(os.path.basename(file), "ratings")
        frames[role] = pd.read_csv(
            file, sep=";", encoding="latin-1", on_bad_lines="skip"
        )

    missing = [role for role in ("users", "books", "ratings") if role not in frames]
    if missing:
        raise FileNotFoundError(
            f"No CSV file for {', '.join(missing)} found in {dirpath!r}"
        )

    # Merge dataframes: users -> ratings -> books
    df_temp = pd.merge(frames["users"], frames["ratings"], on="User-ID", how="inner")
    df = pd.merge(df_temp, frames["books"], on="ISBN", how="inner")
    user_preference = df[
        [
            "User-ID",
            "ISBN",
            "Book-Title",
            "Book-Author",
            "Year-Of-Publication",
            "Book-Rating",
        ]
    ]

    return user_preference

# Similarity_module

# In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import haversine_distances

import warnings
warnings.filterwarnings("ignore")

# Generate the bag_of_words column
def imp_feat(df):
    """Add a ``Bag-Of-Words`` text column to *df* in place and return *df*.

    Every column of *df* is first converted to ``str`` (this mutates the
    frame that was passed in).  The new column is the concatenation, with no
    separator, of ``User-ID``, ``ISBN``, ``Book-Title``, ``Book-Author``,
    ``Year-Of-Publication`` and ``Book-Rating`` for each row.

    Args:
        df: DataFrame holding at least the six columns listed above.

    Returns:
        The same DataFrame object, now containing ``Bag-Of-Words``.

    Raises:
        KeyError: If one of the six feature columns is missing.
    """
    feature_cols = [
        "User-ID",
        "ISBN",
        "Book-Title",
        "Book-Author",
        "Year-Of-Publication",
        "Book-Rating",
    ]

    # Ensure all columns are strings (this also covers the publication year)
    for col in df.columns:
        df[col] = df[col].astype(str)

    # Build the column once with vectorised string concatenation; the old
    # per-row loop rebuilt the whole column on every iteration and never
    # created it for an empty frame.
    # NOTE(review): the fields are joined without a separator, so adjacent
    # fields fuse into one token for the vectorizer - confirm this is wanted.
    bag = df[feature_cols[0]]
    for col in feature_cols[1:]:
        bag = bag + df[col]
    df["Bag-Of-Words"] = bag

    return df

# Calculate the similarity between two users or two books with the chosen metric
def Similarity(df, arg1, arg2, user, method, obs = 100):
    """Score how alike two users (or two books) are in the first *obs* rows.

    The rows' ``Bag-Of-Words`` text (created with ``imp_feat`` when absent)
    is TF-IDF vectorised and a pairwise matrix is computed with the chosen
    metric.  When an id occurs on several rows, the result is the mean over
    every (row of arg1, row of arg2) pair.

    Args:
        df: DataFrame with the user-preference columns; it is not modified.
        arg1: User-ID or ISBN of the first item (compared as a string).
        arg2: User-ID or ISBN of the second item (compared as a string).
        user: ``"user"`` to look ids up in ``User-ID``; anything else uses
            ``ISBN``.
        method: Metric name.  Accepts ``cosine_similarity``,
            ``cosine_distances``, ``manhattan``/``manhattan_distances``,
            ``euclidean``/``euclidean_distances``,
            ``haversine``/``haversine_distances`` or the menu numbers
            ``"1"``-``"5"``.
        obs: Number of leading rows of *df* to use.

    Returns:
        The score as a float, or an explanatory string when the method is
        unknown (``"Invalid option"``), the metric cannot be computed on the
        data (``"Option not available"``) or an id is not found.
    """
    # The dataset is too big, so only the first `obs` rows are used. Copy so
    # that the columns added below never touch the caller's frame.
    df = df.head(obs).copy()

    # Create the "Bag-Of-Words" column if the frame does not have it yet
    if "Bag-Of-Words" not in df.columns:
        imp_feat(df)

    # Accept both the short names and the names/numbers shown in the menu.
    metrics = {
        "1": cosine_similarity,
        "cosine_similarity": cosine_similarity,
        "2": cosine_distances,
        "cosine_distances": cosine_distances,
        "3": manhattan_distances,
        "manhattan": manhattan_distances,
        "manhattan_distances": manhattan_distances,
        "4": euclidean_distances,
        "euclidean": euclidean_distances,
        "euclidean_distances": euclidean_distances,
        "5": haversine_distances,
        "haversine": haversine_distances,
        "haversine_distances": haversine_distances,
    }
    metric = metrics.get(str(method).strip().lower())
    if metric is None:
        # Previously this only printed and then failed on an unbound name.
        return "Invalid option"

    vector = TfidfVectorizer().fit_transform(df["Bag-Of-Words"].map(str))

    try:
        similarity = metric(vector)
    except Exception:
        # Best effort: some metrics (e.g. haversine, which needs exactly two
        # features) cannot be evaluated on TF-IDF vectors.
        return "Option not available"

    # Map ids to row POSITIONS in the similarity matrix. Index labels are not
    # used because they need not coincide with positions.
    key_col = "User-ID" if user == "user" else "ISBN"
    keys = df[key_col].astype(str).to_numpy()
    rows1 = np.flatnonzero(keys == str(arg1))
    rows2 = np.flatnonzero(keys == str(arg2))
    if rows1.size == 0 or rows2.size == 0:
        return "Kindly check the parameters provided. It is also possible that you set user to be True while it's False or vice versa"

    # Mean over every pair of matching rows; for two single matches this is
    # simply similarity[row1, row2].
    return similarity[np.ix_(rows1, rows2)].mean()

# Recommendation_module

# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import haversine_distances

def get_recommendation(df, target, user, method, obs = 100, num = 10):
    """Rank the rows closest to *target* within the first *obs* rows of *df*.

    The rows' ``Bag-Of-Words`` text (created with ``imp_feat`` when absent)
    is TF-IDF vectorised and compared with the chosen metric.  When *target*
    occurs on several rows their scores are averaged.  Rows belonging to the
    target itself are excluded from the result.

    Args:
        df: DataFrame with the user-preference columns; it is not modified.
        target: User-ID or ISBN to recommend for (compared as a string).
        user: ``"user"`` to look *target* up in ``User-ID``; anything else
            uses ``ISBN``.
        method: Metric name.  Accepts ``cosine_similarity``,
            ``cosine_distances``, ``manhattan``/``manhattan_distances``,
            ``euclidean``/``euclidean_distances``,
            ``haversine``/``haversine_distances`` or the menu numbers
            ``"1"``-``"5"``.
        obs: Number of leading rows of *df* to use.
        num: Maximum number of recommendations to return.

    Returns:
        A list of ``(row_position, score)`` tuples, best match first, where
        ``row_position`` indexes ``df.head(obs)``.  An explanatory string is
        returned instead when the method is unknown, the metric cannot be
        computed or *target* is not found.

    Raises:
        ValueError: If *num* cannot be converted to an integer.
    """
    # The dataset is too big, so only the first `obs` rows are used. Copy so
    # that the columns added below never touch the caller's frame.
    df = df.head(obs).copy()

    # Create the "Bag-Of-Words" column if the frame does not have it yet
    if "Bag-Of-Words" not in df.columns:
        imp_feat(df)

    # name -> (pairwise function, True when a LARGER score means closer)
    metrics = {
        "1": (cosine_similarity, True),
        "cosine_similarity": (cosine_similarity, True),
        "2": (cosine_distances, False),
        "cosine_distances": (cosine_distances, False),
        "3": (manhattan_distances, False),
        "manhattan": (manhattan_distances, False),
        "manhattan_distances": (manhattan_distances, False),
        "4": (euclidean_distances, False),
        "euclidean": (euclidean_distances, False),
        "euclidean_distances": (euclidean_distances, False),
        "5": (haversine_distances, False),
        "haversine": (haversine_distances, False),
        "haversine_distances": (haversine_distances, False),
    }
    entry = metrics.get(str(method).strip().lower())
    if entry is None:
        # Previously this only printed and then failed on an unbound name.
        return "Invalid option"
    metric, higher_is_closer = entry

    vector = TfidfVectorizer().fit_transform(df["Bag-Of-Words"].map(str))

    try:
        similarity = metric(vector)
    except Exception:
        # Best effort: some metrics (e.g. haversine, which needs exactly two
        # features) cannot be evaluated on TF-IDF vectors.
        return "Option not available"

    # Locate the target by row POSITION in the similarity matrix.
    key_col = "User-ID" if user == "user" else "ISBN"
    keys = df[key_col].astype(str).to_numpy()
    target_rows = np.flatnonzero(keys == str(target))
    if target_rows.size == 0:
        return "Kindly check the parameters provided. It is also possible that you set user to be True while it's False or vice versa"

    # One score per row: the mean over all rows that belong to the target.
    scores = similarity[target_rows].mean(axis=0)

    # Never recommend the target's own rows.
    candidates = np.setdiff1d(np.arange(scores.size), target_rows)
    candidate_scores = scores[candidates]
    # Similarities are ranked high-to-low, distances low-to-high.
    sort_key = -candidate_scores if higher_is_closer else candidate_scores
    ranked = candidates[np.argsort(sort_key, kind="stable")]

    limit = max(int(num), 0)
    return [(int(pos), float(scores[pos])) for pos in ranked[:limit]]

# Test_module

# In [None]:
# Function to verify input
def verify_input(message):
    """Prompt with *message* until the user types something non-empty.

    Each empty answer prints "Input cannot be empty" and asks again.

    Args:
        message: Prompt text passed to ``input``.

    Returns:
        The first non-empty string the user entered.
    """
    answer = input(message)
    while not answer:
        print("Input cannot be empty")
        answer = input(message)
    return answer
    

# Menu text shown before the metric prompt (kept exactly as originally printed).
_METHOD_MENU = (
    "Options include:\n"
    "                1. cosine_similarity\n"
    "                2. cosine_distances\n"
    "                3. manhattan_distances\n"
    "                4. euclidean_distances\n"
    "                5. haversine_distances"
)

# Everything the user may type at the metric prompt, mapped to the metric
# name that Similarity / get_recommendation compare against.
_METHOD_ALIASES = {
    "1": "cosine_similarity",
    "cosine_similarity": "cosine_similarity",
    "2": "cosine_distances",
    "cosine_distances": "cosine_distances",
    "3": "manhattan",
    "manhattan": "manhattan",
    "manhattan_distances": "manhattan",
    "4": "euclidean",
    "euclidean": "euclidean",
    "euclidean_distances": "euclidean",
    "5": "haversine",
    "haversine": "haversine",
    "haversine_distances": "haversine",
}

_OBS_PROMPT = "Kindly indicate the number of observation to include. This process is time consuming as observation increases: "


def _ask_dataset():
    """Prompt for a dataset directory and load it; return None on failure."""
    path = verify_input("Paste dataset directory path: ")
    print("Working on your input")
    try:
        return load_dataset(path)
    except Exception:
        print("Invalid path")
        return None


def _ask_method():
    """Show the metric menu; return the chosen metric name or None if invalid."""
    print(_METHOD_MENU)
    choice = verify_input("Select method: ").strip().lower()
    method = _METHOD_ALIASES.get(choice)
    if method is None:
        print("Invalid response")
    return method


def _run_similarity(user_preference):
    """Collect the similarity inputs; return the result text or None to retry."""
    kind = verify_input("What are you comparing? user/book: ")
    if kind == "user":
        target = verify_input("Target user's id: ")
        other = verify_input("Other user's id: ")
    elif kind == "book":
        target = verify_input("Target book's ISBN: ")
        other = verify_input("Other book's ISBN: ")
    else:
        print("Invalid options")
        return None

    try:
        obs = int(verify_input(_OBS_PROMPT))
    except ValueError:
        print("Number of observations can only be an integer")
        return None

    method = _ask_method()
    if method is None:
        return None

    score = Similarity(user_preference, target, other, kind, method, obs)
    try:
        return f"{method} score: {round(score, 2)}"
    except TypeError:
        # Similarity reports problems as a plain string, which cannot be
        # rounded; show it and let the user try again.
        print(score)
        return None


def _run_recommendation(user_preference):
    """Collect the recommendation inputs; return the result or None to retry."""
    kind = verify_input("What are you recommending? user/book: ")
    if kind == "user":
        target = verify_input("Target user's id: ")
    elif kind == "book":
        target = verify_input("Target book's ISBN: ")
    else:
        print("Invalid option")
        return None

    # A non-integer answer raises ValueError, which test_module reports.
    obs = int(verify_input(_OBS_PROMPT))

    method = _ask_method()
    if method is None:
        return None

    try:
        # Kept separate from `obs`: the old code overwrote `obs` here and
        # passed the menu choice as the number of recommendations.
        count = int(verify_input("How many items should i recommend?"))
    except ValueError:
        print("Number of items can only be an integer")
        return None

    return get_recommendation(user_preference, target, kind, method, obs, count)


def test_module():
    """Interactive console driver for the loader, similarity and recommender.

    Repeatedly asks what to do until an action succeeds or the user quits:

    * ``1`` - load the dataset and return the merged user-preference frame.
    * ``2`` - compare two users/books and return ``"<method> score: <x>"``.
    * ``3`` - return the value produced by ``get_recommendation``.
    * ``q`` - return ``"Bye-bye!"``.

    Invalid input prints a message and restarts from the main prompt.
    Exceptions are caught with ``except Exception`` so that Ctrl-C still
    interrupts the loop.

    Returns:
        The result of the selected action, as described above.
    """
    print("Welcome")
    while True:
        response = verify_input("Press 1 to check for user preference, 2 for similarity, 3 to for recommendation or  'q' to quit: ")
        if response.lower() == "q":
            return "Bye-bye!"
        try:
            choice = int(response)
        except ValueError:
            print("Input must be an integer")
            continue

        if not 0 < choice < 4:
            print("Invalid response")
            continue

        # Every action starts by loading the dataset.
        user_preference = _ask_dataset()
        if user_preference is None:
            continue
        if choice == 1:
            return user_preference

        try:
            if choice == 2:
                result = _run_similarity(user_preference)
            else:
                result = _run_recommendation(user_preference)
        except Exception:
            print("Make sure to enter all the inputs correctly")
            continue

        # None means the helper already reported a problem; ask again.
        if result is not None:
            return result

# TEST CODE

# In [None]:
# test_module()