# Libraries

In [197]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [198]:
# Basic libraries
import numpy as np
import pandas as pd

import itertools

# Dataset imports
import json

# For restoring the dataset
from copy import deepcopy

# Text manipulations
import re

# TF-IDF
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity
from numpy.linalg import norm

# NLTK - Natural Language toolkit

In [199]:
!python -m pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


A library for NLP.
* Usage:
    * nltk.corpus.**stopwords**: stopwords of specific language
    * nltk.tokenize.**RegexpTokenizer**: Tokenize the input sentences
    * nltk.stem.**WordNetLemmatizer**: Lemmatize the word net

In [200]:
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Pac

In [201]:
from google.colab import drive
drive.mount('/content/drive')
PATH = "/content/drive/MyDrive/MachineLearningTP/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Redial Parser
A separated library for parsing the redial dataset

class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [202]:
def load_data(path):
    """
    TODO: initialization function for dataset reads

        :arg
            path (str): Dataset path.
        :return
            tuple: (train, test, df_mention)
    """
    train_data = []
    for line in open(f"{path}/train_data.jsonl", "r"):
        train_data.append(json.loads(line))

    test_data = []
    for line in open(f"{path}/test_data.jsonl", "r"):
        test_data.append(json.loads(line))

    mention_dataframe = pd.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe

def predict_rating(grouped_arr, word_sim_arr):
    ratings_pred = grouped_arr.dot(word_sim_arr) / np.array([np.abs(word_sim_arr).sum(axis=1)])
    return ratings_pred



class RedialParser:
    def __init__(self, path):
        self.train, self.test, self.movie = load_data(path)

        self._global_movie_list = None  # list of all movie names (global movie name data)
        self._global_msg_list = None  # list of whole lines (global line data)
        self._local_movie_list = None  # list of movie names (local movie name data)
        self._local_msg_list = None  # list of lines (local line data)

        self.dialog_df = None  # Sum of dialogs for each movie indices

        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        self.__model = None


    def Restore(self):
        """
        TODO: Restore train, test, and movie dataset to initial state
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)


    def Movies(self, train=True) -> dict:
        """
        TODO: Get movie list in dataset

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}

        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result


    def describe(self):
        """
        TODO: Describe its datasets
        """
        len1, len2 = len(self.train), len(self.test)
        n1, n2 = 0, 0
        m1, m2 = 0, 0

        for e in self.train:
            n1 += len(e['movieMentions'])
            m1 += len(e['messages'])
        for e in self.test:
            n2 += len(e['movieMentions'])
            m2 += len(e['messages'])

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')
    

    def preprocessing(self):
        """
        TODO: Regroup train dataset into purposed structure and clean up data
        """
        compile = re.compile("\W+")  # Format
        
        ran = range(len(self.train))

        # initialize list
        self._global_movie_list = []
        self._global_msg_list = []
        self._local_movie_list = [[] for _ in ran]
        self._local_msg_list = [[] for _ in ran]

        for i, data in enumerate(self.train):
            for msg in data['messages']:  # append line to the lists
                self._local_msg_list[i].append(msg['text'])
                self._global_msg_list.append(msg['text'])

            # Extract movie indices
            for idx, line in enumerate(self._local_msg_list[i]):
                numbers = re.findall(r'@\d+', line)  # find number keywords (ex: @12345)
                for number in numbers:
                    self._local_movie_list[i].append(number[1:])
                    self._global_movie_list.append(number[1:])

                    # Remove index string
                    pos = line.index(number)
                    line = self._local_msg_list[i][idx] = line[0: pos] + line[pos + len(number): len(line)]

                # Post: clear meaningless words
                a = compile.sub(" ", line)  # Clear special character
                self._local_msg_list[i][idx] = a.lower()  # lower character

        # Construct dialog dataframe
        self.dialog_df = pd.DataFrame(columns=["movieid", "dialog"])

        for lines, movies in zip(self._local_msg_list, self._local_movie_list):
            dig = ''
            for line in lines:  # concatenate all sentences in related message dialog
                dig += ' ' + str(line)
            
            for mv in movies:
                newrow = pd.DataFrame({'movieid': [mv], 'dialog': [dig]}, columns=self.dialog_df.columns)
                self.dialog_df = pd.concat([self.dialog_df, newrow], ignore_index=True)
        
        # Fill NaN with empty sentence
        self.dialog_df['dialog'].fillna('', inplace=True)
    

    def get_frequency_matrix(self, tags):
        """
        TODO: compute the frequency of tag words to obtain the TF-IDFs matrix

            :arg
                tags (list): list of key words.
            :return
                pandas.DataFrame: frequency matrix of tag words.
        """
        stop_word_eng = set(stopwords.words('english'))
        ran = range(len(self.train))

        msg_list = deepcopy(self._local_msg_list)

        for i in ran:
            msg_list[i] = [j for j in msg_list[i] if j not in stop_word_eng]  # Clear stopwords

        # Lemmatizer class
        lemmatizer = WordNetLemmatizer()
        token = RegexpTokenizer('[\w]+')

        # mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
        x = pd.DataFrame(columns=['id'] + tags)

        for idx, msg in enumerate(msg_list):
            result_pre_lem = [token.tokenize(j) for j in msg]
            middle_pre_lem = [r for j in result_pre_lem for r in j]
            final_lem = [lemmatizer.lemmatize(j) for j in middle_pre_lem if not j in stop_word_eng]  # Remove stopword

            # Lemmatization
            english = pd.Series(final_lem)
            for word in english:
                if word in tags:
                    for movie in self._local_movie_list[idx]:
                        if x[x['id'] == movie].empty:
                            new_row = pd.DataFrame({'id': [movie]}, columns=x.columns)
                            x = pd.concat([x, new_row], ignore_index=True)
                            x.fillna(0, inplace=True)
                        x.loc[x['id'] == movie, word] += 1

        return x
    
    def get_tfidf_matrix(self, **tfidf_keys):
        """
        TODO: Compute TF-IDFs matrix

            :arg
                tfidf_keys(keyword dict): TfidfVectorizer parameters
            :return
                numpy.ndrarry: TF-IDFs matrix
                numpy.ndarray: feature name of TF-IDFs (word)
        """
        # Vectorizer class
        tfidf = TfidfVectorizer(**tfidf_keys)  # Ignore English Stopwords

        # Obtain matrix
        tfidf_df = tfidf.fit_transform(self.dialog_df['dialog'])

        return tfidf_df.toarray(), tfidf.get_feature_names_out()

    def create_evaluation_matrix(self):
        result = []
        for dialog in self.train:
          for respondent in dialog['respondentQuestions']:
            result.append([respondent,
                        float(dialog['respondentQuestions'][respondent]['suggested']+dialog['respondentQuestions'][respondent]['seen']),
                        float(dialog['respondentQuestions'][respondent]['seen']),
                        float(dialog['respondentQuestions'][respondent]['suggested']+dialog['respondentQuestions'][respondent]['seen']+dialog['respondentQuestions'][respondent]['liked'])])
          for intq in dialog['initiatorQuestions']:
            result.append([intq,
                       float(dialog['initiatorQuestions'][intq]['suggested']+dialog['initiatorQuestions'][intq]['seen']),
                       float(dialog['initiatorQuestions'][intq]['seen']),
                       float(dialog['initiatorQuestions'][intq]['suggested']+dialog['initiatorQuestions'][intq]['seen']+dialog['initiatorQuestions'][intq]['liked'])])
        
        test_data = pd.DataFrame(result, columns=["movie_id","suggested","seen","liked"] )
        print(test_data.values[0])
        grouped_data = test_data.groupby('movie_id').mean()
        scaler = StandardScaler()
        grouped_data = pd.DataFrame(scaler.fit_transform(grouped_data), columns = ["suggested","seen","liked"])
        print(grouped_data.values[0])
        reformat = []
        for data in grouped_data.values:
          reformat.append([data[0]*data[1], data[1], data[1]*data[2]])

        test_grouped = pd.DataFrame(reformat, columns= ["suggested","seen","liked"], index = grouped_data.index)

        eval_sim = cosine_similarity(test_grouped, test_grouped)
        test_sim = pd.DataFrame(eval_sim, index = test_grouped.index, columns=test_grouped.index)

        test_pred = predict_rating(test_grouped.transpose().values, test_sim.values)
        test_pred = pd.DataFrame(test_pred, index = test_grouped.transpose().index, columns=test_grouped.transpose().columns)
        test_data = test_pred.transpose()

        return test_data
    

    def similarity(X, Y):
        """
        TODO: Compute the cosine simliarity between X and Y. For avoiding the DivByZero, the denominator has 1e-7 minimum value.

            :arg
                X (numpy.ndarray): X data array
                Y (numpy.ndarray): Y data array
            :return
                float
        """
        return np.dot(X, Y) / ((norm(X) * norm(Y)) + 1e-7)




# Initialize
Import dataset, describe it briefly.

In [203]:
parser = RedialParser(PATH+'dataset')
parser.describe()  # Describe read dataset

# Size of train data
num = len(parser.train)
print(f'length of train dataset: {num}')

Brief information:
Length of train data: 10006
Length of test data: 1342

Data information:
Key parameters: ['movieMentions', 'respondentQuestions', 'messages', 'conversationId', 'respondentWorkerId', 'initiatorWorkerId', 'initiatorQuestions']
Key parameters in Questions: ['suggested', 'seen', 'liked']
Key parameters in messages: ['timeOffset', 'text', 'senderWorkerId', 'messageId']

Context information:
Total mentioned movie number (train): 52918
Total mentioned movie number in unique (train): 6223
Total message number (train): 182150
Total mentioned movie number (test): 7154
Total mentioned movie number in unique (test): 2007
Total message number (test): 23952
Average mentioned movie numbers per conversation (train): 5.288626823905656
Average message numbers per conversation (train): 18.20407755346792
Average mentioned movie numbers per conversation (test): 5.330849478390462
Average message numbers per conversation (test): 17.847988077496275

length of train dataset: 10006


# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [i like animations like  and ], [84779, 191602]


Specific:
* Transform dataset structure.
    * Original: [movieMentions, {messages}, conversationId, ...]
    * Transformed: [movie_indices], [message_contexts], [[1st_movie_index], [2nd_...], ...], [[1st_message_context], [2nd_...], ...]
    * Dialog Dataframe (*self.dialog_df*): {'movie_id': '1st message' + '2nd message' + ...} - Used in generation of **TF-IDF** matrix
* Recognize movie indices
    * **@** recognition: use re library's *findall(@\d+)* function, it only detects '@' + index strings.
* Clean up meaningless values
    * Special characters: use re library's format *\w+*, it only receives widechar characters.
    * Movie index: remove context of them by using text slicing.

In [204]:
parser.preprocessing()
parser.dialog_df

Unnamed: 0,movieid,dialog
0,84779,hi there how are you i m looking for movie re...
1,191602,hi there how are you i m looking for movie re...
2,122159,hi there how are you i m looking for movie re...
3,165710,hi there how are you i m looking for movie re...
4,151313,hi there how are you i m looking for movie re...
...,...,...
64456,204974,what type of movies do you like hi i m looki...
64457,85036,hello hi how can i help you so some of the m...
64458,170277,hello hi how can i help you so some of the m...
64459,149938,hello hi how can i help you so some of the m...


# Tokenization
* 1. Extract words and their counts related to the movies. (Did not used, only for eye inspection.)

In [205]:
# Tag words words related with movie genres
mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
frequency = parser.get_frequency_matrix(mv_tags)
frequency.describe()

Unnamed: 0,comedy,scary,love,animation,artistic,war,sci,blood,hero,romantic,action
count,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0
mean,5.840227,1.32523,7.919035,0.204666,0.006469,0.583023,0.990982,0.059792,0.304058,1.089982,3.916095
std,21.739531,10.106137,20.996916,1.584999,0.091594,4.267986,6.156864,0.447819,2.396133,6.209539,19.184717
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,449.0,494.0,380.0,64.0,2.0,175.0,160.0,15.0,81.0,153.0,673.0


* 2. Normal TF-IDF

In [206]:
tfidf_mat, tfidf_columns = parser.get_tfidf_matrix(stop_words='english', min_df=0.2)

# Construct dataset with id + word vectors
cdata = np.concatenate((parser.dialog_df['movieid'].to_numpy().reshape(len(parser.dialog_df['dialog']), 1), tfidf_mat), axis=1)
df_mv_tfidf = pd.DataFrame(cdata, columns=['id'] + tfidf_columns.tolist())
df_mv_tfidf

Unnamed: 0,id,bye,check,comedy,day,did,enjoy,funny,good,great,...,saw,seen,suggestions,sure,thank,thanks,think,ve,watch,yes
0,84779,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
1,191602,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
2,122159,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
3,165710,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
4,151313,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,204974,0.0,0.0,0.0,0.0,0.173777,0.0,0.0,0.242174,0.0,...,0.182008,0.085112,0.297298,0.0,0.0,0.11514,0.0,0.0,0.139028,0.0
64457,85036,0.0,0.0,0.0,0.4115,0.0,0.248506,0.0,0.170075,0.070963,...,0.127822,0.059773,0.0,0.253538,0.290231,0.0,0.304353,0.0,0.292913,0.178612
64458,170277,0.0,0.0,0.0,0.4115,0.0,0.248506,0.0,0.170075,0.070963,...,0.127822,0.059773,0.0,0.253538,0.290231,0.0,0.304353,0.0,0.292913,0.178612
64459,149938,0.0,0.0,0.0,0.4115,0.0,0.248506,0.0,0.170075,0.070963,...,0.127822,0.059773,0.0,0.253538,0.290231,0.0,0.304353,0.0,0.292913,0.178612


* 3. dataframe for collaborative filtering

In [207]:
# use tf_idf matrix for collaborative filtering
df_grouped = df_mv_tfidf.groupby('id').mean()
movie_sim = cosine_similarity(df_grouped, df_grouped)
df_movie_sim = pd.DataFrame(movie_sim, index = df_grouped.index, columns=df_grouped.index)
df_movie_sim

id,100026,100030,100043,100070,100074,100106,100165,100178,100183,100228,...,99809,99812,99824,99887,99896,99910,99955,99966,99975,99998
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100026,1.000000,0.640556,0.698649,0.635652,0.380980,0.677651,0.368540,0.432183,0.527977,0.302225,...,0.601344,0.506582,0.376885,0.679183,0.643082,0.582633,0.374284,0.382245,0.619037,0.551563
100030,0.640556,1.000000,0.861256,0.852374,0.561998,0.692041,0.511145,0.730180,0.669948,0.598548,...,0.791803,0.591010,0.681935,0.757199,0.884000,0.946926,0.591681,0.652669,0.728449,0.598096
100043,0.698649,0.861256,1.000000,0.728555,0.464854,0.689502,0.558155,0.676466,0.622988,0.361951,...,0.734439,0.519117,0.551773,0.682184,0.825810,0.790133,0.518141,0.628464,0.723657,0.555310
100070,0.635652,0.852374,0.728555,1.000000,0.623706,0.677486,0.448280,0.682786,0.687993,0.434146,...,0.793041,0.564328,0.659829,0.662480,0.879297,0.847523,0.588893,0.532482,0.709021,0.665099
100074,0.380980,0.561998,0.464854,0.623706,1.000000,0.535645,0.463760,0.474134,0.648490,0.485631,...,0.487325,0.713150,0.512147,0.325547,0.596476,0.527345,0.386010,0.317815,0.482237,0.475448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99910,0.582633,0.946926,0.790133,0.847523,0.527345,0.663338,0.494794,0.701171,0.677470,0.584791,...,0.765500,0.540917,0.644485,0.738113,0.879484,1.000000,0.591277,0.553885,0.703213,0.544914
99955,0.374284,0.591681,0.518141,0.588893,0.386010,0.614015,0.246867,0.601429,0.432864,0.211761,...,0.701047,0.397917,0.570580,0.450412,0.706223,0.591277,1.000000,0.400968,0.567933,0.350425
99966,0.382245,0.652669,0.628464,0.532482,0.317815,0.485863,0.398811,0.791204,0.410389,0.253259,...,0.462083,0.349901,0.627416,0.600905,0.625149,0.553885,0.400968,1.000000,0.580344,0.368045
99975,0.619037,0.728449,0.723657,0.709021,0.482237,0.676243,0.412914,0.649854,0.648926,0.305637,...,0.655654,0.553286,0.487914,0.590497,0.786145,0.703213,0.567933,0.580344,1.000000,0.440597


In [208]:
# creating dataframe for collaborative filtering
movie_pred = predict_rating(df_grouped.transpose().values, df_movie_sim.values)
df_movie_pred = pd.DataFrame(movie_pred, index = df_grouped.transpose().index, columns=df_grouped.transpose().columns)
collab_data = df_movie_pred.transpose()
collab_data

Unnamed: 0_level_0,bye,check,comedy,day,did,enjoy,funny,good,great,haven,...,saw,seen,suggestions,sure,thank,thanks,think,ve,watch,yes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100026,0.103401,0.081835,0.048197,0.068462,0.063878,0.055221,0.050704,0.244238,0.143483,0.089078,...,0.058011,0.190042,0.070583,0.056314,0.082957,0.094516,0.086508,0.069470,0.093476,0.097901
100030,0.101919,0.080110,0.052971,0.070745,0.061507,0.056437,0.057614,0.236322,0.145211,0.084567,...,0.051266,0.188939,0.069369,0.057754,0.080270,0.096199,0.087264,0.069476,0.089352,0.099339
100043,0.102795,0.078741,0.048672,0.070598,0.062685,0.056423,0.053534,0.240038,0.146550,0.083602,...,0.053079,0.189701,0.068338,0.056519,0.081057,0.095582,0.087821,0.069853,0.088869,0.101065
100070,0.101648,0.081032,0.049612,0.070694,0.061316,0.056027,0.052169,0.236149,0.144898,0.083513,...,0.050863,0.187530,0.069027,0.057165,0.080583,0.096466,0.088147,0.069442,0.090762,0.097415
100074,0.100064,0.083943,0.049436,0.075159,0.059720,0.062930,0.052544,0.230826,0.153084,0.090796,...,0.050598,0.192722,0.071444,0.055824,0.079443,0.099817,0.085871,0.070057,0.090586,0.098584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99910,0.101661,0.080728,0.054608,0.070604,0.061595,0.056174,0.058986,0.234050,0.144304,0.084999,...,0.051363,0.190321,0.068702,0.056780,0.080054,0.096306,0.088233,0.070907,0.088887,0.098304
99955,0.110777,0.082274,0.047823,0.072243,0.059380,0.055418,0.050957,0.235460,0.141891,0.082618,...,0.050137,0.187708,0.068159,0.060834,0.079818,0.098505,0.085613,0.068826,0.087933,0.095781
99966,0.105386,0.077434,0.048492,0.070535,0.059641,0.055735,0.052078,0.233342,0.149944,0.082626,...,0.049735,0.187010,0.070110,0.055713,0.082309,0.094496,0.085596,0.067740,0.090966,0.099518
99975,0.104932,0.081638,0.048423,0.068492,0.060820,0.055308,0.051679,0.238746,0.142142,0.087153,...,0.051523,0.189476,0.068428,0.055646,0.079350,0.096542,0.088116,0.069690,0.087340,0.100801


# Similarity Metrics
* Cosine similarity

In [209]:
# Note: the consine similarity function's denominator has 1e-7 minimum value to avoid the divbyzero.
c_sim = lambda X, Y: np.dot(X, Y) / (1e-7 + norm(X) * norm(Y))

Recommendation function
* param:
    * data: array, vector space of texts.
    * mv: target movie's index
    * length: maximum length of recommendation
        * default: 5
    * simf: consine similarity function
        * default: dot(X, y) / (normalize(X) * normalize(Y) + 1e-7)

In [210]:
def recommend(data, mv, length=5, simf=c_sim):
    sim = []

    if parser.dialog_df.loc[parser.dialog_df['movieid'] == mv].empty:
        return sim
    
    idx = parser.dialog_df[parser.dialog_df['movieid'] == mv].index.values[0]

    for i in range(len(data)):
        if idx != i:
            sim.append((simf(data[i], data[idx]), parser.dialog_df.loc[i]['movieid']))
    
    sim.sort()
    sim.reverse()
    return sim[:length]

In [211]:
pd.DataFrame(recommend(tfidf_mat, '80067', 10), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,1.0,81792
1,1.0,182731
2,1.0,154844
3,1.0,133249
4,0.803074,195979
5,0.803074,188057
6,0.803074,170305
7,0.803074,148904
8,0.803074,130080
9,0.794933,205735


In [212]:
pd.DataFrame(recommend(collab_data.values,'80067',10), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,0.999927,82476
1,0.999927,190285
2,0.999925,168289
3,0.999923,148505
4,0.999922,79968
5,0.99992,83905
6,0.999919,164872
7,0.999918,152847
8,0.999918,154069
9,0.999917,191602


# Evaluation with test data

In [213]:
test_df = parser.create_evaluation_matrix()
test_df

['203371' 1.0 0.0 2.0]
[1.33668976 0.74041903 0.76857219]


Unnamed: 0,suggested,seen,liked
0,0.992990,0.419493,0.956719
1,-0.610773,0.695204,-0.535894
2,0.028872,0.702101,0.041611
3,0.634794,-0.687416,0.568101
4,0.690525,-0.694168,0.543834
...,...,...,...
6203,-0.011038,0.708648,0.025419
6204,-0.028872,-0.702101,-0.041611
6205,1.070435,0.312572,1.094999
6206,-0.690525,0.694168,-0.543834


In [214]:
pd.DataFrame(recommend(test_df.values,'80067',30), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,0.999974,95495
1,0.99996,95629
2,0.999889,83552
3,0.99982,161089
4,0.999786,90766
5,0.999786,155989
6,0.99977,81792
7,0.999736,162882
8,0.999652,79278
9,0.999592,141101
