# Libraries

In [1]:
# Basic libraries
import numpy as np
import pandas as pd

import itertools

# Dataset imports
import json

# For restoring the dataset
from copy import deepcopy

# Text manipulations
import re

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity
from numpy.linalg import norm

# File download (Golve)
import os.path
from urllib.request import urlretrieve
import zipfile

* Download File: GloVe6B.zip - glove100d

In [2]:
filepath = '../train/glove.6B.100d.txt'

if os.path.exists(filepath) is False:
    urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", filename="../train/glove.6B.zip")
    zf = zipfile.ZipFile('../train/glove.6B.zip')
    zf.extractall() 
    zf.close()

# Library installation
* NLTK - Natural Language toolkit
* NetworkX - Structure, Dynamics, and Functions of complex networks Library

In [3]:
!python -m pip install nltk
!python -m pip install networkx



NLTK: Library for NLP Process
* Usage:
    * nltk.corpus.**stopwords**: stopwords of specific language
    * nltk.tokenize.**RegexpTokenizer, sent_tokenize, word_tokenize**: Tokenize the input sentences
    * nltk.stem.**WordNetLemmatizer**: Lemmatize the word net

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

NetworkX: Library for PageRank(TextRank)

In [5]:
import networkx as nx

# Redial Parser
A separated library for parsing the redial dataset

class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [6]:
def load_data(path):
    """
    TODO: initialization function for dataset reads

        :arg
            path (str): Dataset path.
        :return
            tuple: (train, test, df_mention)
    """
    train_data = []
    for line in open(f"{path}/train_data.jsonl", "r"):
        train_data.append(json.loads(line))

    test_data = []
    for line in open(f"{path}/test_data.jsonl", "r"):
        test_data.append(json.loads(line))

    mention_dataframe = pd.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe


class RedialParser:
    def __init__(self, path):
        self.train, self.test, self.movie = load_data(path)

        self._global_movie_list = None  # list of all movie names (global movie name data)
        self._global_msg_list = None  # list of whole lines (global line data)
        self._local_movie_list = None  # list of movie names (local movie name data)
        self._local_msg_list = None  # list of lines (local line data)

        self.dialog_df = None  # Sum of dialogs for each movie indices

        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        # Import 100-D GloVe Embedding Vector
        self.__glove_dict = dict()
        f = open('../train/glove.6B.100d.txt', encoding="utf8")

        for line in f:
            word_vector = line.split()
            word = word_vector[0]
            word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
            self.__glove_dict[word] = word_vector_arr
        f.close()


    def Restore(self):
        """
        TODO: Restore train, test, and movie dataset to initial state
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)


    def Movies(self, train=True) -> dict:
        """
        TODO: Get movie list in dataset

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}

        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result


    def describe(self):
        """
        TODO: Describe its datasets
        """
        len1, len2 = len(self.train), len(self.test)
        n1, n2 = 0, 0
        m1, m2 = 0, 0

        for e in self.train:
            n1 += len(e['movieMentions'])
            m1 += len(e['messages'])
        for e in self.test:
            n2 += len(e['movieMentions'])
            m2 += len(e['messages'])

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')
    

    def preprocessing(self):
        """
        TODO: Regroup train dataset into purposed structure and clean up data
        """
        
        ran = range(len(self.train))

        # initialize list
        self._global_movie_list = []
        self._global_msg_list = []
        self._local_movie_list = [[] for _ in ran]
        self._local_msg_list = [[] for _ in ran]

        for i, data in enumerate(self.train):
            for msg in data['messages']:  # append line to the lists
                self._local_msg_list[i].append(msg['text'])
                self._global_msg_list.append(msg['text'])

            # Extract movie indices
            for idx, line in enumerate(self._local_msg_list[i]):
                numbers = re.findall(r'@\d+', line)  # find number keywords (ex: @12345)
                for number in numbers:
                    self._local_movie_list[i].append(number[1:])
                    self._global_movie_list.append(number[1:])

                    # Remove index string
                    pos = line.index(number)
                    line = self._local_msg_list[i][idx] = line[0: pos] + line[pos + len(number): len(line)]

        # Construct dialog dataframe
        self.dialog_df = pd.DataFrame(columns=["movieid", "dialog"])

        for lines, movies in zip(self._local_msg_list, self._local_movie_list):
            dig = ''
            for line in lines:  # concatenate all sentences in related message dialog
                dig += ' ' + str(line)
            
            for mv in movies:
                if self.dialog_df[self.dialog_df['movieid'] == mv].empty:
                    newrow = pd.DataFrame({'movieid': [mv], 'dialog': [dig]}, columns=self.dialog_df.columns)
                    self.dialog_df = pd.concat([self.dialog_df, newrow], ignore_index=True)
                else:
                    target = self.dialog_df[self.dialog_df['movieid'] == mv].index[0]
                    self.dialog_df.iloc[target, 1] = self.dialog_df.iloc[target, 1] + ' ' + dig
        
        # Drop NaN with empty sentence
        self.dialog_df['dialog'].dropna(how='any', inplace=True)

    
    def make_summary(self):
        """
        TODO: make summary of dialog using GloVe + TextRank
        """
        self.dialog_df['sentences'] = self.dialog_df['dialog'].apply(sent_tokenize)

        stop_words = stopwords.words('english')

        # tokenization
        def tokenization(sentences):
            return [word_tokenize(sentence) for sentence in sentences]

        # Preprocessing
        def preprocess_sentence(sentence):
            # lower case
            sentence = [re.sub(r'[^a-zA-z\s]', '', word).lower() for word in sentence]
            # remove stopwords
            return [word for word in sentence if word not in stop_words and word]

        # Do preproessing for all sentences
        def preprocess_sentences(sentences):
            return [preprocess_sentence(sentence) for sentence in sentences]

        self.dialog_df['tokenized_sentences'] = self.dialog_df['sentences'].apply(tokenization)
        self.dialog_df['tokenized_sentences'] = self.dialog_df['tokenized_sentences'].apply(preprocess_sentences)

        # Embedding Dimension = 100 = GloVe dimension
        embedding_dim = 100
        zero_vector = np.zeros(embedding_dim)

        # Obtain the sentence vector from the mean of words
        def calculate_sentence_vector(sentence):
            if len(sentence) != 0:
                return sum([self.__glove_dict.get(word, zero_vector) for word in sentence]) / len(sentence)
            else:
                return zero_vector
        
        def sentences_to_vectors(sentences):
            return [calculate_sentence_vector(sentence) for sentence in sentences]
        
        sentence_range = range(101)
        drop_list = []

        for idx, val in enumerate(self.dialog_df['tokenized_sentences'].values.tolist()):
            if len(val) not in sentence_range:
                drop_list.append(self.dialog_df.iloc[idx, 0])
        
        for id in drop_list:
            drop_id = self.dialog_df[self.dialog_df['movieid'] == id].index
            self.dialog_df.drop(drop_id, inplace=True)

        # Do sentence embedding
        self.dialog_df['SentenceEmbedding'] = self.dialog_df['tokenized_sentences'].apply(sentences_to_vectors)
        self.dialog_df[['SentenceEmbedding']]

        def similarity_matrix(sentence_embedding):
            length = len(sentence_embedding)
            sim_mat = np.zeros([length, length])

            for i in range(length):
                for j in range(length):
                    sim_mat[i][j] = cosine_similarity(sentence_embedding[i].reshape(1, embedding_dim), sentence_embedding[j].reshape(1, embedding_dim))[0, 0]
            return sim_mat
        
        # Get similarity matrix
        self.dialog_df['SimMatrix'] = self.dialog_df['SentenceEmbedding'].apply(similarity_matrix)

        # TextRank
        def calculate_score(sim_matrix):
            nx_graph = nx.from_numpy_array(sim_matrix)
            scores = nx.pagerank_numpy(nx_graph)
            return scores
        
        self.dialog_df['score'] = self.dialog_df['SimMatrix'].apply(calculate_score)
        
        # Write summary using TextRank score
        def ranked_sentences(sentences, scores, n=3):
            top_scores = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
            top_n_sentences = [sentence for score, sentence in top_scores[:n]]
            return " ".join(top_n_sentences)

        self.dialog_df['summary'] = self.dialog_df.apply(lambda x: ranked_sentences(x.sentences, x.score), axis=1)
    

    def get_frequency_matrix(self, tags):
        """
        TODO: compute the frequency of tag words to obtain the TF-IDFs matrix

            :arg
                tags (list): list of key words.
            :return
                pandas.DataFrame: frequency matrix of tag words.
        """
        stop_word_eng = set(stopwords.words('english'))
        ran = range(len(self.train))

        msg_list = deepcopy(self._local_msg_list)

        for i in ran:
            msg_list[i] = [j for j in msg_list[i] if j not in stop_word_eng]  # Clear stopwords

        # Lemmatizer class
        lemmatizer = WordNetLemmatizer()
        token = RegexpTokenizer('[\w]+')

        x = pd.DataFrame(columns=['id'] + tags)

        for idx, msg in enumerate(msg_list):
            result_pre_lem = [token.tokenize(j) for j in msg]
            middle_pre_lem = [r for j in result_pre_lem for r in j]
            final_lem = [lemmatizer.lemmatize(j) for j in middle_pre_lem if not j in stop_word_eng]  # Remove stopword

            # Lemmatization
            english = pd.Series(final_lem)
            for word in english:
                if word in tags:
                    for movie in self._local_movie_list[idx]:
                        if x[x['id'] == movie].empty:
                            new_row = pd.DataFrame({'id': [movie]}, columns=x.columns)
                            x = pd.concat([x, new_row], ignore_index=True)
                            x.fillna(0, inplace=True)
                        x.loc[x['id'] == movie, word] += 1

        return x
    
    def get_tfidf_matrix(self, **tfidf_keys):
        """
        TODO: Compute TF-IDFs matrix

            :arg
                tfidf_keys(keyword dict): TfidfVectorizer parameters
            :return
                numpy.ndrarry: TF-IDFs matrix
                numpy.ndarray: feature name of TF-IDFs (word)
        """
        # Vectorizer class
        tfidf = TfidfVectorizer(**tfidf_keys)  # Ignore English Stopwords

        # Obtain matrix
        tfidf_df = tfidf.fit_transform(self.dialog_df['dialog'])

        return tfidf_df.toarray(), tfidf.get_feature_names_out()


# Initialize
Import dataset, describe it briefly.

In [7]:
parser = RedialParser('../dataset')
parser.describe()  # Describe read dataset

# Size of train data
num = len(parser.train)
print(f'length of train dataset: {num}')

Brief information:
Length of train data: 10006
Length of test data: 1342

Data information:
Key parameters: ['movieMentions', 'respondentQuestions', 'messages', 'conversationId', 'respondentWorkerId', 'initiatorWorkerId', 'initiatorQuestions']
Key parameters in Questions: ['suggested', 'seen', 'liked']
Key parameters in messages: ['timeOffset', 'text', 'senderWorkerId', 'messageId']

Context information:
Total mentioned movie number (train): 52918
Total mentioned movie number in unique (train): 6223
Total message number (train): 182150
Total mentioned movie number (test): 7154
Total mentioned movie number in unique (test): 2007
Total message number (test): 23952
Average mentioned movie numbers per conversation (train): 5.288626823905656
Average message numbers per conversation (train): 18.20407755346792
Average mentioned movie numbers per conversation (test): 5.330849478390462
Average message numbers per conversation (test): 17.847988077496275

length of train dataset: 10006


# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [i like animations like  and ], [84779, 191602]


Specific:
* Transform dataset structure.
    * Original: [movieMentions, {messages}, conversationId, ...]
    * Transformed: [movie_indices], [message_contexts], [[1st_movie_index], [2nd_...], ...], [[1st_message_context], [2nd_...], ...]
    * Dialog Dataframe (*self.dialog_df*): {'movie_id': '1st message' + '2nd message' + ...} - Used in generation of **TF-IDF** matrix
* Recognize movie indices
    * **@** recognition: use re library's *findall(@\d+)* function, it only detects '@' + index strings.
* Clean up meaningless values
    * Special characters: use re library's format *\w+*, it only receives widechar characters.
    * Movie index: remove context of them by using text slicing.

In [8]:
parser.preprocessing()
parser.dialog_df

Unnamed: 0,movieid,dialog
0,84779,"Hi there, how are you? I'm looking for movie ..."
1,191602,"Hi there, how are you? I'm looking for movie ..."
2,122159,"Hi there, how are you? I'm looking for movie ..."
3,165710,"Hi there, how are you? I'm looking for movie ..."
4,151313,"Hi there, how are you? I'm looking for movie ..."
...,...,...
6217,166377,Hi Hello there I LIKE SCI-FI genetic modifica...
6218,205981,What kind of movies do you like ? hello! I am...
6219,106113,"hi HI, I like Sci-fi movies Genetic modificat..."
6220,96852,hi Hi !! have a good day which kind of movie ...


# Tokenization
* 1. Obtain keywords (summary) using **TextRank**

In [9]:
parser.make_summary()
parser.dialog_df

  scores = nx.pagerank_numpy(nx_graph)
NetworkX version 3.0.
  M = google_matrix(


Unnamed: 0,movieid,dialog,sentences,tokenized_sentences,SentenceEmbedding,SimMatrix,score,summary
0,84779,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629612, 1: 0.1115491066609117...","It is animated, sci fi, and has action Glad I ..."
1,191602,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629612, 1: 0.1115491066609117...","It is animated, sci fi, and has action Glad I ..."
2,122159,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629612, 1: 0.1115491066609117...","It is animated, sci fi, and has action Glad I ..."
4,151313,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.005749033404559646, 1: 0.012240076161428...",Like for example I like comedies but I prefer...
10,204322,"Hi, did you see ? Yes it was a pretty good m...","[ Hi, did you see ?, Yes it was a pretty good...","[[hi, see], [yes, pretty, good, movie], [would...","[[-0.197045, 0.39572, 0.75997496, 0.09388, -0....","[[0.9999998807907104, 0.6657719016075134, 0.59...","{0: 0.032654807746370675, 1: 0.040617095075637...","Hello how are you, what are your favorite kind..."
...,...,...,...,...,...,...,...,...
6217,166377,Hi Hello there I LIKE SCI-FI genetic modifica...,[ Hi Hello there I LIKE SCI-FI genetic modific...,"[[hi, hello, like, scifi, genetic, modificatio...","[[0.011004001, 0.2420111, 0.39807892, -0.12358...","[[1.0000001192092896, 0.6454883217811584, 0.66...","{0: 0.0692688154877309, 1: 0.06378854397892308...","You might would like , which is a genetic mod..."
6218,205981,What kind of movies do you like ? hello! I am...,"[ What kind of movies do you like ?, hello!, I...","[[kind, movies, like], [hello], [looking, movi...","[[-0.030508334, 0.43740702, 0.4855567, -0.4959...","[[1.0, 0.3861870765686035, 0.8655173182487488,...","{0: 0.08503767938796901, 1: 0.0571018466271147...","It was a remake, or I guess a updated version ..."
6219,106113,"hi HI, I like Sci-fi movies Genetic modificat...","[ hi HI, I like Sci-fi movies Genetic modifica...","[[hi, hi, like, scifi, movies, genetic, modifi...","[[0.06543327, 0.23472007, 0.48408842, 0.000198...","[[1.0, 0.7383857369422913, 0.8496346473693848,...","{0: 0.15428587521971038, 1: 0.1521021893291355...","Wow sounds good, haven't seen it guess i'll ha..."
6220,96852,hi Hi !! have a good day which kind of movie ...,"[ hi Hi !!, have a good day which kind of movi...","[[hi, hi], [good, day, kind, movie, like], [li...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.28580141067504883, 0.2...","{0: 0.07542525902780071, 1: 0.1299216272022606...",have a good day which kind of movie do you lik...


* 2. Extract words and their counts related to the movies. (Did not used, only for eye inspection.)

In [10]:
# Tag words words related with movie genres
mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
frequency = parser.get_frequency_matrix(mv_tags)
frequency.describe()

Unnamed: 0,comedy,scary,love,animation,artistic,war,sci,blood,hero,romantic,action
count,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0
mean,5.68189,1.295075,7.517871,0.19579,0.006553,0.447975,0.828634,0.056394,0.294281,1.016878,3.796267
std,21.056695,9.917951,19.831492,1.552968,0.092181,3.744135,5.195543,0.439369,2.343911,5.782513,18.47827
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,437.0,482.0,355.0,64.0,2.0,159.0,143.0,15.0,78.0,139.0,648.0


* 3. TextRank used TF-IDF

In [11]:
tfidf_mat, tfidf_columns = parser.get_tfidf_matrix(stop_words='english', min_df=0.2)

# Construct dataset with id + summary word vectors
cdata = np.concatenate((parser.dialog_df['movieid'].to_numpy().reshape(len(parser.dialog_df['summary']), 1), tfidf_mat), axis=1)
df_mv_tfidf = pd.DataFrame(cdata, columns=['id'] + tfidf_columns.tolist())
df_mv_tfidf

Unnamed: 0,id,action,awesome,bye,check,classic,comedy,cool,day,did,...,today,try,type,ve,want,watch,watched,welcome,yeah,yes
0,84779,0.210353,0.0,0.119524,0.0,0.0,0.196607,0.21432,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,191602,0.210353,0.0,0.119524,0.0,0.0,0.196607,0.21432,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,122159,0.210353,0.0,0.119524,0.0,0.0,0.196607,0.21432,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,151313,0.082024,0.126055,0.093213,0.080787,0.0,0.076664,0.2925,0.19893,0.098133,...,0.0,0.040855,0.0,0.0,0.0,0.105414,0.0,0.0,0.0,0.149953
4,204322,0.0,0.0,0.122967,0.0,0.0,0.101136,0.0,0.0,0.17261,...,0.0,0.0,0.0,0.0,0.0,0.208594,0.0,0.0,0.0,0.131879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4686,166377,0.0,0.0,0.146876,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.251678,0.0,0.0,0.0,0.0,0.157521
4687,205981,0.0,0.0,0.0,0.26146,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.255874,0.383466,0.0,0.0,0.0
4688,106113,0.206346,0.0,0.117247,0.0,0.191926,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.401813,0.0,0.0,0.0,0.0,0.0
4689,96852,0.17456,0.0,0.099186,0.114617,0.0,0.0,0.0,0.120958,0.0,...,0.0,0.0,0.0,0.0,0.0,0.224337,0.0,0.0,0.0,0.106374


# Similarity Metrics
* Cosine similarity

In [12]:
# Note: the consine similarity function's denominator has 1e-7 minimum value to avoid the divbyzero.
c_sim = lambda X, Y: np.dot(X, Y) / (1e-7 + norm(X) * norm(Y))

Recommendation function
* param:
    * data: array, vector space of texts.
    * mv: target movie's index
    * length: maximum length of recommendation
        * default: 5
    * simf: consine similarity function
        * default: dot(X, y) / (normalize(X) * normalize(Y) + 1e-7)

In [13]:
def recommend(df, index, matrix, length=5, simf=c_sim):
    sim = []

    if df[df['movieid'] == str(index)].empty:
        return sim

    target = df[df['movieid'] == str(index)].index[0]

    for idx, data in enumerate(matrix):
        if idx != target:
            sim.append([simf(data, matrix[target]), df.iloc[idx, 0]])
    
    sim.sort()
    sim.reverse()
    return sim[:length]

In [17]:
parser.dialog_df

Unnamed: 0,movieid,dialog,sentences,tokenized_sentences,SentenceEmbedding,SimMatrix,score,summary
0,84779,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629612, 1: 0.1115491066609117...","It is animated, sci fi, and has action Glad I ..."
1,191602,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629612, 1: 0.1115491066609117...","It is animated, sci fi, and has action Glad I ..."
2,122159,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629612, 1: 0.1115491066609117...","It is animated, sci fi, and has action Glad I ..."
4,151313,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.005749033404559646, 1: 0.012240076161428...",Like for example I like comedies but I prefer...
10,204322,"Hi, did you see ? Yes it was a pretty good m...","[ Hi, did you see ?, Yes it was a pretty good...","[[hi, see], [yes, pretty, good, movie], [would...","[[-0.197045, 0.39572, 0.75997496, 0.09388, -0....","[[0.9999998807907104, 0.6657719016075134, 0.59...","{0: 0.032654807746370675, 1: 0.040617095075637...","Hello how are you, what are your favorite kind..."
...,...,...,...,...,...,...,...,...
6217,166377,Hi Hello there I LIKE SCI-FI genetic modifica...,[ Hi Hello there I LIKE SCI-FI genetic modific...,"[[hi, hello, like, scifi, genetic, modificatio...","[[0.011004001, 0.2420111, 0.39807892, -0.12358...","[[1.0000001192092896, 0.6454883217811584, 0.66...","{0: 0.0692688154877309, 1: 0.06378854397892308...","You might would like , which is a genetic mod..."
6218,205981,What kind of movies do you like ? hello! I am...,"[ What kind of movies do you like ?, hello!, I...","[[kind, movies, like], [hello], [looking, movi...","[[-0.030508334, 0.43740702, 0.4855567, -0.4959...","[[1.0, 0.3861870765686035, 0.8655173182487488,...","{0: 0.08503767938796901, 1: 0.0571018466271147...","It was a remake, or I guess a updated version ..."
6219,106113,"hi HI, I like Sci-fi movies Genetic modificat...","[ hi HI, I like Sci-fi movies Genetic modifica...","[[hi, hi, like, scifi, movies, genetic, modifi...","[[0.06543327, 0.23472007, 0.48408842, 0.000198...","[[1.0, 0.7383857369422913, 0.8496346473693848,...","{0: 0.15428587521971038, 1: 0.1521021893291355...","Wow sounds good, haven't seen it guess i'll ha..."
6220,96852,hi Hi !! have a good day which kind of movie ...,"[ hi Hi !!, have a good day which kind of movi...","[[hi, hi], [good, day, kind, movie, like], [li...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.28580141067504883, 0.2...","{0: 0.07542525902780071, 1: 0.1299216272022606...",have a good day which kind of movie do you lik...


In [None]:
parser.dialog_df.reset_index()

In [16]:
df = pd.DataFrame(recommend(parser.dialog_df, 166377, tfidf_mat), columns=['Similarity', 'Movie Index'])
df

IndexError: index 6217 is out of bounds for axis 0 with size 4691