# Libraries

In [1]:
# Basic libraries
import numpy as np
import pandas as pd

# Dataset imports
import json

# For restoring the dataset
from copy import deepcopy

# Text manipulations
import re

# TF-IDF
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity
from numpy.linalg import norm

# File download (Golve)
import os.path
from urllib.request import urlretrieve
import zipfile

In [2]:
DRIVE_PATH = '/content/drive'
BASE_PATH = '/MyDrive/MachineLearningTP/'
GLOVE_PATH = 'glove/'

from google.colab import drive
drive.mount(DRIVE_PATH)
BASE_PATH = DRIVE_PATH + BASE_PATH

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


* Download File: GloVe6B.zip - glove100d

In [3]:
filepath = BASE_PATH + GLOVE_PATH + 'glove.6B.100d.txt'
zippath = BASE_PATH + GLOVE_PATH + 'glove.6B.zip'

if os.path.exists(filepath) is False:
    if os.path.exists(BASE_PATH + GLOVE_PATH) is False:
        os.mkdir(BASE_PATH + GLOVE_PATH)
    
    if os.path.exists(zippath) is False:
        urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", filename=zippath)

In [4]:
# Extract all
if os.path.exists(filepath) is False:
    zf = zipfile.ZipFile(zippath)
    zf.extractall(BASE_PATH + GLOVE_PATH) 
    zf.close()

# Library installation
* NLTK - Natural Language toolkit
* NetworkX - Structure, Dynamics, and Functions of complex networks Library

In [5]:
!python -m pip install nltk
!python -m pip install networkx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


NLTK: Library for NLP Process
* Usage:
    * nltk.corpus.**stopwords**: stopwords of specific language
    * nltk.tokenize.**RegexpTokenizer, sent_tokenize, word_tokenize**: Tokenize the input sentences
    * nltk.stem.**WordNetLemmatizer**: Lemmatize the word net

In [6]:
import nltk
nltk.download('all')

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Pac

In [7]:
import networkx as nx

# Redial Parser
A separated library for parsing the redial dataset

class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [8]:
def load_data(path):
    """
    TODO: initialization function for dataset reads

        :arg
            path (str): Dataset path.
        :return
            tuple: (train, test, df_mention)
    """
    train_data = []
    for line in open(f"{path}/train_data.jsonl", "r"):
        train_data.append(json.loads(line))

    test_data = []
    for line in open(f"{path}/test_data.jsonl", "r"):
        test_data.append(json.loads(line))

    mention_dataframe = pd.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe



def predict_rating(grouped_arr, word_sim_arr):
    ratings_pred = grouped_arr.dot(word_sim_arr) / np.array([np.abs(word_sim_arr).sum(axis=1)])
    return ratings_pred


class RedialParser:
    def __init__(self, path):
        self.train, self.test, self.movie = load_data(path)

        self._global_movie_list = None  # list of all movie names (global movie name data)
        self._global_msg_list = None  # list of whole lines (global line data)
        self._local_movie_list = None  # list of movie names (local movie name data)
        self._local_msg_list = None  # list of lines (local line data)

        self.dialog_df = None  # Sum of dialogs for each movie indices

        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        # Import 100-D GloVe Embedding Vector
        self.__glove_dict = dict()
        f = open(BASE_PATH + GLOVE_PATH + 'glove.6B.100d.txt', encoding="utf8")

        for line in f:
            word_vector = line.split()
            word = word_vector[0]
            word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
            self.__glove_dict[word] = word_vector_arr
        f.close()


    def Restore(self):
        """
        TODO: Restore train, test, and movie dataset to initial state
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)


    def Movies(self, train=True) -> dict:
        """
        TODO: Get movie list in dataset

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}

        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result


    def describe(self):
        """
        TODO: Describe its datasets
        """
        len1, len2 = len(self.train), len(self.test)
        n1, n2 = 0, 0
        m1, m2 = 0, 0

        for e in self.train:
            n1 += len(e['movieMentions'])
            m1 += len(e['messages'])
        for e in self.test:
            n2 += len(e['movieMentions'])
            m2 += len(e['messages'])

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')
    

    def preprocessing(self):
        """
        TODO: Regroup train dataset into purposed structure and clean up data
        """
        
        ran = range(len(self.train))

        # initialize list
        self._global_movie_list = []
        self._global_msg_list = []
        self._local_movie_list = [[] for _ in ran]
        self._local_msg_list = [[] for _ in ran]

        for i, data in enumerate(self.train):
            for msg in data['messages']:  # append line to the lists
                self._local_msg_list[i].append(msg['text'])
                self._global_msg_list.append(msg['text'])

            # Extract movie indices
            for idx, line in enumerate(self._local_msg_list[i]):
                numbers = re.findall(r'@\d+', line)  # find number keywords (ex: @12345)
                for number in numbers:
                    self._local_movie_list[i].append(number[1:])
                    self._global_movie_list.append(number[1:])

                    # Remove index string
                    pos = line.index(number)
                    line = self._local_msg_list[i][idx] = line[0: pos] + line[pos + len(number): len(line)]

        # Construct dialog dataframe
        self.dialog_df = pd.DataFrame(columns=["movieid", "dialog"])

        for lines, movies in zip(self._local_msg_list, self._local_movie_list):
            dig = ''
            for line in lines:  # concatenate all sentences in related message dialog
                dig += ' ' + str(line)
            
            # Append dialog related to movies
            for mv in movies:  # No such as movie: add new row
                if self.dialog_df[self.dialog_df['movieid'] == mv].empty:
                    newrow = pd.DataFrame({'movieid': [mv], 'dialog': [dig]}, columns=self.dialog_df.columns)
                    self.dialog_df = pd.concat([self.dialog_df, newrow], ignore_index=True)
                else:  # else append
                    target = self.dialog_df[self.dialog_df['movieid'] == mv].index[0]
                    self.dialog_df.iloc[target, 1] = self.dialog_df.iloc[target, 1] + ' ' + dig
        
        # Drop NaN with empty sentence
        self.dialog_df['dialog'].dropna(how='any', inplace=True)

    
    def make_summary(self):
        """
        TODO: make summary of dialog using GloVe + TextRank
        """
        self.dialog_df['sentences'] = self.dialog_df['dialog'].apply(sent_tokenize)
        
        sentence_max = 100

        def resize_sentence(sentences):
            slice_num = len(sentences) // sentence_max
            if len(sentences) % sentence_max:
                slice_num += 1
            sentences = sentences[: : slice_num]

            return sentences
        
        self.dialog_df['sentences'] = self.dialog_df['sentences'].apply(resize_sentence)

        stop_words = stopwords.words('english')

        # tokenization
        def tokenization(sentences):
            return [word_tokenize(sentence) for sentence in sentences]

        # Preprocessing
        def preprocess_sentence(sentence):
            # lower case
            sentence = [re.sub(r'[^a-zA-z\s]', '', word).lower() for word in sentence]
            # remove stopwords
            return [word for word in sentence if word not in stop_words and word]

        # Do preproessing for all sentences
        def preprocess_sentences(sentences):
            return [preprocess_sentence(sentence) for sentence in sentences]

        self.dialog_df['tokenized_sentences'] = self.dialog_df['sentences'].apply(tokenization)
        self.dialog_df['tokenized_sentences'] = self.dialog_df['tokenized_sentences'].apply(preprocess_sentences)

        # Embedding Dimension = 100 = GloVe dimension
        embedding_dim = 100
        zero_vector = np.zeros(embedding_dim)

        # Obtain the sentence vector from the mean of words
        def calculate_sentence_vector(sentence):
            if len(sentence) != 0:
                return sum([self.__glove_dict.get(word, zero_vector) for word in sentence]) / len(sentence)
            else:
                return zero_vector
        
        def sentences_to_vectors(sentences):
            return [calculate_sentence_vector(sentence) for sentence in sentences]

        # Do sentence embedding
        self.dialog_df['SentenceEmbedding'] = self.dialog_df['tokenized_sentences'].apply(sentences_to_vectors)
        self.dialog_df[['SentenceEmbedding']]

        def similarity_matrix(sentence_embedding):
            length = len(sentence_embedding)
            sim_mat = np.zeros([length, length])

            for i in range(length):
                for j in range(length):
                    sim_mat[i][j] = cosine_similarity(sentence_embedding[i].reshape(1, embedding_dim), sentence_embedding[j].reshape(1, embedding_dim))[0, 0]
            return sim_mat
        
        # Get similarity matrix
        self.dialog_df['SimMatrix'] = self.dialog_df['SentenceEmbedding'].apply(similarity_matrix)

        # TextRank
        def calculate_score(sim_matrix):
            nx_graph = nx.from_numpy_array(sim_matrix)
            scores = nx.pagerank_numpy(nx_graph)
            return scores
        
        self.dialog_df['score'] = self.dialog_df['SimMatrix'].apply(calculate_score)
        
        # Write summary using TextRank score
        def ranked_sentences(sentences, scores, n=3):
            top_scores = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
            top_n_sentences = [sentence for score, sentence in top_scores[:n]]
            return " ".join(top_n_sentences)

        self.dialog_df['summary'] = self.dialog_df.apply(lambda x: ranked_sentences(x.sentences, x.score), axis=1)
    

    def get_frequency_matrix(self, tags):
        """
        TODO: compute the frequency of tag words to obtain the TF-IDFs matrix

            :arg
                tags (list): list of key words.
            :return
                pandas.DataFrame: frequency matrix of tag words.
        """
        stop_word_eng = set(stopwords.words('english'))
        ran = range(len(self.train))

        msg_list = deepcopy(self._local_msg_list)

        for i in ran:
            msg_list[i] = [j for j in msg_list[i] if j not in stop_word_eng]  # Clear stopwords

        # Lemmatizer class
        lemmatizer = WordNetLemmatizer()
        token = RegexpTokenizer('[\w]+')

        x = pd.DataFrame(columns=['id'] + tags)

        for idx, msg in enumerate(msg_list):
            result_pre_lem = [token.tokenize(j) for j in msg]
            middle_pre_lem = [r for j in result_pre_lem for r in j]
            final_lem = [lemmatizer.lemmatize(j) for j in middle_pre_lem if not j in stop_word_eng]  # Remove stopword

            # Lemmatization
            english = pd.Series(final_lem)
            for word in english:
                if word in tags:
                    for movie in self._local_movie_list[idx]:
                        if x[x['id'] == movie].empty:
                            new_row = pd.DataFrame({'id': [movie]}, columns=x.columns)
                            x = pd.concat([x, new_row], ignore_index=True)
                            x.fillna(0, inplace=True)
                        x.loc[x['id'] == movie, word] += 1

        return x
    
    def get_tfidf_matrix(self, **tfidf_keys):
        """
        TODO: Compute TF-IDFs matrix

            :arg
                tfidf_keys(keyword dict): TfidfVectorizer parameters
            :return
                numpy.ndrarry: TF-IDFs matrix
                numpy.ndarray: feature name of TF-IDFs (word)
        """
        # Vectorizer class
        tfidf = TfidfVectorizer(**tfidf_keys)  # Ignore English Stopwords

        # Obtain matrix
        tfidf_df = tfidf.fit_transform(self.dialog_df['dialog'])

        return tfidf_df.toarray(), tfidf.get_feature_names_out()


    def create_evaluation_matrix(self):
        """
        TODO: generate evaluation matrix (response of movie in actors)

          :args
              None
          :return
              pandas.DataFrame: evaluation matrix
        """
        result = []
        for dialog in self.train:
          for respondent in dialog['respondentQuestions']:
            result.append([respondent,
                        float(dialog['respondentQuestions'][respondent]['suggested']+dialog['respondentQuestions'][respondent]['seen']),
                        float(dialog['respondentQuestions'][respondent]['seen']),
                        float(dialog['respondentQuestions'][respondent]['suggested']+dialog['respondentQuestions'][respondent]['seen']+dialog['respondentQuestions'][respondent]['liked'])])
          for intq in dialog['initiatorQuestions']:
            result.append([intq,
                       float(dialog['initiatorQuestions'][intq]['suggested']+dialog['initiatorQuestions'][intq]['seen']),
                       float(dialog['initiatorQuestions'][intq]['seen']),
                       float(dialog['initiatorQuestions'][intq]['suggested']+dialog['initiatorQuestions'][intq]['seen']+dialog['initiatorQuestions'][intq]['liked'])])
        
        test_data = pd.DataFrame(result, columns=["movie_id","suggested","seen","liked"] )
        print(test_data.values[0])
        grouped_data = test_data.groupby('movie_id').mean()
        scaler = StandardScaler()
        grouped_data = pd.DataFrame(scaler.fit_transform(grouped_data), columns = ["suggested","seen","liked"])
        print(grouped_data.values[0])
        reformat = []
        for data in grouped_data.values:
          reformat.append([data[0]*data[1], data[1], data[1]*data[2]])

        test_grouped = pd.DataFrame(reformat, columns= ["suggested","seen","liked"], index = grouped_data.index)

        eval_sim = cosine_similarity(test_grouped, test_grouped)
        test_sim = pd.DataFrame(eval_sim, index = test_grouped.index, columns=test_grouped.index)

        test_pred = predict_rating(test_grouped.transpose().values, test_sim.values)
        test_pred = pd.DataFrame(test_pred, index = test_grouped.transpose().index, columns=test_grouped.transpose().columns)
        test_data = test_pred.transpose()

        return test_data


# Initialize
Import dataset, describe it briefly.

In [9]:
parser = RedialParser(BASE_PATH + 'dataset')
parser.describe()  # Describe read dataset

# Size of train data
num = len(parser.train)
print(f'length of train dataset: {num}')

Brief information:
Length of train data: 10006
Length of test data: 1342

Data information:
Key parameters: ['movieMentions', 'respondentQuestions', 'messages', 'conversationId', 'respondentWorkerId', 'initiatorWorkerId', 'initiatorQuestions']
Key parameters in Questions: ['suggested', 'seen', 'liked']
Key parameters in messages: ['timeOffset', 'text', 'senderWorkerId', 'messageId']

Context information:
Total mentioned movie number (train): 52918
Total mentioned movie number in unique (train): 6223
Total message number (train): 182150
Total mentioned movie number (test): 7154
Total mentioned movie number in unique (test): 2007
Total message number (test): 23952
Average mentioned movie numbers per conversation (train): 5.288626823905656
Average message numbers per conversation (train): 18.20407755346792
Average mentioned movie numbers per conversation (test): 5.330849478390462
Average message numbers per conversation (test): 17.847988077496275

length of train dataset: 10006


# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [i like animations like  and ], [84779, 191602]


Specific:
* Transform dataset structure.
    * Original: [movieMentions, {messages}, conversationId, ...]
    * Transformed: [movie_indices], [message_contexts], [[1st_movie_index], [2nd_...], ...], [[1st_message_context], [2nd_...], ...]
    * Dialog Dataframe (*self.dialog_df*): {'movie_id': '1st message' + '2nd message' + ...} - Used in generation of **TF-IDF** matrix
* Recognize movie indices
    * **@** recognition: use re library's *findall(@\d+)* function, it only detects '@' + index strings.
* Clean up meaningless values
    * Special characters: use re library's format *\w+*, it only receives widechar characters.
    * Movie index: remove context of them by using text slicing.

In [10]:
parser.preprocessing()
parser.dialog_df

Unnamed: 0,movieid,dialog
0,84779,"Hi there, how are you? I'm looking for movie ..."
1,191602,"Hi there, how are you? I'm looking for movie ..."
2,122159,"Hi there, how are you? I'm looking for movie ..."
3,165710,"Hi there, how are you? I'm looking for movie ..."
4,151313,"Hi there, how are you? I'm looking for movie ..."
...,...,...
6217,166377,Hi Hello there I LIKE SCI-FI genetic modifica...
6218,205981,What kind of movies do you like ? hello! I am...
6219,106113,"hi HI, I like Sci-fi movies Genetic modificat..."
6220,96852,hi Hi !! have a good day which kind of movie ...


* Add dirty data

In [11]:
# Copy original
original_df = parser.dialog_df.copy()

In [12]:
dirty_row = pd.DataFrame({
    'movieid': ['999995', '999996', '999997', '999998', '999999'],
    'dialog': ['hi avenger right right hero care care thanks right hero hero hero help help pretty planning captain movie recommendations  am fiance super super super of movies hero avenger  hero i captain avenger nigh night and pretty super enjoy hero anything  pretty super might super hero super was a good pretty what s avenger super avenger great super  avenger it avenger about a baby avenger works for a company and gets  adopted it hero avenger funny avenger seems avenger amazing amazing a obsessed hero favorite pretty have hero animated  recommendations amer hero hero action captain avenger captain hero hero america like comedies hero i hero hero avenger a avenger more depth that is a tough one but i will remember  captain captain was hero good one action thanks seems cool thanks for the avenger avenger ready avenger hero if hero are hero end animated great firestick animated animated hero captain action glad  captain captain i could help nice take care hero avenger',
    'comedy scary love animation artistic war sci blood hero romantic action recommendation happy fine animation artistic war sci blood hero romantic action comedy scary love comedy scary love war war war hero romantic action comedy hero romantic action comedy',
    'this is a hero movie that kids like to show there is many heros with sci fi mechanism and lot of kids likes it much and also parents are liked it too I will gald to introduct that thank you for listening whatsup other recommendation is animation it has robot character that is cute all group of ages liked this movie',
    'funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny',
    'savior hero savior action savior action savior savior savior savior savior action hero savior savior savior action hero savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior action savior savior savior savior savior hero savior savior savior savior savior hero savior savior savior savior savior']
})
dirty_row

Unnamed: 0,movieid,dialog
0,999995,hi avenger right right hero care care thanks r...
1,999996,comedy scary love animation artistic war sci b...
2,999997,this is a hero movie that kids like to show th...
3,999998,funny funny funny funny funny funny funny funn...
4,999999,savior hero savior action savior action savior...


In [13]:
parser.dialog_df = pd.concat([parser.dialog_df, dirty_row], ignore_index=True)
parser.dialog_df.tail(10)

Unnamed: 0,movieid,dialog
6217,166377,Hi Hello there I LIKE SCI-FI genetic modifica...
6218,205981,What kind of movies do you like ? hello! I am...
6219,106113,"hi HI, I like Sci-fi movies Genetic modificat..."
6220,96852,hi Hi !! have a good day which kind of movie ...
6221,200018,Hello! hi how can i help you So some of the m...
6222,999995,hi avenger right right hero care care thanks r...
6223,999996,comedy scary love animation artistic war sci b...
6224,999997,this is a hero movie that kids like to show th...
6225,999998,funny funny funny funny funny funny funny funn...
6226,999999,savior hero savior action savior action savior...


# Tokenization
* 1. Obtain keywords (summary) using **TextRank**

In [14]:
parser.make_summary()
parser.dialog_df



Unnamed: 0,movieid,dialog,sentences,tokenized_sentences,SentenceEmbedding,SimMatrix,score,summary
0,84779,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629614, 1: 0.1115491066609116...","It is animated, sci fi, and has action Glad I ..."
1,191602,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629614, 1: 0.1115491066609116...","It is animated, sci fi, and has action Glad I ..."
2,122159,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.06536974281629614, 1: 0.1115491066609116...","It is animated, sci fi, and has action Glad I ..."
3,165710,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I will watch the two...","[[hi], [watch, two, suggested, thanks, help], ...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.2053011804819107, 0.26...","{0: 0.0061362015754457076, 1: 0.01164375448799...",Thank you for a great suggestion your welcome ...
4,151313,"Hi there, how are you? I'm looking for movie ...","[ Hi there, how are you?, I'm looking for movi...","[[hi], [looking, movie, recommendations, okay]...","[[0.1444, 0.23979, 0.96693, 0.31629, -0.36064,...","[[0.9999999403953552, 0.24245616793632507, 0.2...","{0: 0.005749033404559646, 1: 0.012240076161428...",Like for example I like comedies but I prefer...
...,...,...,...,...,...,...,...,...
6222,999995,hi avenger right right hero care care thanks r...,[hi avenger right right hero care care thanks ...,"[[hi, avenger, right, right, hero, care, care,...","[[-0.1498504381828992, 0.28276538761159364, 0....",[[1.0]],{0: 1.0},hi avenger right right hero care care thanks r...
6223,999996,comedy scary love animation artistic war sci b...,[comedy scary love animation artistic war sci ...,"[[comedy, scary, love, animation, artistic, wa...","[[0.1281213, 0.3039027, 0.19759789, -0.1118190...",[[1.0]],{0: 1.0},comedy scary love animation artistic war sci b...
6224,999997,this is a hero movie that kids like to show th...,[this is a hero movie that kids like to show t...,"[[hero, movie, kids, like, show, many, heros, ...","[[0.05286822957737792, 0.2532441115247146, 0.2...",[[0.9999999999999999]],{0: 1.0},this is a hero movie that kids like to show th...
6225,999998,funny funny funny funny funny funny funny funn...,[funny funny funny funny funny funny funny fun...,"[[funny, funny, funny, funny, funny, funny, fu...","[[0.17601977, 0.20096013, 0.3233499, -0.905129...",[[1.0]],{0: 1.0},funny funny funny funny funny funny funny funn...


* 2. Extract words and their counts related to the movies. (Did not used, only for eye inspection.)

In [15]:
# Tag words words related with movie genres
mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
frequency = parser.get_frequency_matrix(mv_tags)
frequency.describe()

Unnamed: 0,comedy,scary,love,animation,artistic,war,sci,blood,hero,romantic,action
count,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0,5036.0
mean,5.68189,1.295075,7.517871,0.19579,0.006553,0.447975,0.828634,0.056394,0.294281,1.016878,3.796267
std,21.056695,9.917951,19.831492,1.552968,0.092181,3.744135,5.195543,0.439369,2.343911,5.782513,18.47827
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,437.0,482.0,355.0,64.0,2.0,159.0,143.0,15.0,78.0,139.0,648.0


* 3. TF-IDF

In [16]:
tfidf_mat, tfidf_columns = parser.get_tfidf_matrix(stop_words='english', min_df=0.2)

# Construct dataset with id + word vectors
cdata = np.concatenate((parser.dialog_df['movieid'].to_numpy().reshape(len(parser.dialog_df['dialog']), 1), tfidf_mat), axis=1)
df_mv_tfidf = pd.DataFrame(cdata, columns=['id'] + tfidf_columns.tolist())
df_mv_tfidf

Unnamed: 0,id,action,actually,amazing,awesome,best,better,bit,bye,care,...,type,ve,want,watch,watched,watching,welcome,wow,yeah,yes
0,84779,0.152272,0.0,0.0,0.0,0.0,0.0,0.18685,0.097698,0.190151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,191602,0.152272,0.0,0.0,0.0,0.0,0.0,0.18685,0.097698,0.190151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,122159,0.152272,0.0,0.0,0.0,0.0,0.0,0.18685,0.097698,0.190151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,165710,0.031564,0.042183,0.02663,0.063156,0.021964,0.011709,0.014897,0.146433,0.027288,...,0.032767,0.066669,0.061103,0.156531,0.136771,0.062124,0.05433,0.026946,0.049653,0.191643
4,151313,0.066929,0.0,0.0,0.100438,0.037841,0.0,0.041063,0.085883,0.083578,...,0.0,0.0,0.0,0.093801,0.0,0.214061,0.0,0.123797,0.0,0.135455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6222,999995,0.323312,0.0,0.262673,0.0,0.0,0.0,0.0,0.0,0.403738,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6223,999996,0.606128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6224,999997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6225,999998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* 4. dataframe for collaborative filtering

In [17]:
df_grouped = df_mv_tfidf.groupby('id').mean()
word_sim = cosine_similarity(df_grouped, df_grouped)
df_word_sim = pd.DataFrame(word_sim, index = df_grouped.index, columns=df_grouped.index)
df_word_sim

id,100026,100030,100043,100070,100074,100106,100165,100178,100183,100228,...,99910,99955,99966,99975,99998,999995,999996,999997,999998,999999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100026,1.000000,0.542321,0.608133,0.546845,0.280405,0.531516,0.279034,0.377083,0.479393,0.225438,...,0.524443,0.368643,0.304846,0.451689,0.434968,0.141831,0.000000,0.293743,0.000000,0.000000
100030,0.542321,1.000000,0.776523,0.761692,0.486813,0.584923,0.449072,0.677836,0.619194,0.532792,...,0.928487,0.495527,0.588558,0.623331,0.529705,0.256232,0.215221,0.311520,0.260420,0.005697
100043,0.608133,0.776523,1.000000,0.660798,0.409602,0.555285,0.391301,0.613835,0.575966,0.307250,...,0.756379,0.501280,0.565665,0.638183,0.480661,0.304791,0.066441,0.279956,0.096827,0.050837
100070,0.546845,0.761692,0.660798,1.000000,0.497534,0.545257,0.383621,0.626246,0.605455,0.349492,...,0.764941,0.563824,0.444384,0.566237,0.599330,0.331326,0.264836,0.371569,0.000000,0.322642
100074,0.280405,0.486813,0.409602,0.497534,1.000000,0.411132,0.301201,0.454460,0.485451,0.366657,...,0.481829,0.327325,0.295315,0.450847,0.399494,0.156393,0.000000,0.091070,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.141831,0.256232,0.304791,0.331326,0.156393,0.175043,0.117345,0.238889,0.161167,0.109692,...,0.285420,0.310509,0.167002,0.152611,0.219145,1.000000,0.195968,0.042358,0.102633,0.323312
999996,0.000000,0.215221,0.066441,0.264836,0.000000,0.000000,0.000000,0.171947,0.000000,0.415263,...,0.216371,0.167861,0.151647,0.034300,0.088853,0.195968,1.000000,0.000000,0.000000,0.606128
999997,0.293743,0.311520,0.279956,0.371569,0.091070,0.122420,0.145514,0.128161,0.343414,0.245254,...,0.290915,0.070909,0.091226,0.233425,0.275175,0.042358,0.000000,1.000000,0.000000,0.000000
999998,0.000000,0.260420,0.096827,0.000000,0.000000,0.000000,0.000000,0.000000,0.104871,0.243638,...,0.287973,0.000000,0.000000,0.000000,0.000000,0.102633,0.000000,0.000000,1.000000,0.000000


In [26]:
word_pred = predict_rating(df_grouped.transpose().values, df_word_sim.values)
df_word_pred = pd.DataFrame(word_pred, index = df_grouped.transpose().index, columns=df_grouped.transpose().columns)
collab_data = df_word_pred.transpose()
collab_data

Unnamed: 0_level_0,action,actually,amazing,awesome,best,better,bit,bye,care,chatting,...,type,ve,want,watch,watched,watching,welcome,wow,yeah,yes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100026,0.052417,0.023120,0.016082,0.029664,0.019013,0.017303,0.016094,0.097507,0.014585,0.014160,...,0.031191,0.062063,0.034315,0.086226,0.037104,0.022837,0.032882,0.015532,0.044723,0.098661
100030,0.051897,0.023515,0.016118,0.029084,0.019233,0.017407,0.016096,0.096605,0.014634,0.014433,...,0.031496,0.062021,0.032772,0.084012,0.037214,0.023113,0.032659,0.015573,0.045490,0.100022
100043,0.053282,0.023389,0.016310,0.029244,0.019369,0.017607,0.016059,0.097111,0.014763,0.014713,...,0.031263,0.062507,0.032828,0.083776,0.037940,0.022939,0.032599,0.015962,0.045543,0.101342
100070,0.057596,0.023456,0.016266,0.029793,0.019162,0.017500,0.016138,0.096516,0.014717,0.014203,...,0.031489,0.061924,0.032787,0.084961,0.036932,0.022898,0.032650,0.015853,0.045552,0.098395
100074,0.052490,0.023705,0.016516,0.030930,0.018871,0.017111,0.016291,0.095433,0.014763,0.014423,...,0.031129,0.063062,0.032425,0.084861,0.038973,0.024399,0.032552,0.016965,0.046814,0.099071
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.066637,0.023303,0.018897,0.030354,0.019173,0.017624,0.016101,0.095760,0.018498,0.014414,...,0.031022,0.060750,0.032684,0.083961,0.037095,0.023325,0.032539,0.015963,0.045661,0.096731
999996,0.110326,0.022841,0.016847,0.031759,0.018485,0.016833,0.014988,0.094505,0.015898,0.015550,...,0.031035,0.059331,0.034574,0.082875,0.035517,0.022668,0.032929,0.014761,0.044396,0.096726
999997,0.053481,0.024143,0.016134,0.028119,0.019307,0.017847,0.016343,0.095806,0.013934,0.014276,...,0.032832,0.060335,0.033588,0.083709,0.037247,0.022612,0.031883,0.015763,0.044717,0.100321
999998,0.037859,0.024986,0.013620,0.030279,0.017920,0.017152,0.015492,0.092704,0.015369,0.014756,...,0.029449,0.058743,0.032946,0.081039,0.036228,0.021459,0.032635,0.013455,0.043791,0.101654


# Similarity Metrics
* Cosine similarity

In [19]:
# Note: the consine similarity function's denominator has 1e-7 minimum value to avoid the divbyzero.
c_sim = lambda X, Y: np.dot(X, Y) / (1e-7 + norm(X) * norm(Y))

Recommendation function
* param:
    * data: array, vector space of texts.
    * mv: target movie's index
    * length: maximum length of recommendation
        * default: 5
    * simf: similarity meterices
        * default: cosine similarity

In [20]:
def recommend(df, index, matrix, length=5, simf=c_sim):
    sim = []

    if df[df['movieid'] == str(index)].empty:
        return sim

    target = df[df['movieid'] == str(index)].index[0]

    for idx, data in enumerate(matrix):
        if idx != target:
            sim.append([simf(data, matrix[target]), df.iloc[idx, 0]])
    
    sim.sort()
    sim.reverse()
    return sim[:length]

In [21]:
pd.DataFrame(recommend(parser.dialog_df, 80067, tfidf_mat, 10), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,0.981762,140749
1,0.981448,159885
2,0.979246,81792
3,0.978519,122604
4,0.978135,182731
5,0.977702,82894
6,0.977218,133249
7,0.976967,154844
8,0.972751,101264
9,0.972531,90950


In [22]:
pd.DataFrame(recommend(parser.dialog_df, 80067, collab_data.values, 10), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,1.0,98281
1,0.999885,112455
2,0.999882,198918
3,0.999882,166413
4,0.999881,106625
5,0.999873,135834
6,0.999872,136190
7,0.99987,180648
8,0.999869,114256
9,0.999869,118888


# Evaluation with test data

In [23]:
test_df = parser.create_evaluation_matrix()
test_df

['203371' 1.0 0.0 2.0]
[1.33668976 0.74041903 0.76857219]


Unnamed: 0,suggested,seen,liked
0,0.992990,0.419493,0.956719
1,-0.610773,0.695204,-0.535894
2,0.028872,0.702101,0.041611
3,0.634794,-0.687416,0.568101
4,0.690525,-0.694168,0.543834
...,...,...,...
6203,-0.011038,0.708648,0.025419
6204,-0.028872,-0.702101,-0.041611
6205,1.070435,0.312572,1.094999
6206,-0.690525,0.694168,-0.543834


In [25]:
pd.DataFrame(recommend(parser.dialog_df, 80067, test_df.values, 30), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,1.0,93091
1,1.0,92480
2,1.0,87094
3,1.0,86104
4,1.0,84561
5,1.0,84153
6,1.0,77140
7,1.0,200318
8,1.0,197932
9,1.0,195575
