# Libraries

In [1]:
# Basic libraries
import numpy as np
import pandas as pd

import itertools

# Dataset imports
import json

# For restoring the dataset
from copy import deepcopy

# Text manipulations
import re

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Cosine similarity
from numpy.linalg import norm

# File download (GloVe)
from urllib.request import urlretrieve
import zipfile

  from .autonotebook import tqdm as notebook_tqdm


# Library installation
* NLTK - Natural Language toolkit
* NetworkX - Structure, Dynamics, and Functions of complex networks Library

In [2]:
!python -m pip install nltk
!python -m pip install networkx



NLTK: Library for NLP Process
* Usage:
    * nltk.corpus.**stopwords**: stopwords of specific language
    * nltk.tokenize.**RegexpTokenizer, sent_tokenize, word_tokenize**: Tokenize the input sentences
    * nltk.stem.**WordNetLemmatizer**: Lemmatize words using WordNet

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

NetworkX: Library for PageRank(TextRank)

In [4]:
import networkx as nx

# Redial Parser
A separate helper class for parsing the ReDial dataset

class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [5]:
def _read_jsonl(file_path):
    """Read a JSON Lines file and return its decoded records as a list.

    The file is opened as UTF-8 and closed deterministically; blank lines are
    skipped so a trailing newline does not break decoding.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def load_data(path):
    """
    Read the dataset files found under ``path``.

        :arg
            path (str): Dataset directory. It must contain ``train_data.jsonl``,
                ``test_data.jsonl`` and ``movies_with_mentions.csv``.
        :return
            tuple: (train, test, df_mention) where ``train`` and ``test`` are lists of
                decoded JSON records and ``df_mention`` is a pandas.DataFrame read
                from the CSV file.
    """
    train_data = _read_jsonl(f"{path}/train_data.jsonl")
    test_data = _read_jsonl(f"{path}/test_data.jsonl")

    mention_dataframe = pd.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe


class RedialParser:
    """Loader and text pre-processor for the ReDial dataset.

    Public attributes:
        train (list): train conversations as decoded JSON records.
        test (list): test conversations as decoded JSON records.
        movie (pandas.DataFrame): contents of ``movies_with_mentions.csv``.
        dialog_df (pandas.DataFrame | None): filled by ``preprocessing()``; one row
            per movie mention with columns ``movieid`` and ``dialog``.
    """

    # "@<digits>" movie reference tokens embedded in message text (ex: @12345)
    _MOVIE_REF = re.compile(r'@\d+')
    # Runs of non-word characters; each run is replaced by one space when cleaning
    _NON_WORD = re.compile(r'\W+')

    def __init__(self, path):
        """Load the dataset under ``path`` and keep a pristine copy for ``Restore()``."""
        self.train, self.test, self.movie = load_data(path)

        self._global_movie_list = None  # list of all movie ids (global movie data)
        self._global_msg_list = None  # list of all raw message lines (global line data)
        self._local_movie_list = None  # per-conversation list of movie ids
        self._local_msg_list = None  # per-conversation list of cleaned message lines

        self.dialog_df = None  # concatenated dialog text for each mentioned movie id

        # Untouched copies used by Restore()
        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        self.__model = None

    def Restore(self):
        """
        Restore train, test, and movie dataset to the state they had right after loading.
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)

    def Movies(self, train=True) -> dict:
        """
        Collect the ``movieMentions`` entries of every conversation in a dataset.

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: merged ``movieMentions`` of the selected conversations. Any other
                    value of ``train`` yields an empty dict.
        """
        if train is None:
            merged = self.Movies(True)
            merged.update(self.Movies(False))
            return merged

        if train is True:
            conversations = self.train
        elif train is False:
            conversations = self.test
        else:
            conversations = ()

        result = {}
        for conversation in conversations:
            result.update(conversation['movieMentions'])

        return result

    def describe(self):
        """
        Print a short summary of the train and test datasets.

        Indexes ``self.train[0]`` and divides by the dataset lengths, so both
        datasets must be non-empty.
        """
        len1, len2 = len(self.train), len(self.test)

        n1 = sum(len(e['movieMentions']) for e in self.train)
        m1 = sum(len(e['messages']) for e in self.train)
        n2 = sum(len(e['movieMentions']) for e in self.test)
        m2 = sum(len(e['messages']) for e in self.test)

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')

    def _require_preprocessed(self):
        """Fail with a clear message when ``preprocessing()`` has not been run yet."""
        if self.dialog_df is None or self._local_msg_list is None:
            raise RuntimeError('RedialParser.preprocessing() must be called first')

    def preprocessing(self):
        """
        Regroup the train dataset into flat lists and build ``self.dialog_df``.

        For every message the ``@<digits>`` movie references are extracted and removed,
        runs of non-word characters are replaced by a single space and the text is
        lower-cased. ``self.dialog_df`` gets one row per extracted reference, holding
        the movie id and the concatenated cleaned messages of its conversation.
        """
        self._global_movie_list = []
        self._global_msg_list = []
        self._local_movie_list = []
        self._local_msg_list = []

        for conversation in self.train:
            movie_ids = []
            cleaned_lines = []

            for msg in conversation['messages']:
                raw_text = msg['text']
                self._global_msg_list.append(raw_text)  # global list keeps the raw text

                # Extract movie indices (ex: @12345 -> "12345") and drop them from the text
                movie_ids.extend(ref[1:] for ref in self._MOVIE_REF.findall(raw_text))
                without_refs = self._MOVIE_REF.sub('', raw_text)

                # Clear special characters, then lower-case
                cleaned_lines.append(self._NON_WORD.sub(' ', without_refs).lower())

            self._local_movie_list.append(movie_ids)
            self._local_msg_list.append(cleaned_lines)
            self._global_movie_list.extend(movie_ids)

        # Construct the dialog dataframe in one go; appending row by row with
        # pd.concat copies the whole frame on every iteration.
        records = []
        for lines, movies in zip(self._local_msg_list, self._local_movie_list):
            # concatenate all sentences of the conversation (each prefixed by a space)
            dialog = ''.join(' ' + str(line) for line in lines)
            records.extend({'movieid': mv, 'dialog': dialog} for mv in movies)

        self.dialog_df = pd.DataFrame(records, columns=["movieid", "dialog"])

        # Fill NaN with empty sentence
        self.dialog_df['dialog'] = self.dialog_df['dialog'].fillna('')

    def get_frequency_matrix(self, tags):
        """
        Count, per movie id, how often each tag word occurs in the conversations
        that mention the movie.

        Messages are tokenized on word characters, English stopwords are dropped and
        the remaining tokens are lemmatized before being compared with ``tags``. A
        movie mentioned several times in one conversation is counted once per mention.
        Only movies with at least one tag hit get a row.

            :arg
                tags (list): list of key words.
            :return
                pandas.DataFrame: columns ``['id'] + tags``; one row per movie id.
            :raise
                RuntimeError: if ``preprocessing()`` has not been called.
        """
        self._require_preprocessed()

        stop_word_eng = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        tokenizer = RegexpTokenizer(r'[\w]+')

        tag_list = list(tags)
        tag_set = set(tag_list)

        # movie id -> {tag: count}; dicts keep insertion order, so rows come out in
        # order of first appearance
        counts = {}

        for messages, movies in zip(self._local_msg_list, self._local_movie_list):
            # Tag hits inside this conversation
            hits = {}
            for message in messages:
                if message in stop_word_eng:  # whole message is a stopword
                    continue
                for tok in tokenizer.tokenize(message):
                    if tok in stop_word_eng:
                        continue
                    lemma = lemmatizer.lemmatize(tok)
                    if lemma in tag_set:
                        hits[lemma] = hits.get(lemma, 0) + 1

            if not hits:
                continue

            for movie in movies:
                row = counts.get(movie)
                if row is None:
                    row = counts[movie] = dict.fromkeys(tag_list, 0)
                for tag, occurrences in hits.items():
                    row[tag] += occurrences

        records = [{'id': movie, **row} for movie, row in counts.items()]
        return pd.DataFrame(records, columns=['id'] + tag_list)

    def get_tfidf_matrix(self, **tfidf_keys):
        """
        Compute the TF-IDF matrix of the ``dialog`` column of ``self.dialog_df``.

            :arg
                tfidf_keys(keyword dict): TfidfVectorizer parameters
            :return
                numpy.ndarray: dense TF-IDF matrix
                numpy.ndarray: feature names of the TF-IDF matrix (words)
            :raise
                RuntimeError: if ``preprocessing()`` has not been called.
        """
        self._require_preprocessed()

        vectorizer = TfidfVectorizer(**tfidf_keys)
        tfidf = vectorizer.fit_transform(self.dialog_df['dialog'])

        return tfidf.toarray(), vectorizer.get_feature_names_out()

    def get_textrank_matrix(self, glove_path='../train/glove.6B.100d.txt', top_n=3):
        """
        Summarize every dialog with TextRank over GloVe sentence embeddings.

        Each dialog is split into sentences, every sentence is embedded as the mean of
        its GloVe word vectors, the pairwise cosine similarities are fed to PageRank and
        the ``top_n`` best scored sentences are joined into a summary.

        NOTE(review): ``preprocessing()`` removes punctuation, so ``sent_tokenize``
        presumably yields a single sentence per dialog - confirm whether that is intended.

            :arg
                glove_path (str): path of the GloVe text file (word followed by its vector).
                top_n (int): number of sentences kept in the summary.
            :return
                pandas.DataFrame: columns ``text``, ``sentences``, ``tokenized_sentences``,
                    ``SentenceEmbedding``, ``SimMatrix``, ``score`` and ``summary``.
            :raise
                RuntimeError: if ``preprocessing()`` has not been called.
                ValueError: if the GloVe file contains no vectors.
        """
        self._require_preprocessed()

        stop_words = set(stopwords.words('english'))

        def tokenization(sentences):
            return [word_tokenize(sentence) for sentence in sentences]

        def preprocess_sentence(sentence):
            # Keep ASCII letters and whitespace only ("A-Z", not "A-z", which would
            # also let through the punctuation between 'Z' and 'a')
            words = [re.sub(r'[^a-zA-Z\s]', '', word).lower() for word in sentence]
            return [word for word in words if word and word not in stop_words]

        def preprocess_sentences(sentences):
            return [preprocess_sentence(sentence) for sentence in sentences]

        dialogs = self.dialog_df['dialog'].values
        df = pd.DataFrame({'text': dialogs, 'sentences': dialogs})

        df['sentences'] = df['sentences'].apply(sent_tokenize)
        df['tokenized_sentences'] = df['sentences'].apply(tokenization)
        df['tokenized_sentences'] = df['tokenized_sentences'].apply(preprocess_sentences)

        # Load the GloVe word vectors (the default file holds 100-dimensional vectors)
        glove_dict = {}
        with open(glove_path, encoding="utf8") as glove_file:
            for line in glove_file:
                word_vector = line.split()
                if not word_vector:
                    continue
                glove_dict[word_vector[0]] = np.asarray(word_vector[1:], dtype='float32')

        if not glove_dict:
            raise ValueError(f'no word vectors found in {glove_path}')

        # The embedding size is defined by the vectors themselves, not by the dataset size
        embedding_dim = len(next(iter(glove_dict.values())))
        zero_vector = np.zeros(embedding_dim)

        # Sentence vector = mean of its word vectors (unknown words count as zeros)
        def calculate_sentence_vector(sentence):
            if len(sentence) != 0:
                return sum([glove_dict.get(word, zero_vector) for word in sentence]) / len(sentence)
            else:
                return zero_vector

        # Sentence vector for each sentence of a dialog
        def sentences_to_vectors(sentences):
            return [calculate_sentence_vector(sentence) for sentence in sentences]

        # Pairwise cosine similarity between the sentence vectors of a dialog
        def similarity_matrix(sentence_embedding):
            if len(sentence_embedding) == 0:
                return np.zeros([0, 0])
            return cosine_similarity(np.vstack(sentence_embedding))

        # Score each sentence by running PageRank on the similarity graph
        def calculate_score(sim_matrix):
            nx_graph = nx.from_numpy_array(sim_matrix)
            scores = nx.pagerank(nx_graph)
            return scores

        # Summary made of the n best scored sentences
        def ranked_sentences(sentences, scores, n=top_n):
            top_scores = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
            top_n_sentences = [sentence for score, sentence in top_scores[:n]]
            return " ".join(top_n_sentences)

        df['SentenceEmbedding'] = df['tokenized_sentences'].apply(sentences_to_vectors)
        df['SimMatrix'] = df['SentenceEmbedding'].apply(similarity_matrix)
        df['score'] = df['SimMatrix'].apply(calculate_score)
        df['summary'] = df.apply(lambda x: ranked_sentences(x.sentences, x.score), axis=1)

        return df

    @staticmethod
    def similarity(X, Y):
        """
        Compute the cosine similarity between X and Y. To avoid a division by zero,
        1e-7 is added to the denominator.

        Declared as a static method so it can be called on the class as well as on
        an instance.

            :arg
                X (numpy.ndarray): X data array
                Y (numpy.ndarray): Y data array
            :return
                float
        """
        return np.dot(X, Y) / ((norm(X) * norm(Y)) + 1e-7)


# Initialize
Import dataset, describe it briefly.

In [6]:
# Load train/test/movie data from the dataset directory
parser = RedialParser('../dataset')
parser.describe()  # Print a brief summary of the loaded dataset

# Number of conversations in the train data
num = len(parser.train)
print(f'length of train dataset: {num}')

Brief information:
Length of train data: 10006
Length of test data: 1342

Data information:
Key parameters: ['movieMentions', 'respondentQuestions', 'messages', 'conversationId', 'respondentWorkerId', 'initiatorWorkerId', 'initiatorQuestions']
Key parameters in Questions: ['suggested', 'seen', 'liked']
Key parameters in messages: ['timeOffset', 'text', 'senderWorkerId', 'messageId']

Context information:
Total mentioned movie number (train): 52918
Total mentioned movie number in unique (train): 6223
Total message number (train): 182150
Total mentioned movie number (test): 7154
Total mentioned movie number in unique (test): 2007
Total message number (test): 23952
Average mentioned movie numbers per conversation (train): 5.288626823905656
Average message numbers per conversation (train): 18.20407755346792
Average mentioned movie numbers per conversation (test): 5.330849478390462
Average message numbers per conversation (test): 17.847988077496275

length of train dataset: 10006


# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [i like animations like and ], [84779, 191602]


Specific:
* Transform dataset structure.
    * Original: [movieMentions, {messages}, conversationId, ...]
    * Transformed: [movie_indices], [message_contexts], [[1st_movie_index], [2nd_...], ...], [[1st_message_context], [2nd_...], ...]
    * Dialog DataFrame (*self.dialog_df*): one row per movie mention with the columns `movieid` and `dialog` ('1st message' + '2nd message' + ...) - used to generate the **TF-IDF** matrix
* Recognize movie indices
    * **@** recognition: use re library's *findall(@\d+)* function, it only detects '@' + index strings.
* Clean up meaningless values
    * Special characters: use the re library's pattern *\W+* to replace every run of non-word characters with a single space, so only letters, digits, and underscores remain.
    * Movie index: remove context of them by using text slicing.

In [7]:
# Clean the train messages and build parser.dialog_df (one row per movie mention)
parser.preprocessing()
parser.dialog_df

Unnamed: 0,movieid,dialog
0,84779,hi there how are you i m looking for movie re...
1,191602,hi there how are you i m looking for movie re...
2,122159,hi there how are you i m looking for movie re...
3,165710,hi there how are you i m looking for movie re...
4,151313,hi there how are you i m looking for movie re...
...,...,...
64456,204974,what type of movies do you like hi i m looki...
64457,85036,hello hi how can i help you so some of the m...
64458,170277,hello hi how can i help you so some of the m...
64459,149938,hello hi how can i help you so some of the m...


# Tokenization
* 1. Extract words and their counts related to the movies. (Not used downstream; only for visual inspection.)

In [8]:
# Tag words related to movie genres
mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
# Per-movie occurrence counts of the tag words
frequency = parser.get_frequency_matrix(mv_tags)
frequency.describe()

Unnamed: 0,comedy,scary,love,animation,artistic,war,sci,blood,hero,romantic,action
count,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0
mean,5.840227,1.32523,7.919035,0.204666,0.006469,0.583023,0.990982,0.059792,0.304058,1.089982,3.916095
std,21.739531,10.106137,20.996916,1.584999,0.091594,4.267986,6.156864,0.447819,2.396133,6.209539,19.184717
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,449.0,494.0,380.0,64.0,2.0,175.0,160.0,15.0,81.0,153.0,673.0


* 2. Normal TF-IDF

In [9]:
tfidf_mat, tfidf_columns = parser.get_tfidf_matrix(stop_words='english', min_df=0.2)

# id + word vectors as one array. Concatenating the string ids with the float
# weights yields an object-dtype array; it is kept only for code that expects `cdata`.
cdata = np.concatenate((parser.dialog_df['movieid'].to_numpy().reshape(-1, 1), tfidf_mat), axis=1)

# Build the frame from typed parts so the word columns stay float64 instead of
# inheriting the object dtype of `cdata`.
df_mv_tfidf = pd.DataFrame(tfidf_mat, columns=tfidf_columns.tolist())
df_mv_tfidf.insert(0, 'id', parser.dialog_df['movieid'].to_numpy())
df_mv_tfidf

Unnamed: 0,id,bye,check,comedy,day,did,enjoy,funny,good,great,...,saw,seen,suggestions,sure,thank,thanks,think,ve,watch,yes
0,84779,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
1,191602,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
2,122159,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
3,165710,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
4,151313,0.155447,0.0,0.232591,0.0,0.0,0.226592,0.234526,0.20677,0.0,...,0.0,0.0,0.0,0.0,0.0,0.147461,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,204974,0.0,0.0,0.0,0.0,0.173777,0.0,0.0,0.242174,0.0,...,0.182008,0.085112,0.297298,0.0,0.0,0.11514,0.0,0.0,0.139028,0.0
64457,85036,0.0,0.0,0.0,0.4115,0.0,0.248506,0.0,0.170075,0.070963,...,0.127822,0.059773,0.0,0.253538,0.290231,0.0,0.304353,0.0,0.292913,0.178612
64458,170277,0.0,0.0,0.0,0.4115,0.0,0.248506,0.0,0.170075,0.070963,...,0.127822,0.059773,0.0,0.253538,0.290231,0.0,0.304353,0.0,0.292913,0.178612
64459,149938,0.0,0.0,0.0,0.4115,0.0,0.248506,0.0,0.170075,0.070963,...,0.127822,0.059773,0.0,0.253538,0.290231,0.0,0.304353,0.0,0.292913,0.178612


* 3. TextRank TF-IDFs

In [10]:
# TextRank result for every dialog (sentences, embeddings, similarity matrix, scores, summary)
sample_df = parser.get_textrank_matrix()
sample_df

Unnamed: 0,text,sentences,tokenized_sentences,SentenceEmbedding,SimMatrix,score,summary
0,hi there how are you i m looking for movie re...,[ hi there how are you i m looking for movie r...,"[[hi, looking, movie, recommendations, okay, k...","[[-0.056827486, 0.2443628, 0.3708559, -0.29126...",[[1.0]],{0: 1.0},hi there how are you i m looking for movie re...
1,hi there how are you i m looking for movie re...,[ hi there how are you i m looking for movie r...,"[[hi, looking, movie, recommendations, okay, k...","[[-0.056827486, 0.2443628, 0.3708559, -0.29126...",[[1.0]],{0: 1.0},hi there how are you i m looking for movie re...
2,hi there how are you i m looking for movie re...,[ hi there how are you i m looking for movie r...,"[[hi, looking, movie, recommendations, okay, k...","[[-0.056827486, 0.2443628, 0.3708559, -0.29126...",[[1.0]],{0: 1.0},hi there how are you i m looking for movie re...
3,hi there how are you i m looking for movie re...,[ hi there how are you i m looking for movie r...,"[[hi, looking, movie, recommendations, okay, k...","[[-0.056827486, 0.2443628, 0.3708559, -0.29126...",[[1.0]],{0: 1.0},hi there how are you i m looking for movie re...
4,hi there how are you i m looking for movie re...,[ hi there how are you i m looking for movie r...,"[[hi, looking, movie, recommendations, okay, k...","[[-0.056827486, 0.2443628, 0.3708559, -0.29126...",[[1.0]],{0: 1.0},hi there how are you i m looking for movie re...
...,...,...,...,...,...,...,...
64456,what type of movies do you like hi i m looki...,[ what type of movies do you like hi i m look...,"[[type, movies, like, hi, looking, movie, sugg...","[[-0.027324153, 0.29030323, 0.4507456, -0.3851...",[[0.9999999403953552]],{0: 1.0},what type of movies do you like hi i m looki...
64457,hello hi how can i help you so some of the m...,[ hello hi how can i help you so some of the ...,"[[hello, hi, help, movies, really, enjoy, thou...","[[-0.16118895274332978, 0.28092597955837845, 0...",[[1.0]],{0: 1.0},hello hi how can i help you so some of the m...
64458,hello hi how can i help you so some of the m...,[ hello hi how can i help you so some of the ...,"[[hello, hi, help, movies, really, enjoy, thou...","[[-0.16118895274332978, 0.28092597955837845, 0...",[[1.0]],{0: 1.0},hello hi how can i help you so some of the m...
64459,hello hi how can i help you so some of the m...,[ hello hi how can i help you so some of the ...,"[[hello, hi, help, movies, really, enjoy, thou...","[[-0.16118895274332978, 0.28092597955837845, 0...",[[1.0]],{0: 1.0},hello hi how can i help you so some of the m...


# Similarity Metrics
* Cosine similarity

In [11]:
def c_sim(X, Y):
    """Cosine similarity between two vectors.

    1e-7 is added to the denominator to avoid a division by zero, so the result
    for identical non-zero vectors is marginally below 1.

        :arg
            X (numpy.ndarray): first vector
            Y (numpy.ndarray): second vector
        :return
            float
    """
    return np.dot(X, Y) / (1e-7 + norm(X) * norm(Y))

Recommendation function
* param:
    * matrix: array, vector space of texts (one row per item).
    * index: row index of the target movie
    * length: maximum length of recommendation
        * default: 5
    * simf: cosine similarity function
        * default: dot(X, Y) / (norm(X) * norm(Y) + 1e-7)

In [12]:
def recommend(matrix, index, length=5, simf=c_sim):
    """Return the rows of ``matrix`` that are most similar to row ``index``.

        :arg
            matrix (array-like): 2-D collection of vectors, one row per item.
            index (int): row index of the target item.
            length (int): maximum number of recommendations.
            simf (callable): similarity function taking two vectors.
        :return
            list: up to ``length`` ``(similarity, row_index)`` tuples, most similar
                first; the target row itself is excluded.
    """
    rows = np.asarray(matrix)
    target = rows[index]

    # Compare the target row with every other row
    sim = [(simf(target, row), idx) for idx, row in enumerate(rows) if idx != index]

    # Sort on the score only; the sort is stable, so ties keep their row order
    sim.sort(key=lambda pair: pair[0], reverse=True)
    return sim[:length]