# Libraries

In [None]:
# Basic libraries
import numpy as np
import pandas as pd

# Dataset imports
import json

# For restoring the dataset
from copy import deepcopy

# Text manipulations
import re

# TF-IDF
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity
from numpy.linalg import norm

# File download (GloVe)
import os.path
from urllib.request import urlretrieve
import zipfile

In [None]:
# Mount point of Google Drive inside the Colab runtime
DRIVE_PATH = '/content/drive'
# Project folder, relative to the mount point (turned into an absolute path below)
BASE_PATH = '/MyDrive/MachineLearningTP/'
# Sub-folder of BASE_PATH that holds the GloVe files
GLOVE_PATH = 'glove/'

from google.colab import drive
drive.mount(DRIVE_PATH)
# From here on BASE_PATH is the absolute project path on the mounted drive
BASE_PATH = DRIVE_PATH + BASE_PATH

* Download File: GloVe6B.zip - glove100d

In [None]:
# Locations of the extracted 100-d embedding file and of the downloaded archive
glove_dir = BASE_PATH + GLOVE_PATH
filepath = glove_dir + 'glove.6B.100d.txt'
zippath = glove_dir + 'glove.6B.zip'

if not os.path.exists(filepath):
    # makedirs also creates missing parent folders and tolerates an existing one
    os.makedirs(glove_dir, exist_ok=True)

    if not os.path.exists(zippath):
        # Download under a temporary name and rename on success, so an interrupted
        # download is never mistaken for a complete archive on the next run.
        partial_zippath = zippath + '.part'
        urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", filename=partial_zippath)
        os.replace(partial_zippath, zippath)

In [None]:
# Extract all archive members, but only when the embedding file is not there yet.
# The context manager closes the archive even if extraction fails.
if not os.path.exists(filepath):
    with zipfile.ZipFile(zippath) as zf:
        zf.extractall(BASE_PATH + GLOVE_PATH)

# Library installation
* NLTK - Natural Language toolkit
* NetworkX - Structure, Dynamics, and Functions of complex networks Library

In [None]:
# Shell magics: install NLTK and NetworkX into the notebook's Python environment
!python -m pip install nltk
!python -m pip install networkx

NLTK: Library for NLP Process
* Usage:
    * nltk.corpus.**stopwords**: stopwords of specific language
    * nltk.tokenize.**RegexpTokenizer, sent_tokenize, word_tokenize**: Tokenize the input sentences
    * nltk.stem.**WordNetLemmatizer**: Lemmatize words (reduce each word to its dictionary form) using WordNet

In [None]:
import nltk

# Download only the resources this notebook uses instead of the whole NLTK
# collection: stopwords (stop-word list), punkt / punkt_tab (sentence and word
# tokenizers; which one is needed depends on the NLTK version), wordnet and
# omw-1.4 (WordNetLemmatizer).
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
import networkx as nx

# Redial Parser
A separated library for parsing the redial dataset

class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [None]:
def _read_jsonl(file_path):
    """Return the JSON objects stored one per line in *file_path* (blank lines are skipped)."""
    with open(file_path, "r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def load_data(path):
    """
    Read the train/test conversations and the movie-mention table from a dataset folder.

        :arg
            path (str): Dataset path. It must contain train_data.jsonl,
                test_data.jsonl and movies_with_mentions.csv.
        :return
            tuple: (train, test, df_mention) - two lists of parsed JSON objects
                and a pandas.DataFrame read from the CSV file.
    """
    # The files are opened through a context manager so the handles are closed
    # even when a line fails to parse.
    train_data = _read_jsonl(f"{path}/train_data.jsonl")
    test_data = _read_jsonl(f"{path}/test_data.jsonl")

    mention_dataframe = pd.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe



def predict_rating(grouped_arr, word_sim_arr):
    """
    Similarity-weighted prediction used for the collaborative-filtering tables.

        :arg
            grouped_arr (numpy.ndarray): values to propagate, one column per item.
            word_sim_arr (numpy.ndarray): square item-to-item similarity matrix.
        :return
            numpy.ndarray: grouped_arr . word_sim_arr, each column divided by the
                sum of absolute similarities of the matching row of word_sim_arr.
    """
    weighted_sum = grouped_arr.dot(word_sim_arr)
    # Wrapped in an outer list so the row totals broadcast as a (1, n) row vector
    row_totals = np.array([np.abs(word_sim_arr).sum(axis=1)])
    return weighted_sum / row_totals


class RedialParser:
    """
    Loader and feature builder for the ReDial dataset.

    Public attributes:
        train (list): train conversations, as returned by load_data().
        test (list): test conversations, as returned by load_data().
        movie (pandas.DataFrame): movie mention table, as returned by load_data().
        dialog_df (pandas.DataFrame): one row per mentioned movie; created by
            preprocessing() and extended with extra columns by make_summary().
    """

    def __init__(self, path):
        """
        Read the dataset, snapshot it for Restore(), and load the GloVe vectors.

            :arg
                path (str): Dataset path, forwarded to load_data().
        """
        self.train, self.test, self.movie = load_data(path)

        self._global_movie_list = None  # list of all movie names (global movie name data)
        self._global_msg_list = None  # list of whole lines (global line data)
        self._local_movie_list = None  # list of movie names (local movie name data)
        self._local_msg_list = None  # list of lines (local line data)

        self.dialog_df = None  # Sum of dialogs for each movie indices

        # Pristine copies handed back by Restore()
        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        # Import 100-D GloVe Embedding Vector (word -> float32 vector).
        # The context manager closes the file even if a line fails to parse.
        self.__glove_dict = dict()
        with open(BASE_PATH + GLOVE_PATH + 'glove.6B.100d.txt', encoding="utf8") as f:
            for line in f:
                word_vector = line.split()
                self.__glove_dict[word_vector[0]] = np.asarray(word_vector[1:], dtype='float32')

    def Restore(self):
        """
        Restore train, test, and movie dataset to initial state.
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)

    def Movies(self, train=True) -> dict:
        """
        Get movie list in dataset.

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        # Any value other than True / False selects nothing and yields {}
        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}
        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result

    def describe(self):
        """
        Print size and content statistics of the train and test datasets.
        """
        len1, len2 = len(self.train), len(self.test)
        n1 = sum(len(e['movieMentions']) for e in self.train)
        m1 = sum(len(e['messages']) for e in self.train)
        n2 = sum(len(e['movieMentions']) for e in self.test)
        m2 = sum(len(e['messages']) for e in self.test)

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')

    def preprocessing(self):
        """
        Regroup train dataset into purposed structure and clean up data.

        Fills the four message / movie lists and builds dialog_df with the
        columns "movieid" and "dialog" (all cleaned text of every conversation
        that mentions the movie, one copy per mention).
        """
        mention_pattern = re.compile(r'@\d+')  # movie references look like @12345
        n_dialogs = len(self.train)

        # initialize list
        self._global_movie_list = []
        self._global_msg_list = []
        self._local_movie_list = [[] for _ in range(n_dialogs)]
        self._local_msg_list = [[] for _ in range(n_dialogs)]

        for i, data in enumerate(self.train):
            for msg in data['messages']:
                text = msg['text']
                # The global list keeps the raw line, the local list the cleaned one
                self._global_msg_list.append(text)

                for mention in mention_pattern.findall(text):
                    self._local_movie_list[i].append(mention[1:])
                    self._global_movie_list.append(mention[1:])

                # Remove index string
                self._local_msg_list[i].append(mention_pattern.sub('', text))

        # Collect the dialog pieces per movie in a dict (insertion order = first
        # mention order) instead of growing a DataFrame row by row.
        dialog_parts = {}
        for lines, movies in zip(self._local_msg_list, self._local_movie_list):
            # concatenate all sentences in related message dialog
            dig = ''.join(' ' + str(line) for line in lines)
            for mv in movies:
                dialog_parts.setdefault(mv, []).append(dig)

        # Construct dialog dataframe
        self.dialog_df = pd.DataFrame(
            {
                'movieid': list(dialog_parts),
                'dialog': [' '.join(parts) for parts in dialog_parts.values()],
            },
            columns=["movieid", "dialog"],
        )

        # Drop rows without a dialog. This has to be done on the frame itself:
        # an in-place dropna on the selected column leaves the frame unchanged.
        self.dialog_df = self.dialog_df.dropna(subset=['dialog']).reset_index(drop=True)

    @staticmethod
    def _subsample_sentences(sentences, sentence_max=100):
        """Keep every k-th sentence so that at most sentence_max sentences remain."""
        if not sentences:
            return sentences  # nothing to thin out; also avoids a slice step of 0
        step = -(-len(sentences) // sentence_max)  # ceiling division
        return sentences[::step]

    @staticmethod
    def _clean_tokens(tokens, stop_words):
        """Strip non-letters from each token, lower-case it, drop stop words and empties."""
        cleaned = (re.sub(r'[^a-zA-Z\s]', '', word).lower() for word in tokens)
        return [word for word in cleaned if word and word not in stop_words]

    def make_summary(self):
        """
        Make summary of dialog using GloVe + TextRank.

        Adds the columns 'sentences', 'tokenized_sentences', 'SentenceEmbedding',
        'SimMatrix', 'score' and 'summary' to dialog_df. The summary is the three
        best-ranked sentences joined by a space.
        """
        stop_words = set(stopwords.words('english'))

        # Embedding Dimension = 100 = GloVe dimension
        embedding_dim = 100
        zero_vector = np.zeros(embedding_dim)
        glove = self.__glove_dict

        # Obtain the sentence vector from the mean of words
        def calculate_sentence_vector(sentence):
            if not sentence:
                return zero_vector
            return sum([glove.get(word, zero_vector) for word in sentence]) / len(sentence)

        # Pairwise cosine similarity of all sentence vectors, computed in one call
        def similarity_matrix(sentence_embedding):
            if len(sentence_embedding) == 0:
                return np.zeros([0, 0])
            return cosine_similarity(np.vstack(sentence_embedding))

        # TextRank. nx.pagerank replaces pagerank_numpy, which NetworkX 3.0 removed.
        def calculate_score(sim_matrix):
            return nx.pagerank(nx.from_numpy_array(sim_matrix))

        # Write summary using TextRank score
        def ranked_sentences(sentences, scores, n=3):
            top_scores = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
            return " ".join(sentence for _, sentence in top_scores[:n])

        df = self.dialog_df
        df['sentences'] = df['dialog'].apply(sent_tokenize).apply(self._subsample_sentences)
        df['tokenized_sentences'] = df['sentences'].apply(
            lambda sentences: [self._clean_tokens(word_tokenize(s), stop_words) for s in sentences]
        )
        df['SentenceEmbedding'] = df['tokenized_sentences'].apply(
            lambda sentences: [calculate_sentence_vector(s) for s in sentences]
        )
        df['SimMatrix'] = df['SentenceEmbedding'].apply(similarity_matrix)
        df['score'] = df['SimMatrix'].apply(calculate_score)
        df['summary'] = df.apply(lambda row: ranked_sentences(row.sentences, row.score), axis=1)

    def get_frequency_matrix(self, tags):
        """
        Count how often each tag word occurs in the conversations of each movie.

        Requires preprocessing() to have been called. Tokens are matched
        case-sensitively after stop-word removal and lemmatization. A movie
        mentioned k times in a conversation receives k counts per occurrence.

            :arg
                tags (list): list of key words.
            :return
                pandas.DataFrame: frequency matrix of tag words, with columns
                    'id' + tags and one row per movie that has at least one hit.
        """
        stop_word_eng = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        tokenizer = RegexpTokenizer(r'[\w]+')
        tag_set = set(tags)

        counts = {}  # movie id -> {tag: count}, in first-hit order
        for movies, messages in zip(self._local_movie_list, self._local_msg_list):
            for message in messages:
                for raw_token in tokenizer.tokenize(message):
                    if raw_token in stop_word_eng:  # Remove stopword
                        continue
                    word = lemmatizer.lemmatize(raw_token)
                    if word not in tag_set:
                        continue
                    for movie in movies:
                        row = counts.setdefault(movie, dict.fromkeys(tags, 0))
                        row[word] += 1

        rows = [[movie] + [row[tag] for tag in tags] for movie, row in counts.items()]
        return pd.DataFrame(rows, columns=['id'] + list(tags))

    def get_tfidf_matrix(self, **tfidf_keys):
        """
        Compute TF-IDFs matrix of dialog_df['dialog'].

            :arg
                tfidf_keys(keyword dict): TfidfVectorizer parameters
            :return
                numpy.ndarray: TF-IDFs matrix
                numpy.ndarray: feature name of TF-IDFs (word)
        """
        # Vectorizer class, configured entirely by the caller's keyword arguments
        tfidf = TfidfVectorizer(**tfidf_keys)

        # Obtain matrix
        tfidf_df = tfidf.fit_transform(self.dialog_df['dialog'])

        return tfidf_df.toarray(), tfidf.get_feature_names_out()

    def create_evaluation_matrix(self):
        """
        Build a per-movie rating table from the questionnaire answers of the train data.

        Every answer contributes (suggested + seen, seen, suggested + seen + liked).
        The rows are averaged per movie, standardized, recombined into
        (suggested * seen, seen, seen * liked) and smoothed with predict_rating()
        over the movie-to-movie cosine similarity.

            :return
                pandas.DataFrame: indexed by movie id, with the columns
                    "suggested", "seen" and "liked".
        """
        result = []
        for dialog in self.train:
            for key in ('respondentQuestions', 'initiatorQuestions'):
                questions = dialog[key]
                for movie_id in questions:
                    answer = questions[movie_id]
                    suggested_seen = answer['suggested'] + answer['seen']
                    result.append([movie_id,
                                   float(suggested_seen),
                                   float(answer['seen']),
                                   float(suggested_seen + answer['liked'])])

        test_data = pd.DataFrame(result, columns=["movie_id", "suggested", "seen", "liked"])
        grouped_data = test_data.groupby('movie_id').mean()

        scaled = StandardScaler().fit_transform(grouped_data)
        suggested, seen, liked = scaled[:, 0], scaled[:, 1], scaled[:, 2]

        # Keep the movie ids as index: the scaler returns a bare array, and without
        # re-attaching them the result would be indexed 0..n-1.
        test_grouped = pd.DataFrame(
            {"suggested": suggested * seen, "seen": seen, "liked": seen * liked},
            index=grouped_data.index,
        )

        eval_sim = cosine_similarity(test_grouped, test_grouped)
        test_pred = predict_rating(test_grouped.transpose().values, eval_sim)

        return pd.DataFrame(test_pred.transpose(), index=test_grouped.index, columns=test_grouped.columns)


# Initialize
Import dataset, describe it briefly.

In [None]:
# Build the parser from the `dataset` folder under BASE_PATH
parser = RedialParser(BASE_PATH + 'dataset')
parser.describe()  # Describe read dataset

# Size of train data
num = len(parser.train)
print(f'length of train dataset: {num}')

# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [i like animations like  and ], [84779, 191602]


Specific:
* Transform dataset structure.
    * Original: [movieMentions, {messages}, conversationId, ...]
    * Transformed: [movie_indices], [message_contexts], [[1st_movie_index], [2nd_...], ...], [[1st_message_context], [2nd_...], ...]
    * Dialog Dataframe (*self.dialog_df*): {'movie_id': '1st message' + '2nd message' + ...} - Used in generation of **TF-IDF** matrix
* Recognize movie indices
    * **@** recognition: use re library's *findall(@\d+)* function, it only detects '@' + index strings.
* Clean up meaningless values
    * Special characters: use the re pattern *\w+*, which matches only word characters (letters, digits, and underscore).
    * Movie index: remove context of them by using text slicing.

In [None]:
# Run the preprocessing step, then display the dialog table it builds
parser.preprocessing()
parser.dialog_df

* Add dirty data

In [None]:
# Keep an independent copy of the dialog table as it is at this point
original_df = parser.dialog_df.copy()

In [None]:
dirty_row = pd.DataFrame({
    'movieid': ['999995', '999996', '999997', '999998', '999999'],
    'dialog': ['hi avenger right right hero care care thanks right hero hero hero help help pretty planning captain movie recommendations  am fiance super super super of movies hero avenger  hero i captain avenger nigh night and pretty super enjoy hero anything  pretty super might super hero super was a good pretty what s avenger super avenger great super  avenger it avenger about a baby avenger works for a company and gets  adopted it hero avenger funny avenger seems avenger amazing amazing a obsessed hero favorite pretty have hero animated  recommendations amer hero hero action captain avenger captain hero hero america like comedies hero i hero hero avenger a avenger more depth that is a tough one but i will remember  captain captain was hero good one action thanks seems cool thanks for the avenger avenger ready avenger hero if hero are hero end animated great firestick animated animated hero captain action glad  captain captain i could help nice take care hero avenger',
    'comedy scary love animation artistic war sci blood hero romantic action recommendation happy fine animation artistic war sci blood hero romantic action comedy scary love comedy scary love war war war hero romantic action comedy hero romantic action comedy',
    'this is a hero movie that kids like to show there is many heros with sci fi mechanism and lot of kids likes it much and also parents are liked it too I will gald to introduct that thank you for listening whatsup other recommendation is animation it has robot character that is cute all group of ages liked this movie',
    'funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny',
    'savior hero savior action savior action savior savior savior savior savior action hero savior savior savior action hero savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior action savior savior savior savior savior hero savior savior savior savior savior hero savior savior savior savior savior']
})
dirty_row

In [None]:
# Append the rows of dirty_row to the dialog table (index renumbered) and show the last 10 rows
parser.dialog_df = pd.concat([parser.dialog_df, dirty_row], ignore_index=True)
parser.dialog_df.tail(10)

# Tokenization
* 1. Obtain keywords (summary) using **TextRank**

In [None]:
# Run the summary step on parser.dialog_df, then display the table
parser.make_summary()
parser.dialog_df

* 2. Extract words and their counts related to the movies. (Not used downstream; for visual inspection only.)

In [None]:
# Tag words related to movie genres
mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
# Per-movie counts of the tag words, followed by summary statistics of that table
frequency = parser.get_frequency_matrix(mv_tags)
frequency.describe()

* 3. TF-IDF

In [None]:
tfidf_mat, tfidf_columns = parser.get_tfidf_matrix(stop_words='english', min_df=0.2)

# Construct dataset with id + word vectors.
# The frame is built from the float matrix and the ids are inserted as a separate
# first column: concatenating the string ids with the matrix in one array would
# coerce every TF-IDF column to `object` dtype.
df_mv_tfidf = pd.DataFrame(tfidf_mat, columns=tfidf_columns.tolist())
df_mv_tfidf.insert(0, 'id', parser.dialog_df['movieid'].to_numpy())
df_mv_tfidf

* 4. dataframe for collaborative filtering

In [None]:

# Average the TF-IDF rows per movie id. NOTE: groupby sorts the groups by key by
# default, so the row order of df_grouped follows the sorted ids rather than the
# row order of df_mv_tfidf.
df_grouped = df_mv_tfidf.groupby('id').mean()
# Movie-to-movie cosine similarity of the averaged TF-IDF vectors
word_sim = cosine_similarity(df_grouped, df_grouped)
df_word_sim = pd.DataFrame(word_sim, index = df_grouped.index, columns=df_grouped.index)
df_word_sim

In [None]:

# Similarity-weighted TF-IDF values: predict_rating works on the (words x movies) layout
word_pred = predict_rating(df_grouped.transpose().values, df_word_sim.values)
df_word_pred = pd.DataFrame(word_pred, index = df_grouped.transpose().index, columns=df_grouped.transpose().columns)
# Back to one row per movie (same index as df_grouped)
collab_data = df_word_pred.transpose()
collab_data

# Similarity Metrics
* Cosine similarity

In [None]:
def c_sim(X, Y):
    """
    Cosine similarity of the vectors X and Y.

    Note: the cosine similarity function's denominator has 1e-7 minimum value
    to avoid the divbyzero.
    """
    denominator = 1e-7 + norm(X) * norm(Y)
    return np.dot(X, Y) / denominator

Recommendation function
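* param:
    * df: DataFrame whose first column, `movieid`, holds the movie index of each row of `matrix`
    * index: target movie's index
    * matrix: array, vector space of texts (one row per row of `df`)
    * length: maximum length of recommendation
        * default: 5
    * simf: similarity metric
        * default: cosine similarity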

In [None]:
def recommend(df, index, matrix, length=5, simf=c_sim):
    """
    Rank the other rows of *matrix* by similarity to the row of the target movie.

        :arg
            df (pandas.DataFrame): has a 'movieid' column; its first column is
                reported as the movie index of each result.
            index: target movie's index; compared as str(index).
            matrix: row vectors, addressed by the index labels / row positions of df.
            length (int): maximum length of recommendation.
            simf (callable): similarity function taking two vectors.
        :return
            list: up to *length* [similarity, movie index] pairs, best first;
                an empty list when the target movie is not in df.
    """
    matches = df[df['movieid'] == str(index)]
    if matches.empty:
        return []

    target = matches.index[0]

    # The target row itself is left out of the candidates
    scored = [
        [simf(row, matrix[target]), df.iloc[position, 0]]
        for position, row in enumerate(matrix)
        if position != target
    ]

    # Ascending sort followed by a reversal: highest similarity first
    scored.sort()
    scored.reverse()
    return scored[:length]

In [None]:
# Up to 10 movies closest to movie 80067 in the raw TF-IDF space
pd.DataFrame(recommend(parser.dialog_df, 80067, tfidf_mat, 10), columns=['Similarity', 'Movie Index'])

In [None]:
# Up to 10 movies closest to movie 80067 in the collaborative-filtering space.
# recommend() pairs row i of the matrix with row i of the id table, so the ids are
# taken from collab_data's own index; parser.dialog_df can be in a different row order.
collab_ids = pd.DataFrame({'movieid': collab_data.index.astype(str)})
pd.DataFrame(recommend(collab_ids, 80067, collab_data.values, 10), columns=['Similarity', 'Movie Index'])

# Evaluation with test data

In [None]:
# Build the evaluation table from the parser's questionnaire data and display it
test_df = parser.create_evaluation_matrix()
test_df

In [None]:
# recommend() expects (df with a 'movieid' column, target index, matrix, length).
# The id table is derived from test_df's index so that it lines up row by row
# with test_df.values.
test_ids = pd.DataFrame({'movieid': test_df.index.astype(str)})
pd.DataFrame(recommend(test_ids, '80067', test_df.values, 30), columns=['Similarity', 'Movie Index'])