# Libraries

In [10]:
# code for importing google drive in colab
# from google.colab import drive
# drive.mount('/content/drive')
# PATH = "/content/drive/MyDrive/MachineLearningTP/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
# Basic libraries
import numpy as np
import pandas as pd

import itertools

# Dataset imports
import json

# For restoring the dataset
from copy import deepcopy

# Text manipulations
import re

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler

# Cosine similarity
from numpy.linalg import norm

# File download (Golve)
from urllib.request import urlretrieve
import zipfile

# Library installation
* NLTK - Natural Language toolkit
* NetworkX - Structure, Dynamics, and Functions of complex networks Library

In [13]:
!python -m pip install nltk
!python -m pip install networkx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


NLTK: Library for NLP Process
* Usage:
    * nltk.corpus.**stopwords**: stopwords of specific language
    * nltk.tokenize.**RegexpTokenizer**: Tokenize the input sentences
    * nltk.stem.**WordNetLemmatizer**: Lemmatize the word net

In [14]:
import nltk
nltk.download('all')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloadin

# Redial Parser
A separated library for parsing the redial dataset

class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [15]:
def load_data(path):
    """
    TODO: initialization function for dataset reads

        :arg
            path (str): Dataset path.
        :return
            tuple: (train, test, df_mention)
    """
    train_data = []
    for line in open(f"{path}/train_data.jsonl", "r"):
        train_data.append(json.loads(line))

    test_data = []
    for line in open(f"{path}/test_data.jsonl", "r"):
        test_data.append(json.loads(line))

    mention_dataframe = pd.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe

def predict_rating(grouped_arr, word_sim_arr):
    ratings_pred = grouped_arr.dot(word_sim_arr) / np.array([np.abs(word_sim_arr).sum(axis=1)])
    return ratings_pred

class RedialParser:
    def __init__(self, path):
        self.train, self.test, self.movie = load_data(path)

        self._global_movie_list = None  # list of all movie names (global movie name data)
        self._global_msg_list = None  # list of whole lines (global line data)
        self._local_movie_list = None  # list of movie names (local movie name data)
        self._local_msg_list = None  # list of lines (local line data)

        self.dialog_df = None  # Sum of dialogs for each movie indices

        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        self.__model = None


    def Restore(self):
        """
        TODO: Restore train, test, and movie dataset to initial state
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)


    def Movies(self, train=True) -> dict:
        """
        TODO: Get movie list in dataset

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}

        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result


    def describe(self):
        """
        TODO: Describe its datasets
        """
        len1, len2 = len(self.train), len(self.test)
        n1, n2 = 0, 0
        m1, m2 = 0, 0

        for e in self.train:
            n1 += len(e['movieMentions'])
            m1 += len(e['messages'])
        for e in self.test:
            n2 += len(e['movieMentions'])
            m2 += len(e['messages'])

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')
    

    def preprocessing(self):
        """
        TODO: Regroup train dataset into purposed structure and clean up data
        """
        compile = re.compile("\W+")  # Format
        
        ran = range(len(self.train))

        # initialize list
        self._global_movie_list = []
        self._global_msg_list = []
        self._local_movie_list = [[] for _ in ran]
        self._local_msg_list = [[] for _ in ran]

        for i, data in enumerate(self.train):
            for msg in data['messages']:  # append line to the lists
                self._local_msg_list[i].append(msg['text'])
                self._global_msg_list.append(msg['text'])

            # Extract movie indices
            for idx, line in enumerate(self._local_msg_list[i]):
                numbers = re.findall(r'@\d+', line)  # find number keywords (ex: @12345)
                for number in numbers:
                    self._local_movie_list[i].append(number[1:])
                    self._global_movie_list.append(number[1:])

                    # Remove index string
                    pos = line.index(number)
                    line = self._local_msg_list[i][idx] = line[0: pos] + line[pos + len(number): len(line)]

                # Post: clear meaningless words
                a = compile.sub(" ", line)  # Clear special character
                self._local_msg_list[i][idx] = a.lower()  # lower character

        # Construct dialog dataframe
        self.dialog_df = pd.DataFrame(columns=["movieid", "dialog"])

        for lines, movies in zip(self._local_msg_list, self._local_movie_list):
            dig = ''
            for line in lines:  # concatenate all sentences in related message dialog
                dig += ' ' + str(line)
            
            for mv in movies:
                if self.dialog_df[self.dialog_df['movieid'] == mv].empty:
                    newrow = pd.DataFrame({'movieid': [mv], 'dialog': [dig]}, columns=self.dialog_df.columns)
                    self.dialog_df = pd.concat([self.dialog_df, newrow], ignore_index=True)
                else:
                    target = self.dialog_df[self.dialog_df['movieid'] == mv].index[0]
                    self.dialog_df.iloc[target, 1] = self.dialog_df.iloc[target, 1] + ' ' + dig
        
        # Drop NaN with empty sentence
        self.dialog_df['dialog'].dropna(how='any', inplace=True)
        

    def get_frequency_matrix(self, tags):
        """
        TODO: compute the frequency of tag words to obtain the TF-IDFs matrix

            :arg
                tags (list): list of key words.
            :return
                pandas.DataFrame: frequency matrix of tag words.
        """
        stop_word_eng = set(stopwords.words('english'))
        ran = range(len(self.train))

        msg_list = deepcopy(self._local_msg_list)

        for i in ran:
            msg_list[i] = [j for j in msg_list[i] if j not in stop_word_eng]  # Clear stopwords

        # Lemmatizer class
        lemmatizer = WordNetLemmatizer()
        token = RegexpTokenizer('[\w]+')

        x = pd.DataFrame(columns=['id'] + tags)

        for idx, msg in enumerate(msg_list):
            result_pre_lem = [token.tokenize(j) for j in msg]
            middle_pre_lem = [r for j in result_pre_lem for r in j]
            final_lem = [lemmatizer.lemmatize(j) for j in middle_pre_lem if not j in stop_word_eng]  # Remove stopword

            # Lemmatization
            english = pd.Series(final_lem)
            for word in english:
                if word in tags:
                    for movie in self._local_movie_list[idx]:
                        if x[x['id'] == movie].empty:
                            new_row = pd.DataFrame({'id': [movie]}, columns=x.columns)
                            x = pd.concat([x, new_row], ignore_index=True)
                            x.fillna(0, inplace=True)
                        x.loc[x['id'] == movie, word] += 1

        return x
    
    def get_tfidf_matrix(self, **tfidf_keys):
        """
        TODO: Compute TF-IDFs matrix

            :arg
                tfidf_keys(keyword dict): TfidfVectorizer parameters
            :return
                numpy.ndrarry: TF-IDFs matrix
                numpy.ndarray: feature name of TF-IDFs (word)
        """
        # Vectorizer class
        tfidf = TfidfVectorizer(**tfidf_keys)  # Ignore English Stopwords

        # Obtain matrix
        tfidf_df = tfidf.fit_transform(self.dialog_df['dialog'])

        return tfidf_df.toarray(), tfidf.get_feature_names_out()
    def create_evaluation_matrix(self):
        result = []
        for dialog in self.train:
          for respondent in dialog['respondentQuestions']:
            result.append([respondent,
                        float(dialog['respondentQuestions'][respondent]['suggested']+dialog['respondentQuestions'][respondent]['seen']),
                        float(dialog['respondentQuestions'][respondent]['seen']),
                        float(dialog['respondentQuestions'][respondent]['suggested']+dialog['respondentQuestions'][respondent]['seen']+dialog['respondentQuestions'][respondent]['liked'])])
          for intq in dialog['initiatorQuestions']:
            result.append([intq,
                       float(dialog['initiatorQuestions'][intq]['suggested']+dialog['initiatorQuestions'][intq]['seen']),
                       float(dialog['initiatorQuestions'][intq]['seen']),
                       float(dialog['initiatorQuestions'][intq]['suggested']+dialog['initiatorQuestions'][intq]['seen']+dialog['initiatorQuestions'][intq]['liked'])])

        test_data = pd.DataFrame(result, columns=["movie_id","suggested","seen","liked"] )
        print(test_data.values[0])
        grouped_data = test_data.groupby('movie_id').mean()
        scaler = StandardScaler()
        grouped_data = pd.DataFrame(scaler.fit_transform(grouped_data), columns = ["suggested","seen","liked"])
        print(grouped_data.values[0])
        reformat = []
        for data in grouped_data.values:
          reformat.append([data[0]*data[1], data[1], data[1]*data[2]])

        test_grouped = pd.DataFrame(reformat, columns= ["suggested","seen","liked"], index = grouped_data.index)

        eval_sim = cosine_similarity(test_grouped, test_grouped)
        test_sim = pd.DataFrame(eval_sim, index = test_grouped.index, columns=test_grouped.index)

        test_pred = predict_rating(test_grouped.transpose().values, test_sim.values)
        test_pred = pd.DataFrame(test_pred, index = test_grouped.transpose().index, columns=test_grouped.transpose().columns)
        test_data = test_pred.transpose()

        return test_data


# Initialize
Import dataset, describe it briefly.

In [16]:
parser = RedialParser('../dataset')
# parser = RedialParser(PATH+'dataset')
parser.describe()  # Describe read dataset

# Size of train data
num = len(parser.train)
print(f'length of train dataset: {num}')

Brief information:
Length of train data: 10006
Length of test data: 1342

Data information:
Key parameters: ['movieMentions', 'respondentQuestions', 'messages', 'conversationId', 'respondentWorkerId', 'initiatorWorkerId', 'initiatorQuestions']
Key parameters in Questions: ['suggested', 'seen', 'liked']
Key parameters in messages: ['timeOffset', 'text', 'senderWorkerId', 'messageId']

Context information:
Total mentioned movie number (train): 52918
Total mentioned movie number in unique (train): 6223
Total message number (train): 182150
Total mentioned movie number (test): 7154
Total mentioned movie number in unique (test): 2007
Total message number (test): 23952
Average mentioned movie numbers per conversation (train): 5.288626823905656
Average message numbers per conversation (train): 18.20407755346792
Average mentioned movie numbers per conversation (test): 5.330849478390462
Average message numbers per conversation (test): 17.847988077496275

length of train dataset: 10006


# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [i like animations like  and ], [84779, 191602]


Specific:
* Transform dataset structure.
    * Original: [movieMentions, {messages}, conversationId, ...]
    * Transformed: [movie_indices], [message_contexts], [[1st_movie_index], [2nd_...], ...], [[1st_message_context], [2nd_...], ...]
    * Dialog Dataframe (*self.dialog_df*): {'movie_id': '1st message' + '2nd message' + ...} - Used in generation of **TF-IDF** matrix
* Recognize movie indices
    * **@** recognition: use re library's *findall(@\d+)* function, it only detects '@' + index strings.
* Clean up meaningless values
    * Special characters: use re library's format *\w+*, it only receives widechar characters.
    * Movie index: remove context of them by using text slicing.

In [17]:
parser.preprocessing()
parser.dialog_df

Unnamed: 0,movieid,dialog
0,84779,hi there how are you i m looking for movie re...
1,191602,hi there how are you i m looking for movie re...
2,122159,hi there how are you i m looking for movie re...
3,165710,hi there how are you i m looking for movie re...
4,151313,hi there how are you i m looking for movie re...
...,...,...
6217,166377,hi hello there i like sci fi genetic modifica...
6218,205981,what kind of movies do you like hello i am l...
6219,106113,hi hi i like sci fi movies genetic modificati...
6220,96852,hi hi have a good day which kind of movie do ...


# Tokenization
* 1. Extract words and their counts related to the movies. (Did not used, only for eye inspection.)

In [18]:
# Tag words words related with movie genres
mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
frequency = parser.get_frequency_matrix(mv_tags)
frequency.describe()

Unnamed: 0,comedy,scary,love,animation,artistic,war,sci,blood,hero,romantic,action
count,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0,5101.0
mean,5.840227,1.32523,7.919035,0.204666,0.006469,0.583023,0.990982,0.059792,0.304058,1.089982,3.916095
std,21.739531,10.106137,20.996916,1.584999,0.091594,4.267986,6.156864,0.447819,2.396133,6.209539,19.184717
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,449.0,494.0,380.0,64.0,2.0,175.0,160.0,15.0,81.0,153.0,673.0


* 2. Normal TF-IDF

In [19]:
tfidf_mat, tfidf_columns = parser.get_tfidf_matrix(stop_words='english', min_df=0.2)

# Construct dataset with id + word vectors
cdata = np.concatenate((parser.dialog_df['movieid'].to_numpy().reshape(len(parser.dialog_df['dialog']), 1), tfidf_mat), axis=1)
df_mv_tfidf = pd.DataFrame(cdata, columns=['id'] + tfidf_columns.tolist())
df_mv_tfidf

Unnamed: 0,id,action,actually,amazing,awesome,best,better,bit,bye,care,...,type,ve,want,watch,watched,watching,welcome,wow,yeah,yes
0,84779,0.152357,0.0,0.0,0.0,0.0,0.0,0.186844,0.097666,0.190202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,191602,0.152357,0.0,0.0,0.0,0.0,0.0,0.186844,0.097666,0.190202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,122159,0.152357,0.0,0.0,0.0,0.0,0.0,0.186844,0.097666,0.190202,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,165710,0.031588,0.042188,0.026642,0.063162,0.021967,0.011711,0.014899,0.146415,0.027301,...,0.03277,0.066668,0.061108,0.156519,0.136781,0.062132,0.054333,0.026951,0.049656,0.191625
4,151313,0.066975,0.0,0.0,0.10044,0.037844,0.0,0.041067,0.085866,0.083611,...,0.0,0.0,0.0,0.093787,0.0,0.214071,0.0,0.123809,0.0,0.135432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6217,166377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.127951,0.0,...,0.0,0.0,0.193067,0.0,0.0,0.0,0.0,0.0,0.0,0.13454
6218,205981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.244901,0.333722,0.0,0.0,0.0,0.0,0.0
6219,106113,0.183991,0.0,0.0,0.0,0.0,0.0,0.0,0.117944,0.0,...,0.0,0.0,0.355936,0.0,0.0,0.0,0.0,0.22675,0.0,0.0
6220,96852,0.146467,0.0,0.0,0.0,0.0,0.0,0.0,0.09389,0.0,...,0.0,0.0,0.0,0.205103,0.0,0.0,0.0,0.0,0.0,0.098725


* 3. dataframe for collaborative filtering

In [20]:
# use tf_idf matrix for collaborative filtering
df_grouped = df_mv_tfidf.groupby('id').mean()
movie_sim = cosine_similarity(df_grouped, df_grouped)
df_movie_sim = pd.DataFrame(movie_sim, index = df_grouped.index, columns=df_grouped.index)
df_movie_sim

id,100026,100030,100043,100070,100074,100106,100165,100178,100183,100228,...,99809,99812,99824,99887,99896,99910,99955,99966,99975,99998
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100026,1.000000,0.542241,0.608078,0.546761,0.280370,0.531466,0.278996,0.377040,0.479374,0.225390,...,0.462399,0.271210,0.316781,0.481559,0.555027,0.524365,0.368573,0.304782,0.451626,0.434941
100030,0.542241,1.000000,0.776461,0.761579,0.486749,0.584837,0.449016,0.677780,0.619149,0.532805,...,0.677126,0.490264,0.577015,0.676938,0.850366,0.928482,0.495434,0.588501,0.623253,0.529665
100043,0.608078,0.776461,1.000000,0.660730,0.409561,0.555217,0.391251,0.613808,0.575934,0.307208,...,0.625114,0.404932,0.485239,0.592039,0.792800,0.756323,0.501231,0.565624,0.638127,0.480658
100070,0.546761,0.761579,0.660730,1.000000,0.497457,0.545165,0.383571,0.626201,0.605376,0.349406,...,0.709200,0.410316,0.665660,0.553250,0.821410,0.764829,0.563847,0.444316,0.566155,0.599339
100074,0.280370,0.486749,0.409561,0.497457,1.000000,0.411078,0.301147,0.454412,0.485383,0.366595,...,0.465238,0.451006,0.379348,0.260708,0.535692,0.481762,0.327270,0.295278,0.450814,0.399467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99910,0.524365,0.928482,0.756323,0.764829,0.481762,0.570706,0.421607,0.664595,0.645416,0.498414,...,0.694208,0.441440,0.550559,0.666951,0.875881,1.000000,0.523886,0.544675,0.633356,0.507396
99955,0.368573,0.495434,0.501231,0.563847,0.327270,0.489430,0.246920,0.514505,0.374919,0.187601,...,0.613648,0.289429,0.542795,0.409143,0.644002,0.523886,1.000000,0.371817,0.460136,0.283680
99966,0.304782,0.588501,0.565624,0.444316,0.295278,0.469456,0.326581,0.581260,0.367791,0.184257,...,0.390121,0.308226,0.397777,0.529702,0.588715,0.544675,0.371817,1.000000,0.424264,0.357316
99975,0.451626,0.623253,0.638127,0.566155,0.450814,0.456449,0.248160,0.571214,0.520892,0.234212,...,0.586207,0.411532,0.368589,0.478956,0.663275,0.633356,0.460136,0.424264,1.000000,0.335713


In [21]:
# creating dataframe for collaborative filtering
movie_pred = predict_rating(df_grouped.transpose().values, df_movie_sim.values)
df_movie_pred = pd.DataFrame(movie_pred, index = df_grouped.transpose().index, columns=df_grouped.transpose().columns)
collab_data = df_movie_pred.transpose()
collab_data

Unnamed: 0_level_0,action,actually,amazing,awesome,best,better,bit,bye,care,chatting,...,type,ve,want,watch,watched,watching,welcome,wow,yeah,yes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100026,0.052443,0.023127,0.016077,0.029671,0.019018,0.017308,0.016099,0.097509,0.014572,0.014165,...,0.031198,0.062071,0.034323,0.086231,0.037112,0.022843,0.032889,0.015537,0.044731,0.098666
100030,0.051889,0.023524,0.016111,0.029093,0.019240,0.017414,0.016103,0.096616,0.014618,0.014439,...,0.031506,0.062035,0.032782,0.084026,0.037226,0.023121,0.032669,0.015579,0.045503,0.100037
100043,0.053277,0.023397,0.016297,0.029253,0.019376,0.017614,0.016065,0.097119,0.014738,0.014719,...,0.031272,0.062519,0.032837,0.083787,0.037950,0.022946,0.032607,0.015968,0.045554,0.101353
100070,0.057501,0.023467,0.016254,0.029806,0.019171,0.017508,0.016146,0.096536,0.014692,0.014210,...,0.031502,0.061943,0.032800,0.084982,0.036947,0.022908,0.032663,0.015861,0.045569,0.098418
100074,0.052510,0.023710,0.016507,0.030935,0.018875,0.017115,0.016295,0.095429,0.014744,0.014427,...,0.031134,0.063066,0.032430,0.084861,0.038979,0.024404,0.032556,0.016970,0.046820,0.099070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99910,0.052081,0.023661,0.015979,0.029094,0.019487,0.017511,0.016231,0.096474,0.014712,0.014545,...,0.031342,0.062913,0.033087,0.083834,0.037139,0.022979,0.032566,0.015694,0.045527,0.099432
99955,0.059229,0.023179,0.017115,0.029016,0.019585,0.017378,0.015894,0.101775,0.014502,0.014563,...,0.030901,0.061571,0.032990,0.083334,0.038346,0.022907,0.033534,0.015397,0.046671,0.096908
99966,0.052616,0.024195,0.016776,0.029677,0.019914,0.017674,0.015884,0.098587,0.014701,0.015316,...,0.030902,0.061340,0.032370,0.085630,0.040491,0.023497,0.032523,0.016633,0.045587,0.100815
99975,0.053054,0.023070,0.016110,0.028553,0.019298,0.017285,0.016065,0.098684,0.014255,0.014429,...,0.031144,0.062627,0.032632,0.082849,0.036735,0.022656,0.032463,0.015743,0.045071,0.100296


# Similarity Metrics
* Cosine similarity

In [22]:
# Note: the consine similarity function's denominator has 1e-7 minimum value to avoid the divbyzero.
c_sim = lambda X, Y: np.dot(X, Y) / ((norm(X) * norm(Y)) + 1e-7)

Recommendation function
* param:
    * data: array, vector space of texts.
    * mv: target movie's index
    * length: maximum length of recommendation
        * default: 5
    * simf: Similarity meterices
        * default: consine similarity function

In [23]:
def recommend(df, index, matrix, length=5, simf=c_sim):
    sim = []

    if df[df['movieid'] == str(index)].empty:
        return sim

    target = df[df['movieid'] == str(index)].index[0]

    for idx, data in enumerate(matrix):
        if idx != target:
            sim.append([simf(data, matrix[target]), df.iloc[idx, 0]])
    
    sim.sort()
    sim.reverse()
    return sim[:length]

In [24]:
# result of contents filtering
df = pd.DataFrame(recommend(parser.dialog_df, 80067, tfidf_mat), columns=['Similarity', 'Movie Index'])
df

Unnamed: 0,Similarity,Movie Index
0,0.981758,140749
1,0.981445,159885
2,0.979242,81792
3,0.978515,122604
4,0.978132,182731


In [26]:
# result of collaborative filtering
pd.DataFrame(recommend(parser.dialog_df,80067, collab_data.values), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,1.0,98281
1,0.999885,112455
2,0.999882,198918
3,0.999882,166413
4,0.999881,106625


In [27]:
test_df = parser.create_evaluation_matrix()
test_df

['203371' 1.0 0.0 2.0]
[1.33668976 0.74041903 0.76857219]


Unnamed: 0,suggested,seen,liked
0,0.992990,0.419493,0.956719
1,-0.610773,0.695204,-0.535894
2,0.028872,0.702101,0.041611
3,0.634794,-0.687416,0.568101
4,0.690525,-0.694168,0.543834
...,...,...,...
6203,-0.011038,0.708648,0.025419
6204,-0.028872,-0.702101,-0.041611
6205,1.070435,0.312572,1.094999
6206,-0.690525,0.694168,-0.543834


In [28]:
pd.DataFrame(recommend(parser.dialog_df,80067,test_df.values), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,1.0,93091
1,1.0,92480
2,1.0,87094
3,1.0,86104
4,1.0,84561


# Dirty data test
* Since we couldn't find the ditry data, we test it (spam) with hand-written data.

In [29]:
dirty_row = pd.DataFrame({
    'movieid': ['999995', '999996', '999997', '999998', '999999'],
    'dialog': ['hi avenger right right hero care care thanks right hero hero hero help help pretty planning captain movie recommendations  am fiance super super super of movies hero avenger  hero i captain avenger nigh night and pretty super enjoy hero anything  pretty super might super hero super was a good pretty what s avenger super avenger great super  avenger it avenger about a baby avenger works for a company and gets  adopted it hero avenger funny avenger seems avenger amazing amazing a obsessed hero favorite pretty have hero animated  recommendations amer hero hero action captain avenger captain hero hero america like comedies hero i hero hero avenger a avenger more depth that is a tough one but i will remember  captain captain was hero good one action thanks seems cool thanks for the avenger avenger ready avenger hero if hero are hero end animated great firestick animated animated hero captain action glad  captain captain i could help nice take care hero avenger',
    'comedy scary love animation artistic war sci blood hero romantic action recommendation happy fine animation artistic war sci blood hero romantic action comedy scary love comedy scary love war war war hero romantic action comedy hero romantic action comedy',
    'this is a hero movie that kids like to show there is many heros with sci fi mechanism and lot of kids likes it much and also parents are liked it too I will gald to introduct that thank you for listening whatsup other recommendation is animation it has robot character that is cute all group of ages liked this movie',
    'funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny funny',
    'savior hero savior action savior action savior savior savior savior savior action hero savior savior savior action hero savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior savior action savior savior savior savior savior hero savior savior savior savior savior hero savior savior savior savior savior']
})
dirty_row

Unnamed: 0,movieid,dialog
0,999995,hi avenger right right hero care care thanks r...
1,999996,comedy scary love animation artistic war sci b...
2,999997,this is a hero movie that kids like to show th...
3,999998,funny funny funny funny funny funny funny funn...
4,999999,savior hero savior action savior action savior...


In [30]:
original = parser.dialog_df.copy()

In [31]:
parser.dialog_df = pd.concat([original, dirty_row], ignore_index=True)

tfidf_mat, tfidf_columns = parser.get_tfidf_matrix(stop_words='english', min_df=0.2)

# Construct dataset with id + word vectors
cdata = np.concatenate((parser.dialog_df['movieid'].to_numpy().reshape(len(parser.dialog_df['dialog']), 1), tfidf_mat), axis=1)
df_mv_tfidf = pd.DataFrame(cdata, columns=['id'] + tfidf_columns.tolist())
df_mv_tfidf[df_mv_tfidf['id'] == '999999']

Unnamed: 0,id,action,actually,amazing,awesome,best,better,bit,bye,care,...,type,ve,want,watch,watched,watching,welcome,wow,yeah,yes
6226,999999,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
df = pd.DataFrame(recommend(parser.dialog_df, 999999, tfidf_mat), columns=['Similarity', 'Movie Index'])
df

Unnamed: 0,Similarity,Movie Index
0,0.688829,195791
1,0.688829,116774
2,0.679817,120270
3,0.647287,79205
4,0.606128,999996
