# Redial Parser
A separated library for parsing the redial dataset

In [1]:
!mkdir redial_parser

mkdir: redial_parser: File exists


class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [2]:
%%writefile redial_parser/__init__.py
# Note: Write down the parser class in the separated file, redial_parser.
# Since we want the word2vec notebook to contain the context of parser, we put the writefile to do that.

from copy import deepcopy

import json
import pandas


def load_data(path):
    """
    TODO: initialization function for dataset reads

        :arg
            path (str): Dataset path.
        :return
            tuple: (train, test, df_mention)
    """
    train_data = []
    for line in open(f"{path}/train_data.jsonl", "r"):
        train_data.append(json.loads(line))

    test_data = []
    for line in open(f"{path}/test_data.jsonl", "r"):
        test_data.append(json.loads(line))

    mention_dataframe = pandas.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe



class RedialParser:
    def __init__(self, path):
        self.train, self.test, self.movie = load_data(path)

        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        self.__model = None

    def Restore(self):
        """
        TODO: Restore train, test, and movie dataset to initial state
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)

    def Movies(self, train=True) -> dict:
        """
        TODO: Get movie list in dataset

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}

        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result

    def describe(self):
        """
        TODO: Describe its datasets
        """
        len1, len2 = len(self.train), len(self.test)
        n1, n2 = 0, 0
        m1, m2 = 0, 0

        for e in self.train:
            n1 += len(e['movieMentions'])
            m1 += len(e['messages'])
        for e in self.test:
            n2 += len(e['movieMentions'])
            m2 += len(e['messages'])

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')


Overwriting redial_parser/__init__.py


# NLTK - Natural Language toolkit

In [3]:
!python -m pip install nltk



Import a library for NLP.
- download('all'): Clean all the meaningless words (punch, stopwords, etc.)

In [4]:
# import nltk
# nltk.download('all')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [5]:
from redial_parser import RedialParser

# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [I like animations like 84779 and 191602], [84779, 191602]

In [6]:
# Special character removal
import re

compile = re.compile("\W+")
line3 = []  # line3: list of whole lines (global line data)
name = []  # list of all movie names (global movie name data)

parser = RedialParser('/Users/gimdongjun/MachineLearningTP/dataset')
parser.describe()

num = len(parser.train)
print(f'num: {num}')

# Note: limited the size of list to 10 because of the size error
#maximum_num = num
maximum_num = num

movienum = [[] for _ in range(maximum_num)]  # list of movie names (local movie name data)
line2 = [[] for _ in range(maximum_num)]  # list of lines (local line data)

for i in range(maximum_num):
    for msg in parser.train[i]['messages']:  # append line to the lists
        line2[i].append(msg["text"])
        line3.append(msg["text"])

    for idx, line in enumerate(line2[i]):
      a = compile.sub(" ", line)  # Clear special character
      line = line2[i][idx] = a.lower()  # lower character
      
      numbers = re.findall(r'\d+', line)
      for k in range(len(numbers)):
        if len(numbers[k]) >= 4:
          movienum[i].append(numbers[k])
          name.append(numbers[k])

Brief information:
Length of train data: 10006
Length of test data: 1342

Data information:
Key parameters: ['movieMentions', 'respondentQuestions', 'messages', 'conversationId', 'respondentWorkerId', 'initiatorWorkerId', 'initiatorQuestions']
Key parameters in Questions: ['suggested', 'seen', 'liked']
Key parameters in messages: ['timeOffset', 'text', 'senderWorkerId', 'messageId']

Context information:
Total mentioned movie number (train): 52918
Total mentioned movie number in unique (train): 6223
Total message number (train): 182150
Total mentioned movie number (test): 7154
Total mentioned movie number in unique (test): 2007
Total message number (test): 23952
Average mentioned movie numbers per conversation (train): 5.288626823905656
Average message numbers per conversation (train): 18.20407755346792
Average mentioned movie numbers per conversation (test): 5.330849478390462
Average message numbers per conversation (test): 17.847988077496275

num: 10006


Tokenization: Extract the terms related to the movies

1. Remove stopwords

In [7]:
# Library import
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

In [8]:
stop_word_eng = set(stopwords.words('english'))

for i in range(maximum_num):
  line2[i] = [j for j in line2[i] if j not in stop_word_eng]  # Clear stopwords

# Lemmatizer class
lemmatizer = WordNetLemmatizer()
token = RegexpTokenizer('[\w]+')

mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
x = pd.DataFrame(columns=['id'] + mv_tags)

for i in range(maximum_num):
  result_pre_lem = [token.tokenize(j) for j in line2[i]]
  middle_pre_lem = [r for j in result_pre_lem for r in j]
  final_lem = [lemmatizer.lemmatize(j) for j in middle_pre_lem if not j in stop_word_eng]  # Remove stopword

  # Lemmatization
  english = pd.Series(final_lem)
  for j in english:
    if j in mv_tags:
      for k in movienum[i]:
        if x[x['id'] == k].empty:
          new_row = pd.DataFrame({'id': [k]}, columns=x.columns)
          x = pd.concat([x, new_row], ignore_index=True)
          x.fillna(0, inplace=True)
          x.loc[x['id'] == k, j] += 1

x

Unnamed: 0,id,comedy,scary,love,animation,artistic,war,sci,blood,hero,romantic,action
0,84779,0,0,0,1,0,0,0,0,0,0,0
1,191602,0,0,0,1,0,0,0,0,0,0,0
2,122159,0,0,0,1,0,0,0,0,0,0,0
3,165710,0,0,0,1,0,0,0,0,0,0,0
4,151313,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5170,205981,0,0,0,1,0,0,0,0,0,0,0
5171,106113,0,0,0,0,0,0,1,0,0,0,0
5172,96852,0,0,0,0,0,0,0,0,0,0,1
5173,112404,0,0,0,0,0,0,0,0,0,0,1


# TF-IDF
Calculate the importance of terms in documents

1. Construct dataframe with [movieid - all terms in dialog] relationship

In [9]:
df = pd.DataFrame(columns=["movieid", "dialog"])

for i in range(maximum_num):
  dig = ''
  for line in line2[i]:  # concatenate all sentences in related message dialog
    dig += str(line)
  
  for mv in movienum[i]:
    newrow = pd.DataFrame({'movieid': [mv], 'dialog': [dig]}, columns=df.columns)
    df = pd.concat([df, newrow], ignore_index=True)
    
df

Unnamed: 0,movieid,dialog
0,84779,hi there how are you i m looking for movie rec...
1,191602,hi there how are you i m looking for movie rec...
2,122159,hi there how are you i m looking for movie rec...
3,165710,hi there how are you i m looking for movie rec...
4,151313,hi there how are you i m looking for movie rec...
...,...,...
65013,204974,what type of movies do you like hi i m looking...
65014,85036,hello hihow can i help youso some of the movie...
65015,170277,hello hihow can i help youso some of the movie...
65016,149938,hello hihow can i help youso some of the movie...


In [10]:
# Fill NaN with empty sentence
df['dialog'].fillna('', inplace=True)
df['dialog']

0        hi there how are you i m looking for movie rec...
1        hi there how are you i m looking for movie rec...
2        hi there how are you i m looking for movie rec...
3        hi there how are you i m looking for movie rec...
4        hi there how are you i m looking for movie rec...
                               ...                        
65013    what type of movies do you like hi i m looking...
65014    hello hihow can i help youso some of the movie...
65015    hello hihow can i help youso some of the movie...
65016    hello hihow can i help youso some of the movie...
65017    hello hihow can i help youso some of the movie...
Name: dialog, Length: 65018, dtype: object

2. Make a TF-IDFs matrix

In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Vectorizer class
tfidf = TfidfVectorizer(stop_words='english', min_df=0.2)  # Ignore English Stopwords

# Obtain matrix
tfidf_mat = tfidf.fit_transform(df['dialog']).toarray()

# Construct dataset with id + word vectors
cdata = np.concatenate((df['movieid'].to_numpy().reshape(len(df['dialog']), 1), tfidf_mat), axis=1)
df_mv_tfidf = pd.DataFrame(cdata, columns=['id'] + tfidf.get_feature_names_out().tolist())
df_mv_tfidf

Unnamed: 0,id,bye,check,day,did,enjoy,good,great,haven,heard,...,recommend,saw,seen,suggestions,thank,thanks,think,ve,watch,yes
0,84779,0.198274,0.0,0.0,0.0,0.244056,0.223147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.183523,0.0,0.0,0.0,0.0
1,191602,0.198274,0.0,0.0,0.0,0.244056,0.223147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.183523,0.0,0.0,0.0,0.0
2,122159,0.198274,0.0,0.0,0.0,0.244056,0.223147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.183523,0.0,0.0,0.0,0.0
3,165710,0.198274,0.0,0.0,0.0,0.244056,0.223147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.183523,0.0,0.0,0.0,0.0
4,151313,0.198274,0.0,0.0,0.0,0.244056,0.223147,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.183523,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65013,204974,0.0,0.0,0.0,0.175039,0.0,0.240165,0.0,0.136755,0.0,...,0.0,0.176155,0.082112,0.307302,0.0,0.131679,0.0,0.0,0.136393,0.0
65014,85036,0.0,0.0,0.417921,0.0,0.156197,0.214223,0.091721,0.121983,0.0,...,0.0,0.157127,0.073242,0.0,0.407384,0.0,0.376058,0.0,0.364981,0.0
65015,170277,0.0,0.0,0.417921,0.0,0.156197,0.214223,0.091721,0.121983,0.0,...,0.0,0.157127,0.073242,0.0,0.407384,0.0,0.376058,0.0,0.364981,0.0
65016,149938,0.0,0.0,0.417921,0.0,0.156197,0.214223,0.091721,0.121983,0.0,...,0.0,0.157127,0.073242,0.0,0.407384,0.0,0.376058,0.0,0.364981,0.0


In [13]:
print(df_mv_tfidf.columns)

Index(['id', 'bye', 'check', 'day', 'did', 'enjoy', 'good', 'great', 'haven',
       'heard', 'hello', 'help', 'hi', 'hope', 'kind', 'know', 'like', 'liked',
       'll', 'looking', 'love', 'loved', 'movie', 'movies', 'really',
       'recommend', 'saw', 'seen', 'suggestions', 'thank', 'thanks', 'think',
       've', 'watch', 'yes'],
      dtype='object')


In [14]:
df_mv_tfidf.iloc[0,:]

id                84779
bye            0.198274
check               0.0
day                 0.0
did                 0.0
enjoy          0.244056
good           0.223147
great               0.0
haven               0.0
heard               0.0
hello               0.0
help                0.0
hi              0.18647
hope                0.0
kind            0.21227
know                0.0
like            0.78617
liked               0.0
ll                  0.0
looking         0.17163
love                0.0
loved               0.0
movie           0.27069
movies         0.130485
really              0.0
recommend           0.0
saw                 0.0
seen                0.0
suggestions         0.0
thank               0.0
thanks         0.183523
think               0.0
ve                  0.0
watch               0.0
yes                 0.0
Name: 0, dtype: object

In [15]:
# 데이터 프레임을 변경해야 될 것 같음, 지금 데이터 프레임을 transpose하면 안된다
# https://techblog-history-younghunjo1.tistory.com/116
# 위 블로그를 참고해서 변경하자
# 우선 영화별로 tf-id 그룹핑
df_grouped = df_mv_tfidf.groupby('id').mean()
df_grouped

Unnamed: 0_level_0,bye,check,day,did,enjoy,good,great,haven,heard,hello,...,recommend,saw,seen,suggestions,thank,thanks,think,ve,watch,yes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0104807,0.166044,0.000000,0.000000,0.408596,0.000000,0.280311,0.000000,0.000000,0.195814,0.000000,...,0.000000,0.205600,0.000000,0.179335,0.000000,0.461071,0.328047,0.000000,0.000000,0.170997
1000,0.024181,0.212646,0.079637,0.029752,0.000000,0.160218,0.093477,0.119078,0.123998,0.125702,...,0.028736,0.029941,0.306244,0.000000,0.169466,0.000000,0.090161,0.091957,0.115915,0.241468
100026,0.151635,0.149872,0.000000,0.186570,0.000000,0.426643,0.109602,0.291527,0.178821,0.153871,...,0.000000,0.375517,0.175042,0.163773,0.162268,0.000000,0.000000,0.000000,0.290756,0.000000
100030,0.055407,0.057767,0.099412,0.066266,0.047732,0.313640,0.139210,0.095911,0.064867,0.058756,...,0.068676,0.036386,0.183518,0.100501,0.057128,0.074413,0.057178,0.022839,0.096594,0.071554
100043,0.135243,0.000000,0.108377,0.144812,0.054967,0.387337,0.220982,0.051973,0.105325,0.082373,...,0.053068,0.122241,0.235002,0.000000,0.057858,0.083847,0.080188,0.041478,0.051835,0.093289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99910,0.068484,0.077972,0.078895,0.066677,0.028353,0.233398,0.120542,0.083755,0.059995,0.105850,...,0.062958,0.046926,0.224293,0.071120,0.047272,0.093565,0.106648,0.084666,0.075178,0.082360
99955,0.174969,0.180397,0.192081,0.000000,0.000000,0.205415,0.000000,0.000000,0.000000,0.000000,...,0.216902,0.000000,0.206335,0.000000,0.000000,0.161952,0.000000,0.000000,0.000000,0.000000
99966,0.103250,0.000000,0.000000,0.000000,0.000000,0.228472,0.218835,0.097012,0.000000,0.000000,...,0.000000,0.000000,0.116499,0.111514,0.110489,0.000000,0.000000,0.000000,0.096756,0.103931
99975,0.295107,0.145838,0.000000,0.000000,0.000000,0.415159,0.106651,0.283680,0.348016,0.000000,...,0.175350,0.000000,0.170330,0.000000,0.000000,0.136575,0.145758,0.000000,0.000000,0.000000


row가 6311개로 감소한 것을 확인 할 수 있습니다. 각 컬럼 값은 각 평균 값으로 합쳐졌습니다.

In [16]:
favor_col = ['enjoy', 'good', 'great','like', 'liked','love', 'loved','recommend','suggestions']

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

word_sim = cosine_similarity(df_grouped, df_grouped)
df_word_sim = pd.DataFrame(word_sim, index = df_grouped.index, columns=df_grouped.index)
df_word_sim

id,0104807,1000,100026,100030,100043,100070,100074,100106,100165,100178,...,99809,99812,99824,99887,99896,99910,99955,99966,99975,99998
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0104807,1.000000,0.490322,0.505161,0.588107,0.559630,0.597002,0.245053,0.565411,0.234531,0.531345,...,0.551106,0.195945,0.414124,0.416001,0.643700,0.644824,0.375656,0.247013,0.549524,0.520652
1000,0.490322,1.000000,0.514527,0.757324,0.604548,0.805039,0.486424,0.604776,0.440788,0.631587,...,0.713208,0.579117,0.498381,0.599368,0.840864,0.876839,0.647498,0.455872,0.604381,0.545038
100026,0.505161,0.514527,1.000000,0.706399,0.703771,0.627103,0.422338,0.704274,0.396074,0.458058,...,0.639047,0.455666,0.355907,0.680402,0.662055,0.657381,0.377880,0.419437,0.647451,0.527934
100030,0.588107,0.757324,0.706399,1.000000,0.875844,0.881791,0.589227,0.699875,0.541173,0.772845,...,0.820708,0.597180,0.687088,0.773864,0.918302,0.946464,0.665634,0.713804,0.773232,0.558631
100043,0.559630,0.604548,0.703771,0.875844,1.000000,0.704115,0.452371,0.675797,0.579099,0.683312,...,0.733105,0.515025,0.519787,0.684666,0.804357,0.825630,0.547735,0.625198,0.695827,0.479755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99910,0.644824,0.876839,0.657381,0.946464,0.825630,0.913149,0.577033,0.723025,0.538627,0.761116,...,0.839207,0.586364,0.677236,0.724868,0.962434,1.000000,0.709384,0.628625,0.764047,0.558634
99955,0.375656,0.647498,0.377880,0.665634,0.547735,0.682649,0.445091,0.566404,0.382931,0.580718,...,0.670102,0.437906,0.533221,0.535804,0.733398,0.709384,1.000000,0.415360,0.654425,0.345944
99966,0.247013,0.455872,0.419437,0.713804,0.625198,0.542590,0.277880,0.471678,0.374122,0.802528,...,0.492426,0.294270,0.639714,0.735600,0.660884,0.628625,0.415360,1.000000,0.593664,0.299888
99975,0.549524,0.604381,0.647451,0.773232,0.695827,0.705064,0.437532,0.729970,0.377786,0.610428,...,0.680450,0.494739,0.522480,0.634413,0.757762,0.764047,0.654425,0.593664,1.000000,0.383732


영화들간의 코사인 유사도를 계산하였습니다.

In [18]:
def predict_rating(grouped_arr, word_sim_arr):
    ratings_pred = grouped_arr.dot(word_sim_arr) / np.array([np.abs(word_sim_arr).sum(axis=1)])
    return ratings_pred

word_pred = predict_rating(df_grouped.transpose().values, df_word_sim.values)
df_word_pred = pd.DataFrame(word_pred, index = df_grouped.transpose().index, columns=df_grouped.transpose().columns)
df_word_pred

id,0104807,1000,100026,100030,100043,100070,100074,100106,100165,100178,...,99809,99812,99824,99887,99896,99910,99955,99966,99975,99998
bye,0.083689,0.07991,0.082787,0.080793,0.082266,0.081221,0.079191,0.088504,0.079324,0.082861,...,0.083496,0.082364,0.08409,0.080324,0.081488,0.081148,0.083424,0.083187,0.084586,0.08322
check,0.084713,0.090644,0.088666,0.086869,0.085038,0.087435,0.091634,0.08801,0.085868,0.089288,...,0.086496,0.091117,0.086218,0.086828,0.086211,0.087545,0.091965,0.083624,0.088935,0.08373
day,0.063981,0.066039,0.06458,0.067171,0.067424,0.066501,0.073608,0.06519,0.067399,0.068081,...,0.0658,0.069392,0.064942,0.064707,0.066478,0.066588,0.067914,0.066502,0.064624,0.068875
did,0.06901,0.059463,0.062672,0.060097,0.061466,0.060028,0.057763,0.061228,0.057161,0.061849,...,0.059801,0.058177,0.058085,0.059492,0.060502,0.060208,0.057867,0.058221,0.058736,0.062416
enjoy,0.056569,0.05639,0.056095,0.057481,0.057828,0.057457,0.065243,0.057168,0.057072,0.056873,...,0.059388,0.061095,0.063313,0.05634,0.057703,0.057127,0.056057,0.05665,0.05614,0.056999
good,0.241609,0.234526,0.244435,0.240639,0.242084,0.237324,0.231583,0.239703,0.230516,0.232578,...,0.23705,0.22867,0.239231,0.244478,0.236255,0.237589,0.238119,0.236497,0.241506,0.232954
great,0.137276,0.140868,0.141685,0.143314,0.145994,0.14205,0.15328,0.144915,0.162648,0.146934,...,0.141895,0.149373,0.138873,0.13809,0.143143,0.142364,0.138991,0.146593,0.140049,0.147788
haven,0.085307,0.091002,0.093368,0.08908,0.087781,0.087026,0.096032,0.090301,0.092396,0.086839,...,0.086367,0.098713,0.087703,0.088089,0.088367,0.089353,0.087935,0.086964,0.092091,0.084955
heard,0.066156,0.064463,0.066382,0.06357,0.064001,0.063581,0.062324,0.067102,0.06179,0.062975,...,0.063508,0.066844,0.062077,0.062604,0.062917,0.063392,0.061972,0.061388,0.067995,0.069783
hello,0.07178,0.07487,0.075675,0.073843,0.074234,0.074771,0.076971,0.073423,0.073849,0.073864,...,0.073567,0.074179,0.074357,0.072527,0.073683,0.07453,0.07312,0.072764,0.072745,0.073983


구한 코사인 유사도와 df_grouped데이터를 내적하여서, 단어와 영화 사이 예측 값을 구하였습니다.

In [19]:
# 위형식을 가지고, 데이터를 살짝 변경해서 만들어보면 될듯?
df_word_pred.transpose()

Unnamed: 0_level_0,bye,check,day,did,enjoy,good,great,haven,heard,hello,...,recommend,saw,seen,suggestions,thank,thanks,think,ve,watch,yes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0104807,0.083689,0.084713,0.063981,0.069010,0.056569,0.241609,0.137276,0.085307,0.066156,0.071780,...,0.063207,0.061073,0.200570,0.069231,0.068285,0.092376,0.099975,0.073811,0.091920,0.081464
1000,0.079910,0.090644,0.066039,0.059463,0.056390,0.234526,0.140868,0.091002,0.064463,0.074870,...,0.064479,0.054875,0.209690,0.065972,0.073163,0.084424,0.093525,0.076842,0.094397,0.081432
100026,0.082787,0.088666,0.064580,0.062672,0.056095,0.244435,0.141685,0.093368,0.066382,0.075675,...,0.064442,0.063082,0.205850,0.069209,0.074600,0.084489,0.091946,0.073832,0.099178,0.078356
100030,0.080793,0.086869,0.067171,0.060097,0.057481,0.240639,0.143314,0.089080,0.063570,0.073843,...,0.065154,0.055555,0.206133,0.067995,0.071646,0.086051,0.093073,0.074381,0.094663,0.079103
100043,0.082266,0.085038,0.067424,0.061466,0.057828,0.242084,0.145994,0.087781,0.064001,0.074234,...,0.064895,0.057539,0.206540,0.066600,0.072001,0.085936,0.093730,0.074623,0.093892,0.080106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99910,0.081148,0.087545,0.066588,0.060208,0.057127,0.237589,0.142364,0.089353,0.063392,0.074530,...,0.065085,0.055600,0.207858,0.067316,0.071182,0.086451,0.094304,0.076322,0.094059,0.079098
99955,0.083424,0.091965,0.067914,0.057867,0.056057,0.238119,0.138991,0.087935,0.061972,0.073120,...,0.070511,0.054224,0.206187,0.065754,0.069530,0.088838,0.091256,0.074535,0.091990,0.076338
99966,0.083187,0.083624,0.066502,0.058221,0.056650,0.236497,0.146593,0.086964,0.061388,0.072764,...,0.064351,0.053604,0.203577,0.068595,0.073208,0.084340,0.090837,0.072069,0.096035,0.080115
99975,0.084586,0.088935,0.064624,0.058736,0.056140,0.241506,0.140049,0.092091,0.067995,0.072745,...,0.067786,0.055222,0.205796,0.066765,0.070461,0.086949,0.093875,0.074300,0.091927,0.076961


transpose를 취하면 다음과 같은 데이터프레임을 얻을수 있습니다.

그 다음, 영화를 입력 받으면, 위 데이터프레임과의 유사도를 확인한 다음 상위 5개를 출력하는 방식으로 구현하였는데, 이렇게 하는게 맞을까요???


# Cosine Simliarity
Compute the similarity between words for recommendation

In [20]:
from numpy.linalg import norm

Recommendation function
* param:
    * data: array, vector space of texts.
    * mv: target movie's index
    * length: maximum length of recommendation
        * default: 5
    * simf: consine similarity function
        * default: dot(X, y) / (normalize(X) * normalize(Y) + 1e-7)

In [21]:
# Note: the consine similarity function's denominator has 1e-7 minimum value to avoid the divbyzero.
def recommend(data, mv, length=5, simf=lambda X, Y: np.dot(X,Y)/((norm(X)*norm(Y)) + 1e-7)):
    sim = []

    if df.loc[df['movieid'] == mv].empty:
        return sim
    
    idx = df[df['movieid'] == mv].index.values[0]

    for i in range(len(data)):
        if idx != i:
            sim.append((simf(data[i], data[idx]), df.loc[i]['movieid']))
    
    sim.sort()
    sim.reverse()
    return sim[:length]

# Example
Try a recommendation with movie id 80067 (= Toy Story (1995))

In [22]:
pd.DataFrame(recommend(tfidf_mat, '80067', 10), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,1.0,81792
1,1.0,182731
2,1.0,154844
3,1.0,133249
4,0.773555,81385
5,0.773555,2000
6,0.773555,199831
7,0.773555,199831
8,0.773555,189328
9,0.773555,177112
