# Redial Parser
A separated library for parsing the redial dataset

In [1]:
!mkdir redial_parser

���� ���͸� �Ǵ� ���� redial_parser��(��) �̹� �ֽ��ϴ�.


class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [2]:
%%writefile redial_parser/__init__.py
# Note: Write down the parser class in the separated file, redial_parser.
# Since we want the word2vec notebook to contain the context of parser, we put the writefile to do that.

from copy import deepcopy

import json
import pandas


def load_data(path):
    """
    TODO: initialization function for dataset reads

        :arg
            path (str): Dataset path.
        :return
            tuple: (train, test, df_mention)
    """
    train_data = []
    for line in open(f"{path}/train_data.jsonl", "r"):
        train_data.append(json.loads(line))

    test_data = []
    for line in open(f"{path}/test_data.jsonl", "r"):
        test_data.append(json.loads(line))

    mention_dataframe = pandas.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe



class RedialParser:
    def __init__(self, path):
        self.train, self.test, self.movie = load_data(path)

        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        self.__model = None

    def Restore(self):
        """
        TODO: Restore train, test, and movie dataset to initial state
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)

    def Movies(self, train=True) -> dict:
        """
        TODO: Get movie list in dataset

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}

        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result

    def describe(self):
        """
        TODO: Describe its datasets
        """
        len1, len2 = len(self.train), len(self.test)
        n1, n2 = 0, 0
        m1, m2 = 0, 0

        for e in self.train:
            n1 += len(e['movieMentions'])
            m1 += len(e['messages'])
        for e in self.test:
            n2 += len(e['movieMentions'])
            m2 += len(e['messages'])

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')


Overwriting redial_parser/__init__.py


# NLTK - Natural Language toolkit

In [3]:
!python -m pip install nltk



Import a library for NLP.
- download('all'): Clean all the meaningless words (punch, stopwords, etc.)

In [4]:
import nltk
nltk.download('all')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    | 

True

In [5]:
from redial_parser import RedialParser

# Preprocessing
Clear the special character and extract the text and movie indices
- example: "I like animations like @84779 and @191602" → [I like animations like 84779 and 191602], [84779, 191602]

In [6]:
# Special character removal
import re

compile = re.compile("\W+")
line3 = []  # line3: list of whole lines (global line data)
name = []  # list of all movie names (global movie name data)

parser = RedialParser('../dataset')
parser.describe()

num = len(parser.train)
print(f'num: {num}')

# Note: limited the size of list to 10 because of the size error
#maximum_num = num
maximum_num = 10

movienum = [[] for _ in range(maximum_num)]  # list of movie names (local movie name data)
line2 = [[] for _ in range(maximum_num)]  # list of lines (local line data)

for i in range(maximum_num):
    for msg in parser.train[i]['messages']:  # append line to the lists
        line2[i].append(msg["text"])
        line3.append(msg["text"])

    for idx, line in enumerate(line2[i]):
      a = compile.sub(" ", line)  # Clear special character
      line = line2[i][idx] = a.lower()  # lower character
      
      numbers = re.findall(r'\d+', line)
      for k in range(len(numbers)):
        if len(numbers[k]) >= 4:
          movienum[i].append(numbers[k])
          name.append(numbers[k])

Brief information:
Length of train data: 10006
Length of test data: 1342

Data information:
Key parameters: ['movieMentions', 'respondentQuestions', 'messages', 'conversationId', 'respondentWorkerId', 'initiatorWorkerId', 'initiatorQuestions']
Key parameters in Questions: ['suggested', 'seen', 'liked']
Key parameters in messages: ['timeOffset', 'text', 'senderWorkerId', 'messageId']

Context information:
Total mentioned movie number (train): 52918
Total mentioned movie number in unique (train): 6223
Total message number (train): 182150
Total mentioned movie number (test): 7154
Total mentioned movie number in unique (test): 2007
Total message number (test): 23952
Average mentioned movie numbers per conversation (train): 5.288626823905656
Average message numbers per conversation (train): 18.20407755346792
Average mentioned movie numbers per conversation (test): 5.330849478390462
Average message numbers per conversation (test): 17.847988077496275

num: 10006


Tokenization: Extract the terms related to the movies

1. Remove stopwords

In [7]:
# Library import
import nltk
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

2. Construct the word frequency matrix

In [8]:
stop_word_eng = set(stopwords.words('english'))

for i in range(maximum_num):
  line2[i] = [j for j in line2[i] if j not in stop_word_eng]  # Clear stopwords

# Lemmatizer class
lemmatizer = WordNetLemmatizer()
token = RegexpTokenizer('[\w]+')

mv_tags = ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']
x = pd.DataFrame(columns=['id'] + mv_tags)

for i in range(maximum_num):
  result_pre_lem = [token.tokenize(j) for j in line2[i]]
  middle_pre_lem = [r for j in result_pre_lem for r in j]
  final_lem = [lemmatizer.lemmatize(j) for j in middle_pre_lem if not j in stop_word_eng]  # Remove stopword

  # Lemmatization
  english = pd.Series(final_lem)
  for j in english:
    if j in mv_tags:
      for k in movienum[i]:
        if x[x['id'] == k].empty:
          new_row = pd.DataFrame({'id': [k]}, columns=x.columns)
          x = pd.concat([x, new_row], ignore_index=True)
          x.fillna(0, inplace=True)
          x.loc[x['id'] == k, j] += 1

x

Unnamed: 0,id,comedy,scary,love,animation,artistic,war,sci,blood,hero,romantic,action
0,84779,0,0,0,1,0,0,0,0,0,0,0
1,191602,0,0,0,1,0,0,0,0,0,0,0
2,122159,0,0,0,1,0,0,0,0,0,0,0
3,165710,0,0,0,1,0,0,0,0,0,0,0
4,151313,0,0,0,1,0,0,0,0,0,0,0
5,203371,0,0,0,1,0,0,0,0,0,0,0
6,196336,0,0,0,0,0,0,1,0,0,0,0
7,114851,0,0,0,0,0,0,1,0,0,0,0
8,204292,0,0,0,0,0,0,1,0,0,0,0
9,143189,0,0,0,0,0,0,1,0,0,0,0


# TF-IDF
Calculate the importance of terms in documents

1. Construct dataframe with [movieid - all terms in dialog] relationship

In [9]:
df = pd.DataFrame(columns=["movieid", "dialog"])

for i in range(maximum_num):
  dig = ''
  for line in line2[i]:  # concatenate all sentences in related message dialog
    dig += str(line)
  
  for mv in movienum[i]:
    newrow = pd.DataFrame({'movieid': [mv], 'dialog': [dig]}, columns=df.columns)
    df = pd.concat([df, newrow], ignore_index=True)
    
df

Unnamed: 0,movieid,dialog
0,84779,hi there how are you i m looking for movie rec...
1,191602,hi there how are you i m looking for movie rec...
2,122159,hi there how are you i m looking for movie rec...
3,165710,hi there how are you i m looking for movie rec...
4,151313,hi there how are you i m looking for movie rec...
...,...,...
87,97819,hey what kind of movies do you like to watch i...
88,145338,hey what kind of movies do you like to watch i...
89,1960,hey what kind of movies do you like to watch i...
90,145338,hey what kind of movies do you like to watch i...


In [10]:
# Fill NaN with empty sentence
df['dialog'].fillna('', inplace=True)
df['dialog']

0     hi there how are you i m looking for movie rec...
1     hi there how are you i m looking for movie rec...
2     hi there how are you i m looking for movie rec...
3     hi there how are you i m looking for movie rec...
4     hi there how are you i m looking for movie rec...
                            ...                        
87    hey what kind of movies do you like to watch i...
88    hey what kind of movies do you like to watch i...
89    hey what kind of movies do you like to watch i...
90    hey what kind of movies do you like to watch i...
91    hey what kind of movies do you like to watch i...
Name: dialog, Length: 92, dtype: object

2. Make a TF-IDFs matrix

In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Vectorizer class
tfidf = TfidfVectorizer(stop_words='english')  # Ignore English Stopwords

# Obtain matrix
tfidf_mat = tfidf.fit_transform(df['dialog']).toarray()

# Construct dataset with id + word vectors
cdata = np.concatenate((df['movieid'].to_numpy().reshape(len(df['dialog']), 1), tfidf_mat), axis=1)
df_mv_tfidf = pd.DataFrame(cdata, columns=['id'] + tfidf.get_feature_names_out().tolist())


# Cosine Simliarity
Compute the similarity between words for recommendation

In [13]:
from numpy.linalg import norm

Recommendation function
* param:
    * data: array, vector space of texts.
    * mv: target movie's index
    * length: maximum length of recommendation
        * default: 5
    * simf: consine similarity function
        * default: dot(X, y) / (normalize(X) * normalize(Y) + 1e-7)

In [14]:
# Note: the consine similarity function's denominator has 1e-7 minimum value to avoid the divbyzero.
def recommend(data, mv, length=5, simf=lambda X, Y: np.dot(X,Y)/((norm(X)*norm(Y)) + 1e-7)):
    sim = []

    if df.loc[df['movieid'] == mv].empty:
        return sim
    
    idx = df[df['movieid'] == mv].index.values[0]

    for i in range(len(data)):
        if idx != i:
            sim.append((simf(data[i], data[idx]), df.loc[i]['movieid']))
    
    sim.sort()
    sim.reverse()
    return sim[:length]

# Example
Try a recommendation with movie id 79320 (= Contact  (2009))

In [15]:
pd.DataFrame(recommend(tfidf_mat, '79320', 10), columns=['Similarity', 'Movie Index'])

Unnamed: 0,Similarity,Movie Index
0,1.0,90950
1,1.0,84001
2,1.0,204292
3,1.0,204292
4,1.0,204292
5,1.0,163606
6,1.0,155969
7,1.0,147598
8,1.0,147598
9,0.308313,204322
