# Redial Parser
A separated library for parsing the redial dataset

class **RedialParser**
- Restore(): Restore train, test, and movie dataset to initial state
   * return:
        * None
- Movies(train): Get movie list in dataset
   * param:
        * train (bool): Target dataset, (train=True, test=False, all=None)
   * return:
        * dict: {index, MovieName}
- describe(): Describe its datasets
   * return:
        * None
- train: Train data of ReDial.
- test: Test data of ReDial.
- movie: Movie mention counts for ReDial

In [59]:
!mkdir redial_parser

���� ���͸� �Ǵ� ���� redial_parser��(��) �̹� �ֽ��ϴ�.


In [64]:
%%writefile redial_parser/__init__.py
# Note: Write down the parser class in the separated file, redial_parser.
# Since we want the word2vec notebook to contain the context of parser, we put the writefile to do that.

from copy import deepcopy

import json
import pandas


def load_data(path):
    """
    TODO: initialization function for dataset reads

        :arg
            path (str): Dataset path.
        :return
            tuple: (train, test, df_mention)
    """
    train_data = []
    for line in open(f"{path}/train_data.jsonl", "r"):
        train_data.append(json.loads(line))

    test_data = []
    for line in open(f"{path}/test_data.jsonl", "r"):
        test_data.append(json.loads(line))

    mention_dataframe = pandas.read_csv(f"{path}/movies_with_mentions.csv")

    return train_data, test_data, mention_dataframe



class RedialParser:
    def __init__(self, path):
        self.train, self.test, self.movie = load_data(path)

        self.__train = deepcopy(self.train)
        self.__test = deepcopy(self.test)
        self.__movie = deepcopy(self.movie)

        self.__model = None

    def Restore(self):
        """
        TODO: Restore train, test, and movie dataset to initial state
        """
        self.train = deepcopy(self.__train)
        self.test = deepcopy(self.__test)
        self.movie = deepcopy(self.__movie)

    def Movies(self, train=True) -> dict:
        """
        TODO: Get movie list in dataset

            :arg
                train (bool): Target dataset, (train=True, test=False, all=None)
            :return
                dict: {index, MovieName}
        """
        if train is None:
            result = self.Movies()
            result.update(self.Movies(False))
            return result

        target = None
        if train is True:
            target = self.train
        elif train is False:
            target = self.test

        result = {}

        if target is not None:
            for elem in target:
                result.update(elem['movieMentions'])

        return result

    def describe(self):
        """
        TODO: Describe its datasets
        """
        len1, len2 = len(self.train), len(self.test)
        n1, n2 = 0, 0
        m1, m2 = 0, 0

        for e in self.train:
            n1 += len(e['movieMentions'])
            m1 += len(e['messages'])
        for e in self.test:
            n2 += len(e['movieMentions'])
            m2 += len(e['messages'])

        print('Brief information:\n'
              f'Length of train data: {len1}\n'
              f'Length of test data: {len2}\n\n'
              'Data information:\n'
              f'Key parameters: {list(self.train[0].keys())}\n'
              f'Key parameters in Questions: {list(list(self.train[0]["respondentQuestions"].values())[0].keys())}\n'
              f'Key parameters in messages: {list(self.train[0]["messages"][0].keys())}\n\n'
              'Context information:\n'
              f'Total mentioned movie number (train): {n1}\n'
              f'Total mentioned movie number in unique (train): {len(self.Movies())}\n'
              f'Total message number (train): {m1}\n'
              f'Total mentioned movie number (test): {n2}\n'
              f'Total mentioned movie number in unique (test): {len(self.Movies(False))}\n'
              f'Total message number (test): {m2}\n'
              f'Average mentioned movie numbers per conversation (train): {n1 / len1}\n'
              f'Average message numbers per conversation (train): {m1 / len1}\n'
              f'Average mentioned movie numbers per conversation (test): {n2 / len2}\n'
              f'Average message numbers per conversation (test): {m2 / len2}\n\n'
              , end='')


Overwriting redial_parser/__init__.py


# NLTK - Natural Language toolkit
Import a library for NLP.
- download('all'): Clean all the meaningless words (punch, stopwords, etc.)

In [65]:
!python -m pip install nltk



In [66]:
import nltk
nltk.download('all')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]    | 

True

In [79]:
from redial_parser import RedialParser
print(RedialParser)

<class 'redial_parser.RedialParser'>
<class 'redial_parser.__init__.RedialParser'>


In [75]:
# Special character removal
import re

compile = re.compile("\W+")
line3 = []
name = []

if __name__ == "__main__":
    parser = RedialParser('../dataset')
    parser.describe()

    num=len(parser.train)
    print(num)
    movienum=[[] for _ in range(10)]
    line2 = [[] for _ in range(10)]
    print(line2)
    for i in range(10):
        print(f'Train {i}')
        for msg in parser.train[i]['messages']:
            print(f'{msg["senderWorkerId"]}: {msg["text"]}')
            # print(msg["text"])
            line2[i].append(msg["text"])
            line3.append(msg["text"])
        for j in range(len(line2[i])):
          a = compile.sub(" ",line2[i][j])
          line2[i][j] = a.lower()
          numbers = re.findall(r'\d+', line2[i][j])
          for k in range(len(numbers)):
            if len(numbers[k])>=4:
              movienum[i].append(numbers[k])
              name.append(numbers[k])
            

        print()

print(line2)
print(movienum)

NameError: name 'load_data' is not defined

영화에 해당하는 단어를 설정해서 count합니다.

In [None]:

import nltk
import pandas as pd

from nltk.corpus import stopwords
stop_word_eng = set(stopwords.words('english'))
for i in range(10):
  line2[i] = [j for j in line2[i] if j not in stop_word_eng]
print(line2)


# 표제어 추출 사용 시
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
token = RegexpTokenizer('[\w]+')
x={}
for i in range(10):
  result_pre_lem = [token.tokenize(j) for j in line2[i]]
  middle_pre_lem= [r for j in result_pre_lem for r in j]
  final_lem = [lemmatizer.lemmatize(j) for j in middle_pre_lem if not j in stop_word_eng] # 불용어 제거
  #print(final_lem)

  # 텍스트에서 많이 나온 단어
  # 표제어 추출
  english = pd.Series(final_lem)
  for j in english:
    if j in ['comedy','scary','love','animation','artistic','war','sci','blood','hero','romantic','action']:
      for k in movienum[i]:
        if k not in x:
          x[k]=dict()
          if j not in x[k]:
            x[k][j]=1
          else:
            x[k][j]+=1
        else:
          if j not in x[k]:
            x[k][j]=1
          else:
            x[k][j]+=1
print(x)

[['hi there how are you i m looking for movie recommendations', 'i am doing okay what kind of movies do you like ', 'i like animations like 84779 and 191602', 'i also enjoy 122159', 'anything artistic', 'you might like 165710 that was a good movie ', 'what s it about ', 'it has alec baldwin it is about a baby that works for a company and gets adopted it is very funny', 'that seems like a nice comedy', 'do you have any animated recommendations that are a bit more dramatic like 151313 for example', 'i like comedies but i prefer films with a little more depth', 'that is a tough one but i will remember something', ' 203371 was a good one', 'ooh that seems cool thanks for the input i m ready to submit if you are ', 'it is animated sci fi and has action', 'glad i could help', 'nice', 'take care cheers ', 'bye'], ['hi did you see 196336 ', 'yes it was a pretty good movie ', 'then you would like 114851 if you haven t seen it ', 'you like sci fi stuff like 204292 ', 'i am more of a 143189 kind 

tf-idf를 활용


어떤 단어가 특정 문서 내에서 얼마나 중요한 것인지를 나타내는 통계적 수치 활용

In [None]:
dat=[]
for i in range(len(line2)):
  c=''
  for k in range(len(line2[i])):
    c+=str(line2[i][k])
  for j in range(len(movienum[i])):
    dat.append([movienum[i][j],c])
# df = pd.DataFrame(line3,columns=["dialog"])
# print(df)
print(dat)
df = pd.DataFrame(dat,columns=["movieid","dialog"])
print(df)

10
10
[['84779', 'hi there how are you i m looking for movie recommendationsi am doing okay what kind of movies do you like i like animations like 84779 and 191602i also enjoy 122159anything artisticyou might like 165710 that was a good movie what s it about it has alec baldwin it is about a baby that works for a company and gets adopted it is very funnythat seems like a nice comedydo you have any animated recommendations that are a bit more dramatic like 151313 for examplei like comedies but i prefer films with a little more depththat is a tough one but i will remember something 203371 was a good oneooh that seems cool thanks for the input i m ready to submit if you are it is animated sci fi and has actionglad i could helpnicetake care cheers bye'], ['191602', 'hi there how are you i m looking for movie recommendationsi am doing okay what kind of movies do you like i like animations like 84779 and 191602i also enjoy 122159anything artisticyou might like 165710 that was a good movie wh

In [None]:
import pandas as pd

df['dialog'].isnull().sum() 
df['dialog']=df['dialog'].fillna('')
df['dialog'].isnull().sum() #0 으로 바뀜 내적하면 모두 0 나옴 
df['dialog']

0     hi there how are you i m looking for movie rec...
1     hi there how are you i m looking for movie rec...
2     hi there how are you i m looking for movie rec...
3     hi there how are you i m looking for movie rec...
4     hi there how are you i m looking for movie rec...
                            ...                        
87    hey what kind of movies do you like to watch i...
88    hey what kind of movies do you like to watch i...
89    hey what kind of movies do you like to watch i...
90    hey what kind of movies do you like to watch i...
91    hey what kind of movies do you like to watch i...
Name: dialog, Length: 92, dtype: object

In [None]:
# tfidf 행렬 생성 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(stop_words='english')#불용어 제거 
tfidf_mat=tfidf.fit_transform(df['dialog']).toarray()
print(tfidf_mat)
for i in range(len(tfidf_mat)):
  print(tfidf_mat[i])


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
 0.         0.         0.         0.         0.         0.
 0.22190762 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.06448928 0.         0.         0.
 0.08010722 0.         0.09619251 0.09619251 0.         0.
 0.11095381 0.         0.14607762 0.         0.         0.13608778
 0.         0.         0.         0.07281168 0.         0.
 0.08010722 0.         0.         0.11946788 0.         0.
 0.         0.05422825 0.         0.         0.         0.
 0.11095381 0.         0.         0.06996043 0.11095381 0.11095381
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.06125107 0.
 0.         0.         0.         0.         0.         0.08888693
 0.         0.2912467  0.         0.     

In [None]:
#코사인유사도함수 2 (분모=!0)
import numpy as np
from numpy.linalg import norm
def cos_sim2(X,Y):
    return np.dot(X,Y)/((norm(X)*norm(Y))+1e-7) # 분모가0이 되지 않도록 1e-7 값을 넣어줄 것 

#추천 함수 
def top_match_ar2(data, name, rank=5,simf=cos_sim2):
    sim=[]
    for i in range(len(data)):
        if name != i:
            sim.append((simf(data[i],data[name]),i))
    sim.sort()
    sim.reverse()
    return sim[:rank]

In [None]:
# Toystory와 코사인유사도가 비슷한 순으로 영화 추천 : (영화제목, 코사인유사도) 
movieList = [] 
for sim, movie_id in top_match_ar2(tfidf_mat,20,10):
    movieList.append((sim, df.loc[movie_id,'movieid']))
movieList[:10]

[(0.9999999000000099, '147598'),
 (0.9999999000000099, '147598'),
 (0.9999999000000099, '204292'),
 (0.9999999000000099, '204292'),
 (0.9999999000000099, '204292'),
 (0.9999999000000099, '84001'),
 (0.9999999000000099, '155969'),
 (0.9999999000000099, '90950'),
 (0.9999999000000099, '163606'),
 (0.30831348598192954, '152242')]