# 抓取 Marvel电影清单

  https://www.imdb.com/list/ls071217506/

In [3]:
import requests
from bs4 import BeautifulSoup

In [9]:
res = requests.get('https://www.imdb.com/list/ls071217506/')
soup = BeautifulSoup(res.text, 'lxml')

movies = []
for movie in soup.select('.lister-item-header a'):
    movies.append(movie.get('href').split('/')[2])

In [11]:
movies[:3]

['tt0371746', 'tt0800080', 'tt1228705']

# 抓取评论数据 

In [23]:
def getMovieReviews(movieid):
    reviews = []
    res = requests.get('https://www.imdb.com/title/{}/reviews?spoiler=hide&sort=helpfulnessScore&dir=desc&ratingFilter=10'.format(movieid))
    soup = BeautifulSoup(res.text, 'lxml')

    for review in soup.select('.imdb-user-review'):
        title = review.select_one('.title').text
        author = review.select_one('.display-name-link').text
        dt = review.select_one('.review-date').text
        content = review.select_one('.content .text').text
        reviews.append({'title': title, 'author': author, 'dt': dt, 'content': content})
    return reviews

In [29]:
totalReviews = []

for movieid in movies:
    reviews = getMovieReviews(movieid)
    totalReviews.extend(reviews)
    print(movieid)

tt0371746
tt0800080
tt1228705
tt0800369
tt0458339
tt0848228
tt1300854
tt1981115
tt1843866
tt2015381
tt2395427
tt0478970
tt3498820
tt1211837
tt3896198
tt3501632
tt1825683
tt4154756
tt4154664
tt4154858
tt4154796
tt2364582
tt3475734
tt3322312
tt2357547
tt3322314
tt1452299
tt2071614
tt2011109
tt2247732
tt3067038
tt3438640
tt3591568
tt4128102


In [33]:
import pandas
df = pandas.DataFrame(totalReviews)
df.count()

title      637
author     637
dt         637
content    637
dtype: int64

In [34]:
pandas.DataFrame(df)

Unnamed: 0,title,author,dt,content
0,Delivers Intelligence & Great Acting with its...,sacflyzone,23 April 2008,"Rest assured, Iron Man is an absolutely amazin..."
1,Film That Lives Up To Its Hype And The Expect...,z-mbe,16 April 2008,"As an avid reader of the Iron Man comics, I wa..."
2,"Wow, very impressive !!!!\n",ffiisshh,21 April 2008,WOW WOW WOW.This is the movie I have been wait...
3,A Nutshell Review: Iron Man\n,DICK STEEL,30 April 2008,"With a little tinge of shame and regret, my ra..."
4,Iron-Badass\n,SubZeroMK,31 May 2009,I was hyped for the movie when it was on the i...
...,...,...,...,...
632,Awesome series\n,akogho,1 July 2018,"Great acting, directing, and storyline. All ma..."
633,Great!\n,nmchassykowrld,23 June 2018,Just as good as the first season and that surp...
634,"A++, a brilliant short-movie\n",marto_dd,21 August 2013,Last year I was really pleased with Item 47. I...
635,My favorite Marvel One Shot film\n,Terryfan,5 April 2016,After viewing every Marvel One Shot movie this...


In [35]:
df.head()

Unnamed: 0,title,author,dt,content
0,Delivers Intelligence & Great Acting with its...,sacflyzone,23 April 2008,"Rest assured, Iron Man is an absolutely amazin..."
1,Film That Lives Up To Its Hype And The Expect...,z-mbe,16 April 2008,"As an avid reader of the Iron Man comics, I wa..."
2,"Wow, very impressive !!!!\n",ffiisshh,21 April 2008,WOW WOW WOW.This is the movie I have been wait...
3,A Nutshell Review: Iron Man\n,DICK STEEL,30 April 2008,"With a little tinge of shame and regret, my ra..."
4,Iron-Badass\n,SubZeroMK,31 May 2009,I was hyped for the movie when it was on the i...


In [36]:
df['content'][0]

'Rest assured, Iron Man is an absolutely amazing movie. I won\'t dare spoil any of this remarkable movie for you but I do recommend it as highly as I possibly can. Marvel needed to get in to the solo movie making business long ago. Instead of leasing out their characters to other studios, they\'re making movies themselves. Most everyone knows Iron Man is their first effort and what a great lead off film! This movie helps take the comic book genre to the highest level. Just like they did in the books, they reinvent standard epic adventure by "Marvelizing" characters and making them more believable. The Spider-Man and the X-Men movies did this to a degree but only as far as their respective studios wished to stay true to the source material. Anything added or amended was for the benefit of the live action adaptation. Director Sam Raimi pulled this off by talking to the summer crowd, not down to them with the Spider-Man series. Jon Favreau has done the same thing here but I think he\'s do

# 储存评论数据

In [37]:
df.to_csv('movie_review.csv', encoding = 'utf-8-sig')

# 读取IMDB 文字数据

In [38]:
import keras
import numpy as np
import pandas

Using TensorFlow backend.


In [43]:
df = pandas.read_csv('movie_review.csv')
#df.head()
reviews = ''.join([' <SOR> ' + ele + ' <EOR> ' for ele in df['content'].head(300).tolist()]).lower()
len(reviews)

509975

# 产生训练用序列与目标

In [54]:
# 训练用序列长度
maxlen = 60

# 随机采样序列
step = 2

sentences = []
next_chars = []

for i in range(0, len(reviews) - maxlen, step):
    # 根据步骤， 每次取出 maxlen 的序列作为特征
    sentences.append(reviews[i: i + maxlen])
    # 取出maxlen + 1 的字作为当目标
    next_chars.append(reviews[i + maxlen])

In [55]:
len(sentences)

254958

# one-hot 编码

In [140]:
# 找出独特字元
chars = sorted(list(set(reviews)))
#print(chars)
#chars

In [141]:
# 利用字典映射char 和索引
char_indices = dict((char, chars.index(char)) for char in chars)
#char_indices

In [142]:
# x: 训练样本
x = np.zeros((len(sentences), maxlen, len(chars)), dtype = np.bool)

# y: 目标样本
y = np.zeros((len(sentences), len(chars)), dtype = np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [143]:
x.shape

(254958, 60, 73)

In [144]:
y.shape

(254958, 73)

# 建立 LSTM 模型

In [145]:
from keras import layers

model = keras.models.Sequential()

# 输入为60 * 字串长度
model.add(layers.LSTM(128, return_sequences = True, input_shape=(maxlen, len(chars))))

model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))

# 以字元对应作为输出
model.add(layers.Dense(len(chars), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer = 'adam')

# 训练模型

In [146]:
from keras.callbacks import ModelCheckpoint

In [147]:
# imdb.hdf5 是一种高压缩的文档格式， save_best_only 储存最好的结果
checkpoint = ModelCheckpoint('imdb.hdf5', monitor = 'loss', verbose = 1, save_best_only=True)

In [None]:
history = model.fit(x, y, batch_size=128, epochs = 10, callbacks=[checkpoint])

# 取得字元与索引的对应

In [123]:
char_indices = dict((char, chars.index(char)) for char in chars)

# 读取模型

In [124]:
from keras.models import load_model

In [125]:
model = load_model('imdb.hdf5')

# 随机采样文字

In [126]:
def softmax(x, t):
    return np.exp(x / t) / np.sum(np.exp(x))

x = np.array([0.3, 0.5, 0.8, 0.9])
softmax(x, 1)

array([0.17567768, 0.2145732 , 0.28964352, 0.3201056 ])

In [127]:
def sample(preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    
    # 回传多项式分布的概率
    probas = np.random.multinomial(1, preds, 1)
    # 回传最大Index
    return np.argmax(probas)

# 文本生成 

In [128]:
import sys
def random_reviews(temperature):
    generated_text = 'rest assured, iron man is an absolutely amazing movie. i won'
    generated_text = generated_text[0:60]
    
    for i in range(600):
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(generated_text):
            sampled[0, t, char_indices[char]] = 1
            
        preds = model.predict(sampled, verbose = 0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]
        generated_text += next_char
        generated_text = generated_text[1:]
        
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print(generated_text)

In [139]:
random_reviews(0.5)

thare and you his comperion was with the firaction in a did this movie are it a see the back but the was to make i can for the firmal and and coneing way artore of the marvel man all seilaving in the expectast well of the are to all of the action the alloth it to in a movie i way way it of the film as the movie it not is of the movie all film mowery and the whith are it with a film in the movie fear toues the film are with the movie is a great the avengers of a seruling were the of the avengers and the film that the mestare it i was the beath to the all marvel a roaly the movie it this film tohe beath to the all marvel a roaly the movie it this film to
