## Importing the libraries

In [1]:
## importing important libraries
import wikipedia
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Checking the validity of the query

In [2]:
## This function is written to check the validity of the query, i.e., if this query is a valid movie query or not.
## It is assumed that every movie query has a plot section in it.
def link_check(name):
    try:
        film_page = wikipedia.page('{}'.format(name))
        film_plot = film_page.section('Plot')
        if film_plot == None:
            return False
        else:
            return True
    except:
        return False

In [3]:
## This block can be used to check if any query is a valid movie query or not.
## This block is for debugging purpose.
film_check = link_check('the battle of algiers 1967 film')
print(film_check)

False


## Data Extraction

In [4]:
## This block will will fetch us the data for a valid movie query

def film_data_process(film):
    '''
    Returned data will be in a list which will have four features as asked i.e. Movie title,Released date, Star cast and plot 

    '''
    ## As the sanctity of the movie query is already tested, so we are initiating the release date and star cast variable
    film_release = ' '
    film_cast = ' '
    film_page = wikipedia.page('{}'.format(film))
    film_title = film_page.title
    film_url = film_page.url
    film_df = pd.read_html('{}'.format(film_url))
    ## Below line will fetch info box of the wikipedia page from which we will fetch us the star cast and release date 
    film_info = film_df[0]
    col = film_info.columns
    for i in range(len(film_info)):
        if film_info[col[0]].iloc[i] == 'Starring':
            film_cast = film_info[col[1]].iloc[i]
        elif film_info[col[0]].iloc[i] == 'Release date':
            film_release = film_info[col[1]].iloc[i]
        else:
            continue

    film_plot = film_page.section('Plot')
    film_data_list = [film_title,film_release,film_cast,film_plot]
            
    return film_data_list

Star cast can also be extracted from the cast section of the page. But will raise another network query and will slow down the program.

In [5]:
## This code block is written for debugging purpose to fetch the data for a single query.
data = film_data_process('All About Eve 1950 film')
for i in data:
    print(i)
    print(type(i))
print(type(data))

All About Eve
<class 'str'>
October 13, 1950
<class 'str'>
Bette Davis Anne Baxter George Sanders Celeste Holm
<class 'str'>
Margo Channing (Bette Davis) is one of the biggest stars on Broadway. But having just turned forty she is worried about what her advancing age will mean for her career. After a performance of Margo's latest play, Aged in Wood, Margo's close friend Karen Richards (Celeste Holm), wife of the play's author Lloyd Richards (Hugh Marlowe), brings in a besotted fan, Eve Harrington (Anne Baxter), to meet Margo. Eve tells the group gathered in Margo's dressing room—Karen, Lloyd, Margo's boyfriend Bill Sampson (Gary Merrill), a director who is eight years her junior, and Margo's maid Birdie (Thelma Ritter)—that she followed Margo's last theatrical tour to New York City after seeing her perform in San Francisco. She tells an engrossing story of growing up poor in Wisconsin and losing her young husband Eddie in the South Pacific during World War II. Moved, Margo quickly befr

## Construction of list of the movies

Top  1000 movies of all time are taken from the link http://www.films101.com/t1000r.htm 

In [6]:
## Construction of a dataframe from the given website

movies_list_df_1 = pd.read_html('http://www.films101.com/t1000r.htm')
movies_list_df_2 = pd.read_html('http://www.films101.com/t1000r2.htm')
movies_list_df_3 = pd.read_html('http://www.films101.com/t1000r3.htm')

In [7]:

## This function will return the movie list from the df passed in a list format.
## This function will vary based on the nature of the dataframe passed.

def list_construction(mov_list):
    movie_list = []
    len_list = len(mov_list)
    ## i represents every alternate odd column of the dataframe consists the desired list whose entries are also dataframe.
    for i in range(1,len_list,2):
        ## 1st column represnts the movie name
        ## 2nd column represnts the release year
        ## j represents the row.
        for j in range(len(mov_list[i])):
            name = mov_list[i][1][j] + ' ' + str(mov_list[i][2][j]) +' film'
            movie_list.append(name)
    return movie_list



In [8]:
## converting the dataframe into the list of movies contained in that page.
list_movie_1  = list_construction(movies_list_df_1)
list_movie_2  = list_construction(movies_list_df_2)
list_movie_3  = list_construction(movies_list_df_3)

In [9]:
## function of appending the each entry of movie list passed to form a single list rather than list of list
list_movie = []
def movie_append(mov_list):
    for i in range(len(mov_list)):
        list_movie.append(mov_list[i])

In [10]:
movie_append(list_movie_1)
movie_append(list_movie_2)
movie_append(list_movie_3)

In [11]:
len(list_movie)

1105

In [12]:
## for debugging purpose
## This will fetch us the list of the index and the movie name at that index for any given range(can be used to print complete movie range)
## This will also help us to get the final movie list with valid movie query
ind = 0
for i in range(10):
    if link_check(list_movie[i]):
        ind = ind + 1
        print('The film "{}" has index:{}'.format(list_movie[i],ind))
    else:
        continue

The film "Citizen Kane 1941 film" has index:1
The film "2001: A Space Odyssey 1968 film" has index:2
The film "The Rules of the Game 1939 film" has index:3
The film "Bicycle Thieves 1948 film" has index:4
The film "Vertigo 1958 film" has index:5
The film "The Godfather 1972 film" has index:6
The film "Seven Samurai 1954 film" has index:7
The film "Psycho 1960 film" has index:8
The film "Singin' in the Rain 1952 film" has index:9
The film "The Searchers 1956 film" has index:10


In [13]:
## Construction of the film_df with all the desired features
film_df = pd.DataFrame(data=None, columns='Movie_Title Release_date Movie_cast Movie_plot'.split())
for i in range(len(list_movie)):
    print(i) ## To track the progress
    if link_check('{}'.format(list_movie[i])):
        film_data_cache = film_data_process('{}'.format(list_movie[i]))
        film_df_2 = pd.DataFrame(data = [film_data_cache], columns='Movie_Title Release_date Movie_cast Movie_plot'.split())
        film_df = pd.concat([film_df,film_df_2]) 
    else:
        continue

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103




  lis = BeautifulSoup(html).find_all('li')


104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353


In [14]:
film_df.head()

Unnamed: 0,Movie_Title,Release_date,Movie_cast,Movie_plot
0,Citizen Kane,"May 1, 1941Palace Theatre) September 5, 1941",Orson Welles Joseph Cotten Dorothy Comingore E...,"In a mansion called Xanadu, part of a vast pal..."
0,2001: A Space Odyssey (film),2 April 1968Uptown Theater) 3 April 1968 15 Ma...,Keir Dullea Gary Lockwood,"In the prehistoric African veldt, a tribe of h..."
0,The Rules of the Game,7 July 1939 (Paris),Nora GregorPaulette DubostMarcel DalioRoland T...,Aviator André Jurieux (Roland Toutain) lands a...
0,Bicycle Thieves,24 November 1948,Enzo Staiola Lamberto Maggiorani,In the post-World War II Val Melaina neighbour...
0,Vertigo (film),"May 9, 1958",James Stewart Kim Novak Barbara Bel Geddes Tom...,"After a rooftop chase, where a fellow policema..."


In [15]:
film_df.reset_index(drop = True, inplace = True)
film_df.head()

Unnamed: 0,Movie_Title,Release_date,Movie_cast,Movie_plot
0,Citizen Kane,"May 1, 1941Palace Theatre) September 5, 1941",Orson Welles Joseph Cotten Dorothy Comingore E...,"In a mansion called Xanadu, part of a vast pal..."
1,2001: A Space Odyssey (film),2 April 1968Uptown Theater) 3 April 1968 15 Ma...,Keir Dullea Gary Lockwood,"In the prehistoric African veldt, a tribe of h..."
2,The Rules of the Game,7 July 1939 (Paris),Nora GregorPaulette DubostMarcel DalioRoland T...,Aviator André Jurieux (Roland Toutain) lands a...
3,Bicycle Thieves,24 November 1948,Enzo Staiola Lamberto Maggiorani,In the post-World War II Val Melaina neighbour...
4,Vertigo (film),"May 9, 1958",James Stewart Kim Novak Barbara Bel Geddes Tom...,"After a rooftop chase, where a fellow policema..."


In [16]:
film_df.to_csv('test_data.csv')