In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib as mpl 
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib: use the SimHei font for sans-serif text and disable the unicode
# minus glyph (presumably so the Chinese labels in later charts render).
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Pandas: wider text display, up to 100 columns, HTML rendering of frames.
for option_name, option_value in (
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the scraped movie table. Later cells read its 'Rating', 'Relaese Time',
# 'Making Area' and 'Cumulative_income(ten thousand unit)' columns.
df_movies = pd.read_csv('Scraped_Data.csv')

In [None]:
# Preview the first rows of the loaded table.
df_movies.head()

In [None]:
# Apply seaborn's default styling, then draw the kernel density estimate of
# the 'Rating' column (trailing ';' hides the returned Axes repr).
sns.set()
sns.kdeplot(df_movies['Rating']);

In [None]:
# Scatter of release year (x) against making area (y).
# Original author's note: the plot is meant to show that US films have been
# consistently high quality over the years.
#
# The year is the first four characters of the part of 'Relaese Time' before
# the first '-'. Each year stays paired with its own row's making area while
# sorting; sorting the years on their own would attach them to the wrong films.
year_area_pairs = sorted(
    (
        (release.split('-')[0][:4], area)
        for release, area in zip(df_movies['Relaese Time'], df_movies['Making Area'])
    ),
    key=lambda pair: pair[0],  # order by year only; ties keep table order
)
clean_time = [year for year, _ in year_area_pairs]
area_by_year = [area for _, area in year_area_pairs]

plt.figure(figsize=(20,10))
plt.xticks(rotation = 300)
plt.scatter(clean_time, area_by_year)
ax = plt.gca()
ax.invert_yaxis()

sns.despine()

In [None]:
# Keep the 'Making Area' column under a short name; a later cell iterates
# over it to tally films per area. The bare name displays the column.
film_area = df_movies['Making Area']
film_area

In [None]:
# Tally how many films list each area. Every entry of film_area is a
# comma-separated string, so a co-production counts once for each area in it.
country_dic = {}
for area_field in film_area:
    for area in area_field.split(','):
        country_dic[area] = country_dic.get(area, 0) + 1

# (area, count) pairs ordered from the fewest films to the most.
country_sourted = sorted(country_dic.items(), key=lambda pair: pair[1])
country_sourted

In [None]:
# Hand-entered pie-chart data: pie_x_label holds the wedge sizes and
# pie_y_label the matching wedge names (the last one, '其他国家', is the
# "other countries" bucket).
pie_x_label = [58, 15, 10, 9, 8, 8, 7, 29]
pie_y_label = ['美国', '英国', '日本', '中国大陆', '法国', '德国', '意大利', '其他国家']

# Split the sorted (area, count) pairs into parallel lists for the bar chart.
bar_x_label = [area for area, _ in country_sourted]
bar_y_label = [count for _, count in country_sourted]

In [None]:
# Two-column table (area name, film count) built from the bar-chart lists.
bar_df_prepare = dict(country=bar_x_label, counts=bar_y_label)
bar_df = pd.DataFrame(bar_df_prepare, columns=['country', 'counts'])
bar_df.head()

In [None]:
# Pie chart: the share of top-100 films that were made in the US.
plt.axes(aspect='equal')
explode = [0.1] + [0] * 7  # pull only the first wedge out of the pie
plt.pie(
    x=pie_x_label,
    labels=pie_y_label,
    explode=explode,
    radius=2,
    autopct='%.f%%',
    textprops={'fontsize': 20, 'color': 'black'},
)
plt.show()

In [None]:
# Bar chart: how many of the top-100 films were made in each area
# (original note: it shows how many of them are US films).
x = bar_x_label
y = bar_y_label
plt.figure(figsize=(15, 5))
ax = plt.gca()
draw = ax.bar(x, y, label=str(y))
ax.set_title('Movie  Counts  Making  in  Different  Area', fontsize=15)
# Write each count just above its bar.
for area, count in zip(x, y):
    ax.text(area, count + 1, count, ha='center', va='bottom')
ax.set_xlim((0, 10))
ax.set_ylim((0, 65))
ax.grid(axis='y', color='white', linestyle='-')
ax.set_xticks(range(-1, len(x) + 1))
ax.set_ylabel('Counts', fontsize=15, labelpad=20)
sns.despine()

In [None]:
# Map every making area to the row labels of the films produced there.
# 'Making Area' is a comma-separated string, so a co-production is listed
# under each of its areas. Every row of the table is scanned rather than a
# fixed count of 100, matching the per-area tally that also reads all rows.
index_area_dic = {}
for row_label, area_field in df_movies['Making Area'].items():
    for area in area_field.split(','):
        index_area_dic.setdefault(area, []).append(row_label)

# Average cumulative income per area, truncated to an int. Films whose income
# is missing are left out of the mean; an area with no income data at all
# gets an average of 0.
income_by_row = df_movies['Cumulative_income(ten thousand unit)']
av_income_dic = {}
for area, row_labels in index_area_dic.items():
    known_incomes = [
        income_by_row[row_label]
        for row_label in row_labels
        if not pd.isna(income_by_row[row_label])
    ]
    if known_incomes:
        av_income_dic[area] = int(sum(known_incomes) / len(known_incomes))
    else:
        av_income_dic[area] = 0

# Areas ordered from the highest average to the lowest, split into parallel
# lists for the bar chart.
av_income_sorted = sorted(av_income_dic.items(), key=lambda pair: pair[1], reverse=True)
av_income_x = [area for area, _ in av_income_sorted]
av_income_y = [average for _, average in av_income_sorted]
av_income_y

In [None]:
# Bar chart: average box office of each area.
x = av_income_x
y = av_income_y
plt.figure(figsize=(15, 5))
ax = plt.gca()
draw = ax.bar(x, y, label=str(y))
ax.set_title('Average Box Office', fontsize=15)
# Write each average just above its bar.
for area, average in zip(x, y):
    ax.text(area, average + 1, average, ha='center', va='bottom')
ax.grid(axis='y', color='white', linestyle='-')
ax.set_xticks(range(-1, len(x) + 1))
sns.despine()

In [None]:
# Box-office series for films made in the US ('美国'): release year against
# cumulative income, restricted to films that have an income figure.
income_by_row = df_movies['Cumulative_income(ten thousand unit)']
release_by_row = df_movies['Relaese Time']

# Row labels of US films whose income is not missing.
comparison_US_index = [
    index for index in index_area_dic['美国']
    if not pd.isna(income_by_row[index])
]

# Release time -> income lookup. Films that share a release time collapse to
# a single entry here, so the plotted series is built from the full list of
# pairs below instead of from this dict.
comparison_US_dic = {
    release_by_row[index]: int(income_by_row[index])
    for index in comparison_US_index
}

# One (release time, income) pair per film, ordered by the release-time string.
comparison_US_sorted = sorted(
    ((release_by_row[index], int(income_by_row[index])) for index in comparison_US_index),
    key=lambda pair: pair[0],
)
# Year = first four characters before the first '-', the same rule the
# release-year scatter plot uses.
comparison_US_x = [int(release.split('-')[0][:4]) for release, _ in comparison_US_sorted]
comparison_US_y = [income for _, income in comparison_US_sorted]
comparison_US_x

In [None]:
# Box-office series for films made in mainland China ('中国大陆'): release
# year against cumulative income, restricted to films with an income figure.
income_by_row = df_movies['Cumulative_income(ten thousand unit)']
release_by_row = df_movies['Relaese Time']

# Row labels of mainland-China films whose income is not missing.
comparison_CN_mainland_index = [
    index for index in index_area_dic['中国大陆']
    if not pd.isna(income_by_row[index])
]

# Release time -> income lookup. Films that share a release time collapse to
# a single entry here, so the plotted series is built from the full list of
# pairs below instead of from this dict.
comparison_CN_mainland_dic = {
    release_by_row[index]: int(income_by_row[index])
    for index in comparison_CN_mainland_index
}

# One (release time, income) pair per film, ordered by the release-time string.
comparison_CN_mainland_sorted = sorted(
    ((release_by_row[index], int(income_by_row[index])) for index in comparison_CN_mainland_index),
    key=lambda pair: pair[0],
)
# Year = first four characters before the first '-', the same rule the
# release-year scatter plot uses.
comparison_CN_mainland_x = [
    int(release.split('-')[0][:4]) for release, _ in comparison_CN_mainland_sorted
]
comparison_CN_mainland_y = [income for _, income in comparison_CN_mainland_sorted]
comparison_CN_mainland_x

In [None]:
# Line chart comparing box office by release year for mainland-China and
# US films.
plt.figure(figsize=(15,5))
# Each series' markers are given the same colour as its line so the two
# series can be told apart at a glance.
plt.plot(comparison_CN_mainland_x, comparison_CN_mainland_y, 'r',label='China Mainland')
plt.scatter(comparison_CN_mainland_x, comparison_CN_mainland_y, color='r')
plt.plot(comparison_US_x, comparison_US_y, 'b',label='America')
plt.scatter(comparison_US_x, comparison_US_y, color='b')
# x values are release years; y values come from the
# 'Cumulative_income(ten thousand unit)' column.
plt.xlabel('Release year')
plt.ylabel('Cumulative income (ten thousand unit)')
plt.legend(loc=2,borderpad=2,fontsize = 12)
plt.title('Box office variation',fontsize=15)
# Ten evenly spaced ticks from 1993 to 2020, i.e. one every three years.
x_ticks = np.linspace(1993,2020,10)
plt.xticks(x_ticks,size = 10)
sns.despine()