#### Plotting (画图)

In [39]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

In [36]:
# Load the movie table from the CSV next to the notebook, then display summary
# statistics for the vote_count, actor_count and score columns.
data_csv = pd.read_csv("./data_movies_noRepeat.csv")
data_csv.loc[:, ["vote_count", "actor_count", "score"]].describe()

Unnamed: 0,vote_count,actor_count,score
count,23422.0,23422.0,23422.0
mean,28977.72,13.432969,7.278507
std,104230.8,12.3769,0.972857
min,301.0,0.0,4.4
25%,1144.0,5.0,6.6
50%,3026.0,11.0,7.3
75%,12092.5,18.0,8.0
max,2604521.0,387.0,9.7


#### Distribution of the number of labels per movie

In [4]:
def _count_labels(raw_types):
    """Count the comma-separated labels in one entry of the "types" column.

    The entry is presumably a stringified list (brackets are stripped before
    splitting on commas) -- TODO confirm against the CSV.

    Blank tokens are skipped so that an empty list ("[]") yields 0 labels;
    a bare ``split(",")`` would report 1 because ``"".split(",") == [""]``.
    """
    tokens = raw_types.replace("[", '').replace("]", '').split(",")
    return sum(1 for token in tokens if token.strip())


# Number of labels attached to each movie.
data_analysis_label_num = data_csv["types"].apply(_count_labels)

# Tally once and reuse the result for both the slice sizes and the slice labels.
label_num_counts = data_analysis_label_num.value_counts()

# Donut chart: one slice per distinct label count, sized by how many movies have it.
fig = {
  "data": [
    {
      "values": label_num_counts.values,
      "labels": label_num_counts.index,
      "domain": {"x": [0, .5]},
      "name": "Number Of",
      "hoverinfo":"label+percent",
      "hole": .3,
      "type": "pie"
    },],
  "layout": {
        "title":"Number distribution of movie labels",
                'height':600,
                "width":800
    }
}
iplot(fig)

#### Distribution of the number of actors per film

In [34]:
# Scatter trace: one marker per film, x = row index, y = number of actors,
# with the film title attached as hover text.
trace1 = go.Scatter(
    x=data_csv.index,
    y=data_csv["actor_count"],
    mode="markers",
    name="Distribution of film actors",
    marker=dict(color='rgba(128, 128, 255, 0.8)'),
    text=data_csv['title'],
)

# Define the layout object: axis titles and canvas size.
layout = dict(
    xaxis=dict(title='film index', ticklen=5, zeroline=False),
    yaxis=dict(title='Number of film actors', ticklen=5, zeroline=False),
    height=600,
    width=1300,
)

# Combine the trace and layout parts into a figure and plot it.
# `data` is given as a list of traces, matching the other cells in this notebook,
# instead of relying on a bare trace being accepted.
fig = dict(data=[trace1], layout=layout)
iplot(fig)

#### Distribution of scores

In [35]:
# Count films per distinct score once; the result feeds the bar positions,
# the bar heights and the explicit tick positions below.
score_counts = data_csv['score'].value_counts()

trace1 = go.Bar(
    x=score_counts.index,
    y=score_counts.values,
    # Previously "citations", a leftover label unrelated to this chart.
    name="Number of films",
    marker=dict(color='rgba(255, 0, 0, 0.5)',
                line=dict(color='rgb(0,0,0)', width=1.5)))

data = [trace1]

# One tick per distinct score value (tickmode 'array' with explicit tickvals).
layout = go.Layout(barmode="group",
                   yaxis=dict(title='Number of films per score', ticklen=5, zeroline=False),
                   xaxis=dict(title='Score', zeroline=False, tickmode='array',
                              tickvals=score_counts.index),
                   title="Distribution of score", height=600, width=1300)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

#### Score distribution for each type

In [None]:
def _parse_types(raw_types):
    """Split one entry of the "types" column into a list of type names.

    Brackets, single quotes and ALL spaces are removed before splitting on
    commas (so a name containing a space would be collapsed -- kept as in the
    original parsing). Empty tokens are dropped, so an empty list ("[]") gives
    [] rather than [''], which would otherwise show up as a blank type.
    """
    cleaned = raw_types.replace("[", '').replace("]", '').replace("'", '').replace(" ", '')
    return [token for token in cleaned.split(",") if token]


# Per-film list of type names.
types = data_csv["types"].apply(_parse_types)

# Sorted array of every distinct type name seen in the data.
types_list = np.unique(np.array([label for labels in types for label in labels]))

# For each type: how many films carry it, and the scores of those films.
types_num = []
types_score = []
for type_name in types_list:
    # Boolean row mask: True where the film's type list contains this type.
    has_type = np.array([type_name in labels for labels in types], dtype=bool)
    types_num.append(int(has_type.sum()))
    types_score.append(data_csv.loc[has_type, 'score'].values)

In [78]:
# Bar chart of how many films carry each type; `types_list` and `types_num`
# are parallel sequences built in the preceding cell.
trace1 = go.Bar(
    x=types_list,
    y=types_num,
    # Previously "citations", a leftover label unrelated to this chart.
    name="Number of films",
    marker=dict(color='rgba(0, 0, 255, 0.5)',
                line=dict(color='rgb(0,0,0)', width=1.5)))

data = [trace1]

# One tick per type name (tickmode 'array' with explicit tickvals).
layout = go.Layout(barmode="group",
                   yaxis=dict(title='Number of films per type', ticklen=5, zeroline=False),
                   xaxis=dict(title='Type', zeroline=False, tickmode='array', tickvals=types_list),
                   title="Distribution of Type", height=600, width=1000)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [82]:
# One box per film type: `types_score` holds the score array for each type and
# `types_list` the matching type name at the same position.
data = [
    go.Box(y=type_scores, name=type_name)
    for type_name, type_scores in zip(types_list, types_score)
]

# Axis titles and canvas size for the box-plot figure.
layout = {
    "xaxis": {"title": 'Type', "ticklen": 5, "zeroline": False},
    "yaxis": {"title": 'Score distribution for each type', "ticklen": 5, "zeroline": False},
    "height": 600,
    "width": 1300,
}

fig = go.Figure(data=data, layout=layout)
iplot(fig)