# Movie Success pipeline

In [None]:
# Name of the Kubeflow Pipelines experiment the run is looked up / created under.
EXPERIMENT_NAME = 'movie-success'
# GCS bucket used to build the component URLs and the pipeline output URIs.
# Placeholder value: replace with a real bucket name before running.
BUCKET = "your-bucket-name"

## Imports

In [None]:
import kfp
from kfp import compiler
import kfp.components as comp
import kfp.dsl as dsl
from kfp import gcp

## Load components

In [None]:
# Every component definition is published under the same bucket layout, so the
# URL is built from a single template instead of three hand-copied strings.
COMPONENT_URL_TEMPLATE = 'https://storage.googleapis.com/{}/components/{}/component.yaml'


def _load_component(component_name):
    """Load the component definition called `component_name` from BUCKET.

    Args:
        component_name: Folder name of the component under `components/`.

    Returns:
        The factory returned by `comp.load_component_from_url`.
    """
    return comp.load_component_from_url(
        COMPONENT_URL_TEMPLATE.format(BUCKET, component_name))


preprocess_operation = _load_component('preprocess')
help(preprocess_operation)

train_operation = _load_component('train')
help(train_operation)

ai_platform_deploy_operation = _load_component('deploy')
help(ai_platform_deploy_operation)

## Build the Pipeline

In [None]:
@dsl.pipeline(
  name='Movie Success Pipeline',
  description='Performs preprocessing, training and deployment.'
)
def pipeline():
    """Chain the preprocess, train and deploy components into one pipeline.

    Each task is given the 'user-gcp-sa' secret via `use_gcp_secret`. The
    train task consumes the x/y outputs of the preprocess task, and the deploy
    task consumes the output of the train task.
    """
    # Argo only substitutes the run id for the literal text '{{workflow.uid}}'.
    # The prefix is built by concatenation because str.format() would collapse
    # the doubled braces into single ones and the placeholder would be lost.
    run_root = 'gs://' + BUCKET + '/{{workflow.uid}}'

    preprocess_task = preprocess_operation(
        # NOTE(review): this still points at the NER sample dataset - confirm
        # the intended input for this pipeline.
        input_1_uri='gs://kubeflow-examples-data/named_entity_recognition_dataset/ner.csv',  # Upload data to GCS and drop URL
        output_y_uri_template=run_root + '/preprocess/y/data',
        output_x_uri_template=run_root + '/preprocess/x/data',
        output_preprocessing_state_uri_template=run_root + '/model'
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

    train_task = train_operation(
        input_x_uri=preprocess_task.outputs['output-x-uri'],
        input_y_uri=preprocess_task.outputs['output-y-uri'],
        input_job_dir_uri=run_root + '/job',
        # Same '/model' location as the preprocessing state above.
        output_model_uri_template=run_root + '/model'
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

    deploy_task = ai_platform_deploy_operation(
        model_path=train_task.output,
        model_name="movie_success_kubeflow",
        model_region="us-central1",
        model_version="version1",
        model_runtime_version="2.3",
        model_prediction_class="model_prediction.CustomModelPrediction",
        model_python_version="3.7",
        model_package_uris="gs://{}/routine/custom_prediction_routine-0.2.tar.gz".format(BUCKET)
    ).apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

## Compile the Pipeline

In [None]:
# Compile the pipeline function into a package file named after the function.
pipeline_func = pipeline
pipeline_filename = '{}.pipeline.zip'.format(pipeline_func.__name__)

import kfp.compiler as compiler
kfp_compiler = compiler.Compiler()
kfp_compiler.compile(pipeline_func, pipeline_filename)

## Create a Kubeflow Experiment

In [None]:
client = kfp.Client()

# Reuse the experiment when it already exists, otherwise create it.
try:
    experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)
except Exception:
    # `except Exception` instead of a bare `except` so that KeyboardInterrupt
    # and SystemExit still propagate.
    # NOTE(review): the exact "not found" exception type depends on the kfp
    # version - narrow this once it is confirmed.
    experiment = client.create_experiment(EXPERIMENT_NAME)

print(experiment)

## Run the Pipeline

In [None]:
# `pipeline()` declares no parameters, so the run is submitted with no arguments.
arguments = {}

run_name = '{} run'.format(pipeline_func.__name__)
run_result = client.run_pipeline(
    experiment.id,
    run_name,
    pipeline_filename,
    arguments,
)

# Echo what was submitted.
for submitted_value in (experiment.id, run_name, pipeline_filename, arguments):
    print(submitted_value)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split

# Print the full path of every file found below /kaggle/input.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

In [6]:
# Load the credits and movies tables (credits first, movies second).
df_credits, df_movies = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/tmdb_5000_credits.csv/tmdb_5000_credits.csv',
        '../data/tmdb_5000_movies.csv/tmdb_5000_movies.csv',
    )
)

In [7]:
df_credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [8]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [9]:
# Give the credits key the same name as the movies key so the frames can be merged on it.
df_credits = df_credits.rename(columns={'movie_id': 'id'})

In [10]:
# Remove the 'original_title' column from the movies table.
df_movies = df_movies.drop(columns=['original_title'])

In [12]:
# Join credits and movies on the (id, title) pair.
merge_keys = ['id', 'title']
df_merged = df_credits.merge(df_movies, on=merge_keys)
df_merged.head(3)

Unnamed: 0,id,title,cast,crew,budget,genres,homepage,keywords,original_language,overview,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,"In the 22nd century, a paraplegic Marine is di...",...,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,"Captain Barbossa, long believed to be dead, ha...",...,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",6.9,4500
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,A cryptic message from Bond’s past sends him o...,...,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,6.3,4466


In [14]:
df_merged.columns

Index(['id', 'title', 'cast', 'crew', 'budget', 'genres', 'homepage',
       'keywords', 'original_language', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline',
       'vote_average', 'vote_count'],
      dtype='object')

In [16]:
df_merged.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,4803.0,57165.48,88694.61,5.0,9014.5,14629.0,58610.5,459488.0
budget,4803.0,29045040.0,40722390.0,0.0,790000.0,15000000.0,40000000.0,380000000.0
popularity,4803.0,21.4923,31.81665,0.0,4.66807,12.92159,28.3135,875.5813
revenue,4803.0,82260640.0,162857100.0,0.0,0.0,19170000.0,92917190.0,2787965000.0
runtime,4801.0,106.8759,22.61193,0.0,94.0,103.0,118.0,338.0
vote_average,4803.0,6.092172,1.194612,0.0,5.6,6.2,6.8,10.0
vote_count,4803.0,690.218,1234.586,0.0,54.0,235.0,737.0,13752.0


In [17]:
import json
from ast import literal_eval  # import kept; this cell no longer uses it

json_cols = ['cast', 'crew', 'genres', 'keywords','production_companies', 'production_countries','spoken_languages']

for col in json_cols:
    # These columns hold JSON text, so they are parsed with the JSON parser
    # rather than as Python literals (which would reject null/true/false).
    # The isinstance guard leaves already-parsed values alone, so the cell can
    # be re-run without raising.
    df_merged[col] = df_merged[col].apply(
        lambda cell: json.loads(cell) if isinstance(cell, str) else cell)

In [18]:
df_merged['genres']

0       [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
1       [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2       [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
3       [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
4       [{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...
                              ...                        
4798    [{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...
4799    [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...
4800    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4801                                                   []
4802                  [{'id': 99, 'name': 'Documentary'}]
Name: genres, Length: 4803, dtype: object

In [19]:
def get_genre(x):
    """Return the 'name' of every record in a list of genre dicts.

    Args:
        x: A list of dicts that each carry a 'name' key. Any non-list value
            (e.g. None or NaN) is treated as "no genres".

    Returns:
        A list of names in input order; an empty list for a non-list input.
    """
    # Without this guard a non-list input hit an unbound local variable.
    if not isinstance(x, list):
        return []
    return [entry['name'] for entry in x]

def get_jobs(x):
    """Return the 'job' of every record in a list of crew dicts.

    Args:
        x: A list of dicts that each carry a 'job' key. Any non-list value
            (e.g. None or NaN) is treated as "no crew".

    Returns:
        A list of job titles in input order; an empty list for a non-list input.
    """
    # Without this guard a non-list input hit an unbound local variable.
    if not isinstance(x, list):
        return []
    return [member['job'] for member in x]

def get_characternames(x):
    """Return the fraction of cast entries whose character is a voice role.

    A cast entry counts as a voice role when its 'character' text contains
    '(voice)'. The result is a fraction in [0, 1], not a percentage.

    Args:
        x: A list of cast dicts. A missing or None 'character' counts as a
            non-voice role. Any non-list value is treated as an empty cast.

    Returns:
        voice roles / cast size, or 0 when the cast is empty or not a list.
    """
    # An empty or non-list cast has no voice roles; this also avoids dividing by zero.
    if not isinstance(x, list) or not x:
        return 0
    voice_roles = sum(
        '(voice)' in (member.get('character') or '') for member in x)
    return voice_roles / len(x)
        
def get_labels(x):
    """Label a genre collection: 1 if it contains 'Animation', else 0.

    An empty collection yields NaN so that such rows can be spotted later.
    """
    if not len(x):
        return np.nan
    return 1 if 'Animation' in x else 0
    
def get_costume_labels(x):
    """Return 1 when 'Costume Design' is contained in `x`, otherwise 0."""
    has_costume_design = 'Costume Design' in x
    return int(has_costume_design)
    
def get_genre_cd(x):
    """Flag crews that have no 'Lighting' department.

    Note the direction of the flag: it is 0 when some crew record has
    department 'Lighting' and 1 otherwise.

    Args:
        x: A list of crew dicts that each carry a 'department' key. Any
            non-list value is treated as a crew with no departments.

    Returns:
        0 if 'Lighting' is among the departments, else 1.
    """
    # Without the isinstance fallback a non-list input hit an unbound local.
    departments = [member['department'] for member in x] if isinstance(x, list) else []
    return 0 if 'Lighting' in departments else 1

In [39]:
# (target column, source column, transform) applied in order; 'labels' must
# come after 'genres' because it reads the already-flattened genre names.
feature_steps = (
    ('genres', 'genres', get_genre),
    ('crew_jobs', 'crew', get_jobs),
    ('percent_of_voice_artists', 'cast', get_characternames),
    ('labels', 'genres', get_labels),
)
for target_col, source_col, transform in feature_steps:
    df_merged[target_col] = df_merged[source_col].apply(transform)

In [40]:
df_merged.head(2)

Unnamed: 0,id,title,cast,crew,budget,genres,homepage,keywords,original_language,overview,...,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count,crew_jobs,percent_of_voice_artists,labels
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,"In the 22nd century, a paraplegic Marine is di...",...,2787965087,162.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Enter the World of Pandora.,7.2,11800,"[Editor, Production Design, Sound Designer, Su...",0.0,0.0
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,"Captain Barbossa, long believed to be dead, ha...",...,961000000,169.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"At the end of the world, the adventure begins.",6.9,4500,"[Director of Photography, Director, Producer, ...",0.029412,0.0


In [46]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): this hides the warning for any chained assignment in later
# cells as well - prefer fixing those assignments over silencing the warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [48]:
# Round the voice-artist fraction to 3 decimals in one vectorised call.
# Avoids the per-row chained assignment (`df[col][i] = ...`), which pandas
# cannot guarantee writes back to the frame.
df_merged['percent_of_voice_artists'] = df_merged['percent_of_voice_artists'].round(3)

In [49]:
df_merged.head(3)

Unnamed: 0,id,title,cast,crew,budget,genres,homepage,keywords,original_language,overview,...,revenue,runtime,spoken_languages,status,tagline,vote_average,vote_count,crew_jobs,percent_of_voice_artists,labels
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,"In the 22nd century, a paraplegic Marine is di...",...,2787965087,162.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Enter the World of Pandora.,7.2,11800,"[Editor, Production Design, Sound Designer, Su...",0.0,0.0
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,"Captain Barbossa, long believed to be dead, ha...",...,961000000,169.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"At the end of the world, the adventure begins.",6.9,4500,"[Director of Photography, Director, Producer, ...",0.029,0.0
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...",245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,"[{'id': 470, 'name': 'spy'}, {'id': 818, 'name...",en,A cryptic message from Bond’s past sends him o...,...,880674609,148.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,A Plan No One Escapes,6.3,4466,"[Original Music Composer, Director, Set Decora...",0.0,0.0


In [50]:
df_merged.labels.isna().sum()

28

In [56]:
# Keep only rows whose label is exactly 0 or 1 (this removes the NaN labels),
# then renumber the rows from 0.
unlabeled_mask = ~df_merged.labels.isin([0, 1])
idxsc = df_merged.index[unlabeled_mask]
df_merged = df_merged.drop(idxsc).reset_index(drop=True)

In [57]:
df_merged.labels.isna().sum()

0

In [58]:
df_merged.isna().sum()

id                             0
title                          0
cast                           0
crew                           0
budget                         0
genres                         0
homepage                    3068
keywords                       0
original_language              0
overview                       3
popularity                     0
production_companies           0
production_countries           0
release_date                   0
revenue                        0
runtime                        2
spoken_languages               0
status                         0
tagline                      819
vote_average                   0
vote_count                     0
crew_jobs                      0
percent_of_voice_artists       0
labels                         0
dtype: int64

In [59]:
# Class balance over the full merged frame.
merged_labels = df_merged['labels']
AnimatedMoviesCount = (merged_labels == 1).sum()
NotAnimatedMoviesCount = (merged_labels == 0).sum()

print("Number of Animated Movies are: ", AnimatedMoviesCount)
print("Number of Not Animated Movies are: ", NotAnimatedMoviesCount)

Number of Animated Movies are:  234
Number of Not Animated Movies are:  4541


In [61]:
# 1 when the crew job list contains 'Costume Design', else 0.
costume_flags = df_merged['crew_jobs'].apply(get_costume_labels)
df_merged['costume'] = costume_flags

df_merged['costume'].value_counts()

1    2472
0    2303
Name: costume, dtype: int64

In [62]:
# Careful with the direction: get_genre_cd returns 0 when the crew HAS a
# 'Lighting' department and 1 when it does not.
lighting_flags = df_merged['crew'].apply(get_genre_cd)
df_merged['lighting_dept'] = lighting_flags


df_merged['lighting_dept'].value_counts()

1    3881
0     894
Name: lighting_dept, dtype: int64

In [63]:
# Mean budget over the animated rows, selected with a boolean mask.
# The mask does not rely on index labels matching row positions, which the
# previous positional loop did.
animated_budgets = df_merged.loc[df_merged.labels == 1, 'budget']
avg_budget = animated_budgets.mean()
print("Average Budget of Animated Movie: ",str(avg_budget))

Average Budget of Animated Movie:  66465901.94871795


In [65]:
range(0,df_merged.shape[0])

range(0, 4775)

In [68]:
len(df_merged.crew_jobs)

4775

In [64]:
# Row positions of movies that list more than 7 crew jobs. enumerate() yields
# true positions, so they are valid for .iloc whatever the index labels are.
idx = [
    position
    for position, jobs in enumerate(df_merged['crew_jobs'])
    if len(jobs) > 7
]
print("Number of Movies with more than 7 crew members: ",str(len(idx)))

# .copy() makes `df` an independent frame, so later column assignments on it
# do not operate on a slice of df_merged.
df = df_merged.iloc[idx,:].copy()

Number of Movies with more than 7 crew members:  3653


In [70]:
df.head(2)

Unnamed: 0,id,title,cast,crew,budget,genres,homepage,keywords,original_language,overview,...,spoken_languages,status,tagline,vote_average,vote_count,crew_jobs,percent_of_voice_artists,labels,costume,lighting_dept
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,"In the 22nd century, a paraplegic Marine is di...",...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Enter the World of Pandora.,7.2,11800,"[Editor, Production Design, Sound Designer, Su...",0.0,0.0,1,0
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,"Captain Barbossa, long believed to be dead, ha...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"At the end of the world, the adventure begins.",6.9,4500,"[Director of Photography, Director, Producer, ...",0.029,0.0,1,1


In [71]:
# Class balance after the crew-size filter.
filtered_labels = df['labels']
AnimatedMoviesCount2 = (filtered_labels == 1).sum()
NotAnimatedMoviesCount2 = (filtered_labels == 0).sum()

print("Number of Animated Movies are: ", AnimatedMoviesCount2)
print("Number of Not Animated Movies are: ", NotAnimatedMoviesCount2)

Number of Animated Movies are:  193
Number of Not Animated Movies are:  3460


In [72]:
def join_strings(x):
    """Concatenate the strings in `x`, separated by a comma and a space."""
    separator = ", "
    return separator.join(x)

def str_lower(x):
    """Return `x` converted to lower case via its own `lower()` method."""
    lowered = x.lower()
    return lowered

# Turn each job list into one lower-cased, comma-separated string.
df['crew_jobs'] = (
    df['crew_jobs']
    .apply(join_strings)
    .apply(str_lower)
)

In [73]:
df.head(2)

Unnamed: 0,id,title,cast,crew,budget,genres,homepage,keywords,original_language,overview,...,spoken_languages,status,tagline,vote_average,vote_count,crew_jobs,percent_of_voice_artists,labels,costume,lighting_dept
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...",237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,"[{'id': 1463, 'name': 'culture clash'}, {'id':...",en,"In the 22nd century, a paraplegic Marine is di...",...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Enter the World of Pandora.,7.2,11800,"editor, production design, sound designer, sup...",0.0,0.0,1,0
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...",300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,"[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na...",en,"Captain Barbossa, long believed to be dead, ha...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"At the end of the world, the adventure begins.",6.9,4500,"director of photography, director, producer, s...",0.029,0.0,1,1


In [74]:
df['labels'].value_counts()

0.0    3460
1.0     193
Name: labels, dtype: int64

### Modeling

In [75]:
X1 = df['crew_jobs']
Y1 = df['labels']

In [76]:
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed so the split is reproducible.
# NOTE(review): the label counts shown above are heavily imbalanced and the
# split is not stratified - confirm whether `stratify=Y1` is wanted here.
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size=0.20, random_state=53)

In [78]:
print(X_train1.shape, y_train1.shape)
print(X_test1.shape, y_test1.shape)

(2922,) (2922,)
(731,) (731,)


In [79]:
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score

def score_output(y_test, y_pred):
    """Print the confusion matrix, classification report and accuracy.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same samples.
    """
    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    accuracy = accuracy_score(y_test, y_pred)
    message = 'The Accuracy on The Test Set is: %s' % accuracy
    print(message)

In [80]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [81]:
from spacy.lang.en import STOP_WORDS
# Lemmatise spaCy's stop words by running them through the pipeline as one text.
stop_words_str = " ".join(STOP_WORDS)
stop_words_lemma = {token.lemma_ for token in nlp(stop_words_str)}

# Extra words added to the stop-word set.
additional_words = ['editor', 'director', 'producer', 'writer', 'assistant', 'sound']

stop_words_lemma = stop_words_lemma.union(additional_words)

In [83]:
def lemmatizer(text):
    """Return the lemma of every token that `nlp` produces for `text`."""
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return lemmas

### Without Stop Words

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [86]:
# TF-IDF over single-word tokens feeding an SVC with default settings.
bow = TfidfVectorizer(ngram_range=(1, 1))
pipeline_steps = [('bag_of_words', bow), ('classifier', SVC())]

pipe = Pipeline(pipeline_steps)
pipe.fit(X_train1, y_train1)

print("Without Stop Words")
train_accuracy = pipe.score(X_train1, y_train1)
print('Training accuracy: {}'.format(train_accuracy))

# Evaluate on the held-out split.
y_pred = pipe.predict(X_test1)
score_output(y_test1, y_pred)

Without Stop Words
Training accuracy: 0.9873374401095141
[[699   1]
 [  8  23]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99       700
         1.0       0.96      0.74      0.84        31

    accuracy                           0.99       731
   macro avg       0.97      0.87      0.91       731
weighted avg       0.99      0.99      0.99       731

The Accuracy on The Test Set is: 0.987688098495212


In [87]:
from sklearn.metrics import classification_report, recall_score, accuracy_score,precision_score, f1_score, confusion_matrix

In [88]:
# Model Evaluation
# Score the test-set predictions with each metric, in the order the names are bound.
recall, accuracy, precision, f1score = (
    scorer(y_test1, y_pred)
    for scorer in (recall_score, accuracy_score, precision_score, f1_score)
)

In [97]:
# Printed in this order: recall, accuracy, precision, F1.
for metric_value in (recall, accuracy, precision, f1score):
    print(metric_value)

0.7419354838709677
0.987688098495212
0.9583333333333334
0.8363636363636364
