In [3]:
import pandas as pd

In [1]:
%%capture
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

In [4]:
def _read_imdb_tsv(path, column_types):
    """Read selected columns of an IMDb TSV dump.

    Only the literal ``\\N`` marker is treated as missing
    (``keep_default_na=False``), so strings such as "NA" or "None" are
    kept as-is.  ``column_types`` maps each column to load to its dtype.
    """
    return pd.read_csv(
        path,
        sep="\t",
        na_values="\\N",
        keep_default_na=False,
        usecols=list(column_types),
        dtype=column_types,
    )


# Credits: which person (nconst) worked on which title (tconst), and as what.
cast = _read_imdb_tsv(
    "title.principals.tsv",
    {"tconst": str, "nconst": str, "category": str, "job": str},
)
# Title metadata; startYear uses the nullable integer dtype because it can be missing.
titles = _read_imdb_tsv(
    "title.basics.tsv",
    {"tconst": str, "primaryTitle": str, "startYear": "Int64", "titleType": str},
)
# Person identifiers and display names.
names = _read_imdb_tsv(
    "name.basics.tsv",
    {"nconst": str, "primaryName": str},
)

In [154]:
# Fifteen actresses and fifteen actors to cover (presumably taken from the
# AFI screen-legends list; confirm against the source used).
# The spellings must match IMDb's primaryName exactly, since the names are
# matched with an exact `isin` lookup later on.
afi_female_legends = [
    "Katharine Hepburn", "Bette Davis", "Audrey Hepburn",
    "Ingrid Bergman", "Greta Garbo", "Marilyn Monroe",
    "Elizabeth Taylor", "Judy Garland", "Marlene Dietrich",
    "Joan Crawford", "Barbara Stanwyck", "Claudette Colbert",
    "Grace Kelly", "Ginger Rogers", "Mae West",
]
afi_male_legends = [
    "Humphrey Bogart", "Cary Grant", "James Stewart",
    "Marlon Brando", "Fred Astaire", "Henry Fonda",
    "Clark Gable", "James Cagney", "Spencer Tracy",
    "Charles Chaplin", "Gary Cooper", "Gregory Peck",
    "John Wayne", "Laurence Olivier", "Gene Kelly",
]

# Combined list: actresses first, then actors.
legend_actors = [*afi_female_legends, *afi_male_legends]

In [155]:
# Keep only the rows of `titles` whose titleType is exactly "movie".
movie_titles = titles.loc[titles["titleType"] == "movie"]

In [156]:
# Build one row per (legend, movie) acting credit.
# NOTE(review): legends are matched on primaryName, so any other IMDb person
# sharing one of these names would be pulled in as well; confirm whether
# matching on nconst would be safer.
legend_names = names[names["primaryName"].isin(legend_actors)]
acting_credits = cast[cast["category"].isin(["actor", "actress"])]
dated_movies = titles[(titles["titleType"] == "movie") & titles["startYear"].notna()]

# Inner joins: person -> credits on nconst, then credits -> titles on tconst.
set_movies = (
    legend_names
    .merge(acting_credits, on="nconst")
    .merge(dated_movies, on="tconst")
)

In [158]:
# A movie is identified here by its (title, year) pair.
career_movies = set_movies[["primaryTitle", "startYear"]].drop_duplicates()
print("All these actors have played in %i movies over their careers." % len(career_movies))

All these actors have played in 1448 movies over their careers.


In [180]:
# Shrink the candidate set: keep every movie shared by at least two legends,
# plus one movie per legend so that nobody becomes impossible to cover.
legends_per_movie = set_movies.groupby(["primaryTitle", "startYear"]).primaryName.transform("count")
combinations = set_movies[legends_per_movie > 1]

# First credit of every actor, kept as a complete row.
# groupby("primaryName").first() is not used here because it moves primaryName
# into the index: after the concat those rows would carry a NaN primaryName,
# so actors without a shared movie would vanish from the later steps.
# first() also takes the first non-null value per column, which can mix
# values from different credits.
one_each = set_movies.drop_duplicates("primaryName")

set_movies = pd.concat([combinations, one_each], ignore_index=True)
# An actor's first credit may already be among the shared movies.
set_movies = set_movies.drop_duplicates(["primaryName", "category", "primaryTitle", "startYear"])
set_movies.head()

Unnamed: 0,nconst,primaryName,tconst,category,job,titleType,primaryTitle,startYear
0,nm0000001,Fred Astaire,tt0025164,actor,,movie,The Gay Divorcee,1934
1,nm0001677,Ginger Rogers,tt0025164,actress,,movie,The Gay Divorcee,1934
2,nm0000001,Fred Astaire,tt0026942,actor,,movie,Roberta,1935
3,nm0001677,Ginger Rogers,tt0026942,actress,,movie,Roberta,1935
4,nm0000001,Fred Astaire,tt0027125,actor,,movie,Top Hat,1935


In [187]:
# Number of distinct (title, year) pairs left after the reduction step.
kept_movies = set_movies[["primaryTitle", "startYear"]].drop_duplicates()
print("We are considering %i movies for this problem by keeping only one movie for each actor where they appeared alone."
      % len(kept_movies))

We are considering 143 movies for this problem by keeping only one movie for each actor where they appeared alone.


In [181]:
def greedy_cover(movies):
    """Greedy approximation of a minimum set of movies covering every actor.

    Repeatedly picks the movie featuring the largest number of distinct
    actors that are not covered yet, then removes those actors from
    consideration, until no actor is left.

    Parameters
    ----------
    movies : pandas.DataFrame
        One row per (actor, movie) credit, with the columns
        ``primaryName``, ``primaryTitle`` and ``startYear``.  The input is
        not modified.

    Returns
    -------
    list of (title, year) tuples
        The selected movies, in the order they were picked.  Empty when
        ``movies`` has no rows.
    """
    if movies.empty:
        return []

    movie_key = ["primaryTitle", "startYear"]
    # Incomplete rows cannot take part in a cover. A missing name would let a
    # movie that covers nobody be selected, and a missing title/year is dropped
    # by groupby, which would leave idxmax with nothing to choose from.
    remaining = movies.dropna(subset=movie_key + ["primaryName"])

    min_set = []
    while not remaining.empty:
        # Distinct actors per movie: a repeated credit must not count twice.
        coverage = remaining.groupby(movie_key).primaryName.nunique()
        title, year = coverage.idxmax()
        min_set.append((title, year))

        in_selected = (remaining.primaryTitle == title) & (remaining.startYear == year)
        covered_actors = remaining.loc[in_selected, "primaryName"]
        # Drop every remaining credit of the actors that are now covered.
        remaining = remaining[~remaining.primaryName.isin(covered_actors)]
    return min_set

In [182]:
# Run the greedy heuristic on the reduced candidate set and report its size.
min_set = greedy_cover(set_movies)
greedy_size = len(min_set)
print("The greedy algorithm returned a min set of length %i." % greedy_size)
print(min_set)

The greedy algorithm returned a min set of length 17.
[('How the West Was Won', 1962), ('Boom Town', 1940), ('Monkey Business', 1952), ('Angels with Dirty Faces', 1938), ('Ball of Fire', 1941), ('Easter Parade', 1948), ('Grand Hotel', 1932), ('Reflections in a Golden Eye', 1967), ('20,000 Years in Sing Sing', 1932), ("Adam's Rib", 1949), ('Blonde Venus', 1932), ('Casablanca', 1942), ('Charade', 1963), ('For Me and My Gal', 1942), ('High Noon', 1952), ("I'm No Angel", 1933), ('The Boys from Brazil', 1978)]


In [183]:
# Movie x actor table: one row per (title, year), one column per actor,
# each cell counting that actor's credits in that movie.
# dtype=int keeps the indicators numeric (newer pandas returns booleans).
actor_indicators = pd.get_dummies(set_movies.primaryName, dtype=int)

# Only the grouping keys and the indicator columns are aggregated. Summing the
# whole frame would also "sum" the string columns (nconst, tconst, category,
# ...), which pandas >= 2.0 concatenates instead of dropping, so the result
# would no longer be a numeric matrix.
movies_matrix_df = (
    pd.concat([set_movies[["primaryTitle", "startYear"]], actor_indicators], axis=1)
    .groupby(["primaryTitle", "startYear"])
    .sum()
)
movies_matrix_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Audrey Hepburn,Barbara Stanwyck,Bette Davis,Cary Grant,Clark Gable,Claudette Colbert,Elizabeth Taylor,Fred Astaire,Gary Cooper,Gene Kelly,Ginger Rogers,Grace Kelly,Gregory Peck,Greta Garbo,Henry Fonda,Humphrey Bogart,Ingrid Bergman,James Cagney,James Stewart,Joan Crawford,John Wayne,Judy Garland,Katharine Hepburn,Laurence Olivier,Mae West,Marilyn Monroe,Marlene Dietrich,Marlon Brando,Spencer Tracy
primaryTitle,startYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
"20,000 Years in Sing Sing",1932,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Adam's Rib,1949,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
Angels with Dirty Faces,1938,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
Ash Wednesday,1973,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bad Sister,1931,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [184]:
# Transpose so that rows are actors and columns are movies, then convert
# the table to a plain NumPy array.
movies_matrix = movies_matrix_df.T.to_numpy()

In [124]:
from scipy.optimize import linprog
import numpy as np

In [188]:
# The matrix has one row per actor and one column per movie.
matrix_shape = movies_matrix.shape
n_actors, n_movies = matrix_shape

In [189]:
# Set cover as a 0/1 program: minimise the number of selected movies subject
# to every actor appearing in at least one selected movie.
c = np.ones(n_movies)            # cost 1 per selected movie
# linprog only takes "<=" constraints, so  M @ x >= 1  becomes  -M @ x <= -1.
A_ub = -1 * movies_matrix
b_ub = -1 * np.ones(n_actors)
# Per-movie lower/upper bounds (previously built but never passed to the solver).
l = np.zeros(n_movies)
u = np.ones(n_movies)
# integrality=1 forces each variable to be an integer, i.e. 0 or 1 within the
# bounds. The continuous relaxation can return fractional values, and
# thresholding those at 0.5 does not guarantee that every actor is covered.
# NOTE: the `integrality` argument requires SciPy >= 1.9 and the HiGHS solver.
res = linprog(
    c=c,
    A_ub=A_ub,
    b_ub=b_ub,
    bounds=list(zip(l, u)),
    integrality=np.ones(n_movies),
    method="highs",
)
# Fail here rather than in a later cell when no solution was found.
if not res.success:
    raise RuntimeError("Set-cover optimisation failed: %s" % res.message)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [190]:
# Total of all cells in the rows (movies) whose solver value exceeds 0.5,
# i.e. the number of actor appearances across the selected movies.
movies_matrix_df.loc[res.x > 0.5].sum().sum()

23

In [191]:
# Show the rows (movies) whose solver value exceeds 0.5.
movies_matrix_df[res.x > 0.5]

Unnamed: 0_level_0,Unnamed: 1_level_0,Audrey Hepburn,Barbara Stanwyck,Bette Davis,Cary Grant,Clark Gable,Claudette Colbert,Elizabeth Taylor,Fred Astaire,Gary Cooper,Gene Kelly,Ginger Rogers,Grace Kelly,Gregory Peck,Greta Garbo,Henry Fonda,Humphrey Bogart,Ingrid Bergman,James Cagney,James Stewart,Joan Crawford,John Wayne,Judy Garland,Katharine Hepburn,Laurence Olivier,Mae West,Marilyn Monroe,Marlene Dietrich,Marlon Brando,Spencer Tracy
primaryTitle,startYear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
Boom Town,1940,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
Casablanca,1942,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
Funny Face,1957,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Grand Hotel,1932,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
High Noon,1952,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
How the West Was Won,1962,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
Reflections in a Golden Eye,1967,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
She Done Him Wrong,1933,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Stage Door,1937,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
The Prince and the Showgirl,1957,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
