In [30]:
import ast
import json

import pandas as pd

# Load the raw TMDB credits export and immediately discard the leftover
# "Unnamed: 0" index column that was written into the CSV.
dfc = (
    pd.read_csv("../data/raw/tmdb_6000/tmdb_6000_movie_credits.csv")
    .drop(columns=["Unnamed: 0"])
)

# Preview the first two rows.
dfc.head(2)


Unnamed: 0,tmdbId,cast,crew
0,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [32]:
def to_list_of_dicts(x):
    """
    Parse one raw credits cell into a list of dicts.

    The raw ``cast`` / ``crew`` cells are stringified lists that appear in
    two notations in this dataset: JSON (double-quoted keys) and Python
    ``repr`` (single-quoted keys, ``False`` / ``None`` literals). JSON is
    tried first so that ``null`` / ``true`` / ``false`` are understood;
    ``ast.literal_eval`` is the fallback for the Python notation.

    Parameters
    ----------
    x : str, list, or a missing value (NaN / None)
        The raw cell. An already-parsed list is passed through (filtered).

    Returns
    -------
    list of dict
        The dict entries of the parsed list, in their original order.
        Returns ``[]`` for missing values, unparsable text, or text that
        does not evaluate to a list. Never raises for bad input.
    """
    # Already-parsed input: just drop any non-dict entries.
    if isinstance(x, list):
        return [d for d in x if isinstance(d, dict)]

    # NaN, None and every other non-text value carry nothing to parse.
    if not isinstance(x, str):
        return []

    try:
        data = json.loads(x)
    except (ValueError, RecursionError):
        # Not valid JSON (json.JSONDecodeError is a ValueError subclass);
        # try the Python-literal notation instead.
        try:
            data = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the errors literal_eval raises on malformed input.
            return []

    if not isinstance(data, list):
        return []
    return [d for d in data if isinstance(d, dict)]

# Parse both raw credit columns into "<column>_list" columns whose cells are
# lists of dicts.
for raw_col in ("cast", "crew"):
    dfc[f"{raw_col}_list"] = dfc[raw_col].apply(to_list_of_dicts)



In [33]:

# Spot-check the parsed lists for the first movie (index label 0).
dfc.loc[0, ["cast_list", "crew_list"]]


cast_list    [{'cast_id': 242, 'character': 'Jake Sully', '...
crew_list    [{'credit_id': '52fe48009251416c750aca23', 'de...
Name: 0, dtype: object

In [34]:
def extract_top_cast(cast_list, n=5):
    """
    Return the names of the first ``n`` cast members by billing order.

    Parameters
    ----------
    cast_list : list of dict
        Cast entries; each may carry an ``"order"`` (billing position,
        lower comes first) and a ``"name"``. Anything that is not a list
        yields ``[]``; non-dict entries inside the list are ignored.
    n : int, default 5
        Maximum number of names to return.

    Returns
    -------
    list of str
        Up to ``n`` non-empty names. Entries with a numeric ``"order"`` come
        first (ascending); entries whose order is missing or not a number
        follow, keeping their original relative position.
    """
    if not isinstance(cast_list, list):
        return []

    members = [m for m in cast_list if isinstance(m, dict)]

    def billing_key(member):
        order = member.get("order")
        is_number = isinstance(order, (int, float)) and not isinstance(order, bool)
        # `order == order` is False only for NaN, which cannot be ranked.
        if is_number and order == order:
            return (0, order)
        # Unranked entries sort after every ranked one. One bad "order"
        # must not disable sorting for the whole list.
        return (1, 0)

    # sorted() is stable, so ties keep their original relative position.
    ranked = sorted(members, key=billing_key)

    # Keep only real, non-blank names: a None name would later break
    # ", ".join(...) when the list is flattened to text.
    names = [
        m["name"]
        for m in ranked
        if isinstance(m.get("name"), str) and m["name"].strip()
    ]
    return names[:n]

# Derive the top-billed cast names for every movie (default n=5).
dfc["top_cast"] = dfc["cast_list"].apply(extract_top_cast)

# Spot-check the result next to the movie id.
dfc.head(5)[["tmdbId", "top_cast"]]


Unnamed: 0,tmdbId,top_cast
0,19995,"[Sam Worthington, Zoe Saldana, Sigourney Weave..."
1,285,"[Johnny Depp, Orlando Bloom, Keira Knightley, ..."
2,206647,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R..."
3,49026,"[Christian Bale, Michael Caine, Gary Oldman, A..."
4,49529,"[Taylor Kitsch, Lynn Collins, Samantha Morton,..."


In [35]:
def extract_by_job(crew_list, jobs):
    """
    Return the names of crew members whose job matches one of ``jobs``.

    Parameters
    ----------
    crew_list : list of dict
        Crew entries; each may carry a ``"job"`` and a ``"name"``. Anything
        that is not a list yields ``[]``; non-dict entries are ignored.
    jobs : iterable of str, or a single str
        Job titles to keep. Matching is case-insensitive and exact
        (no substring matching).

    Returns
    -------
    list of str
        Matching names in order of first appearance, without duplicates.
        Entries with a missing / non-text job or an empty / non-text name
        are skipped.
    """
    if not isinstance(crew_list, list):
        return []

    # A bare string would otherwise be iterated character by character.
    if isinstance(jobs, str):
        jobs = [jobs]
    # Normalise the wanted titles once instead of once per crew member.
    wanted = {j.lower() for j in jobs}

    matches = []
    for member in crew_list:
        if not isinstance(member, dict):
            continue
        job = member.get("job")
        name = member.get("name")
        # A key that is present but None is not covered by .get()'s default,
        # so the types are checked explicitly.
        if not isinstance(job, str) or not isinstance(name, str) or not name:
            continue
        if job.lower() in wanted:
            matches.append(name)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(matches))


In [36]:
# Output column -> crew "job" titles that count towards it.
for role_col, role_jobs in {
    "directors": ["Director"],
    "writers": ["Writer", "Screenplay", "Screenplay By", "Author", "Story"],
    "producers": ["Producer"],
}.items():
    # Bind the job list as a default so each lambda keeps its own titles.
    dfc[role_col] = dfc["crew_list"].apply(
        lambda c, jobs=role_jobs: extract_by_job(c, jobs)
    )


In [37]:
# Preview the frame with the new directors / writers / producers columns.
# A cell renders only its last expression, so a single preview is kept here
# rather than an extra expression whose result would be discarded.
dfc.head()

Unnamed: 0,tmdbId,cast,crew,cast_list,crew_list,top_cast,directors,writers,producers
0,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],[James Cameron],"[James Cameron, Jon Landau]"
1,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski],"[Ted Elliott, Terry Rossio]","[Jerry Bruckheimer, Eric McLeod, Chad Oman, Pe..."
2,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes],"[John Logan, Robert Wade, Neal Purvis, Jez But...","[Barbara Broccoli, Michael G. Wilson]"
3,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan],"[Christopher Nolan, Jonathan Nolan, David S. G...","[Charles Roven, Christopher Nolan, Emma Thomas]"
4,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton],"[Andrew Stanton, Michael Chabon, Mark Andrews]","[Colin Wilson, Jim Morris, Lindsey Collins]"


In [38]:
# Flatten each list column into a comma-separated "<column>_str" text column.
# Cells that are not lists become "". Non-string entries inside a list
# (e.g. a None name) are skipped, because str.join raises TypeError on them.
for col in ["top_cast", "directors", "writers", "producers"]:
    dfc[col + "_str"] = dfc[col].apply(
        lambda x: ", ".join(v for v in x if isinstance(v, str)) if isinstance(x, list) else ""
    )


In [40]:
# List every column now on dfc: the id, the raw text columns, the parsed
# lists, the extracted name lists and their "_str" text versions.
dfc.columns

Index(['tmdbId', 'cast', 'crew', 'cast_list', 'crew_list', 'top_cast',
       'directors', 'writers', 'producers', 'top_cast_str', 'directors_str',
       'writers_str', 'producers_str'],
      dtype='object')

In [41]:
# Display the full working frame; the rendered output elides the middle rows.
dfc

Unnamed: 0,tmdbId,cast,crew,cast_list,crew_list,top_cast,directors,writers,producers,top_cast_str,directors_str,writers_str,producers_str
0,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'credit_id': '52fe48009251416c750aca23', 'de...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],[James Cameron],"[James Cameron, Jon Landau]","Sam Worthington, Zoe Saldana, Sigourney Weaver...",James Cameron,James Cameron,"James Cameron, Jon Landau"
1,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'credit_id': '52fe4232c3a36847f800b579', 'de...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski],"[Ted Elliott, Terry Rossio]","[Jerry Bruckheimer, Eric McLeod, Chad Oman, Pe...","Johnny Depp, Orlando Bloom, Keira Knightley, S...",Gore Verbinski,"Ted Elliott, Terry Rossio","Jerry Bruckheimer, Eric McLeod, Chad Oman, Pet..."
2,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '54805967c3a36829b5002c41', 'de...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes],"[John Logan, Robert Wade, Neal Purvis, Jez But...","[Barbara Broccoli, Michael G. Wilson]","Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...",Sam Mendes,"John Logan, Robert Wade, Neal Purvis, Jez Butt...","Barbara Broccoli, Michael G. Wilson"
3,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{'credit_id': '52fe4781c3a36847f81398c3', 'de...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan],"[Christopher Nolan, Jonathan Nolan, David S. G...","[Charles Roven, Christopher Nolan, Emma Thomas]","Christian Bale, Michael Caine, Gary Oldman, An...",Christopher Nolan,"Christopher Nolan, Jonathan Nolan, David S. Goyer","Charles Roven, Christopher Nolan, Emma Thomas"
4,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{'cast_id': 5, 'character': 'John Carter', 'c...","[{'credit_id': '52fe479ac3a36847f813eaa3', 'de...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton],"[Andrew Stanton, Michael Chabon, Mark Andrews]","[Colin Wilson, Jim Morris, Lindsey Collins]","Taylor Kitsch, Lynn Collins, Samantha Morton, ...",Andrew Stanton,"Andrew Stanton, Michael Chabon, Mark Andrews","Colin Wilson, Jim Morris, Lindsey Collins"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5793,626332,"[{'adult': False, 'gender': 2, 'id': 66525, 'k...","[{'adult': False, 'gender': 2, 'id': 52035, 'k...","[{'adult': False, 'gender': 2, 'id': 66525, 'k...","[{'adult': False, 'gender': 2, 'id': 52035, 'k...","[Jesse Garcia, Annie Gonzalez, Emilio Rivera, ...",[Eva Longoria],"[Lewis Colick, Linda Yvette Chavez]",[DeVon Franklin],"Jesse Garcia, Annie Gonzalez, Emilio Rivera, V...",Eva Longoria,"Lewis Colick, Linda Yvette Chavez",DeVon Franklin
5794,1094319,"[{'adult': False, 'gender': 2, 'id': 72440, 'k...","[{'adult': False, 'gender': 2, 'id': 59502, 'k...","[{'adult': False, 'gender': 2, 'id': 72440, 'k...","[{'adult': False, 'gender': 2, 'id': 59502, 'k...","[Brendan Fehr, Dolph Lundgren, Luke Wilson, Ni...",[Shane Dax Taylor],"[Shane Dax Taylor, Daniel Zirilli, C. Alec Ros...",[],"Brendan Fehr, Dolph Lundgren, Luke Wilson, Nic...",Shane Dax Taylor,"Shane Dax Taylor, Daniel Zirilli, C. Alec Rossel",
5795,998623,"[{'adult': False, 'gender': 0, 'id': 2791253, ...","[{'adult': False, 'gender': 2, 'id': 216023, '...","[{'adult': False, 'gender': 0, 'id': 2791253, ...","[{'adult': False, 'gender': 2, 'id': 216023, '...","[Orlando Vauthier, Axel Granberger, Camille Lé...",[Olivier Abbou],"[Olivier Abbou, Thibault Lang-Willar, Mathilde...","[Bruno Merle, Olivier Abbou, Noor ""Rize"" Sadar...","Orlando Vauthier, Axel Granberger, Camille Léo...",Olivier Abbou,"Olivier Abbou, Thibault Lang-Willar, Mathilde ...","Bruno Merle, Olivier Abbou, Noor ""Rize"" Sadar,..."
5796,1155770,"[{'adult': False, 'gender': 2, 'id': 20422, 'k...","[{'adult': False, 'gender': 2, 'id': 1551, 'kn...","[{'adult': False, 'gender': 2, 'id': 20422, 'k...","[{'adult': False, 'gender': 2, 'id': 1551, 'kn...","[John le Carré, Jake Dove, Charlotte Hamblin, ...",[Errol Morris],[Errol Morris],"[Errol Morris, Stephen Cornwell, Dominic Cross...","John le Carré, Jake Dove, Charlotte Hamblin, G...",Errol Morris,Errol Morris,"Errol Morris, Stephen Cornwell, Dominic Crossl..."


In [42]:
# Build the export table: the movie id plus the comma-joined text version of
# each credit column, exposed under the plain (un-suffixed) column name.
final_df = dfc[
    ["tmdbId", "top_cast_str", "directors_str", "writers_str", "producers_str"]
].rename(
    columns={
        "top_cast_str": "top_cast",
        "directors_str": "directors",
        "writers_str": "writers",
        "producers_str": "producers",
    }
)

In [43]:
# Preview the first rows of the export table before writing it out.
final_df.head()

Unnamed: 0,tmdbId,top_cast,directors,writers,producers
0,19995,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",James Cameron,James Cameron,"James Cameron, Jon Landau"
1,285,"Johnny Depp, Orlando Bloom, Keira Knightley, S...",Gore Verbinski,"Ted Elliott, Terry Rossio","Jerry Bruckheimer, Eric McLeod, Chad Oman, Pet..."
2,206647,"Daniel Craig, Christoph Waltz, Léa Seydoux, Ra...",Sam Mendes,"John Logan, Robert Wade, Neal Purvis, Jez Butt...","Barbara Broccoli, Michael G. Wilson"
3,49026,"Christian Bale, Michael Caine, Gary Oldman, An...",Christopher Nolan,"Christopher Nolan, Jonathan Nolan, David S. Goyer","Charles Roven, Christopher Nolan, Emma Thomas"
4,49529,"Taylor Kitsch, Lynn Collins, Samantha Morton, ...",Andrew Stanton,"Andrew Stanton, Michael Chabon, Mark Andrews","Colin Wilson, Jim Morris, Lindsey Collins"


In [45]:
# Save the final clean credits dataset.
# index=False keeps the positional index out of the file, so no extra
# unnamed index column is written.
# NOTE(review): empty strings (e.g. a movie with no producers) are written as
# empty fields, which pd.read_csv reads back as NaN by default; confirm that
# downstream readers handle that.
final_df.to_csv("../data/processed/credits_clean.csv", index=False)

In [46]:
# DataFrame.size is the total number of cells (rows x columns), not the
# number of rows; use len(final_df) or final_df.shape for the row count.
final_df.size

28990