In [1]:
import pandas as pd

# Load IMDb title basics and principals data
title_basics = pd.read_csv("title.basics.tsv", sep="\t", low_memory=False)
title_principals = pd.read_csv("title.principals.tsv", sep="\t", low_memory=False)
name_basics = pd.read_csv("name.basics.tsv", sep="\t", low_memory=False)

# Filter for only movies
movies = title_basics[title_basics['titleType'] == 'movie']

# Merge title and principal data to get actor roles
movie_actors = pd.merge(title_principals, movies[['tconst', 'primaryTitle']], on='tconst')

# Merge with name_basics to get actor names
movie_actors = pd.merge(movie_actors, name_basics[['nconst', 'primaryName']], on='nconst')

# Filter necessary columns (movie name, actor name, and characters played)
movie_actor_roles = movie_actors[['primaryTitle', 'primaryName', 'characters']]

# Clean up the characters column and filter out rows with '\N'
movie_actor_roles['characters'] = movie_actor_roles['characters'].str.replace(r'[\"\[\]]', '', regex=True)
movie_actor_roles = movie_actor_roles[movie_actor_roles['characters'] != '\\N']
movie_actor_roles = movie_actor_roles[movie_actor_roles['characters'] != 'self']

# Generate sentences
movie_actor_roles['sentence'] = movie_actor_roles.apply(
    lambda row: f"In {row['primaryTitle']}, {row['primaryName']} played as ", axis=1)

# Use the characters as labels
movie_actor_roles['label'] = movie_actor_roles['characters']

# Save to CSV with labels
movie_actor_roles[['sentence', 'label']].to_csv('movie_actor_sentences_with_labels.csv', index=False)

# Print some examples
print(movie_actor_roles[['sentence', 'label']].head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_actor_roles['characters'] = movie_actor_roles['characters'].str.replace(r'[\"\[\]]', '', regex=True)


                                            sentence  \
0          In Miss Jerry, Blanche Bayliss played as    
1        In Miss Jerry, William Courtenay played as    
2           In Miss Jerry, Chauncey Depew played as    
7  In The Corbett-Fitzsimmons Fight, James J. Cor...   
8  In The Corbett-Fitzsimmons Fight, Bob Fitzsimm...   

                                               label  
0               Miss Geraldine Holbrook (Miss Jerry)  
1                                       Mr. Hamilton  
2  Chauncey Depew - the Director of the New York ...  
7                                               Self  
8                                               Self  


In [10]:
movie_actor_roles[:50]

Unnamed: 0,primaryTitle,primaryName,characters,sentence
0,Miss Jerry,Blanche Bayliss,Miss Geraldine Holbrook (Miss Jerry),"In Miss Jerry, Blanche Bayliss played as Miss ..."
1,Miss Jerry,William Courtenay,Mr. Hamilton,"In Miss Jerry, William Courtenay played as Mr...."
2,Miss Jerry,Chauncey Depew,Chauncey Depew - the Director of the New York ...,"In Miss Jerry, Chauncey Depew played as Chaunc..."
7,The Corbett-Fitzsimmons Fight,James J. Corbett,Self,"In The Corbett-Fitzsimmons Fight, James J. Cor..."
8,The Corbett-Fitzsimmons Fight,Bob Fitzsimmons,Self,"In The Corbett-Fitzsimmons Fight, Bob Fitzsimm..."
9,The Corbett-Fitzsimmons Fight,Billy Madden,Self - Sullivan's Manager,"In The Corbett-Fitzsimmons Fight, Billy Madden..."
10,The Corbett-Fitzsimmons Fight,George Siler,Self - Referee,"In The Corbett-Fitzsimmons Fight, George Siler..."
11,The Corbett-Fitzsimmons Fight,John L. Sullivan,Self - Master of Ceremonies,"In The Corbett-Fitzsimmons Fight, John L. Sull..."
22,The Story of the Kelly Gang,Elizabeth Tait,Kate Kelly,"In The Story of the Kelly Gang, Elizabeth Tait..."
23,The Story of the Kelly Gang,John Tait,School Master,"In The Story of the Kelly Gang, John Tait play..."


In [4]:
movie_actors

Unnamed: 0,tconst,ordering,nconst,category,job,characters,primaryTitle,primaryName
0,tt0000009,1,nm0063086,actress,\N,"[""Miss Geraldine Holbrook (Miss Jerry)""]",Miss Jerry,Blanche Bayliss
1,tt0000009,2,nm0183823,actor,\N,"[""Mr. Hamilton""]",Miss Jerry,William Courtenay
2,tt0000009,3,nm1309758,actor,\N,"[""Chauncey Depew - the Director of the New Yor...",Miss Jerry,Chauncey Depew
3,tt0000009,4,nm0085156,director,\N,\N,Miss Jerry,Alexander Black
4,tt0000009,5,nm0085156,writer,\N,\N,Miss Jerry,Alexander Black
...,...,...,...,...,...,...,...,...
7976167,tt9916754,16,nm10538638,cinematographer,cinematographer,\N,Chico Albuquerque - Revelações,Wellington Barros
7976168,tt9916754,17,nm8349149,cinematographer,director of photography,\N,Chico Albuquerque - Revelações,Vinicius Augusto Bozzo
7976169,tt9916754,18,nm10538635,cinematographer,cinematographer,\N,Chico Albuquerque - Revelações,Odério Dias
7976170,tt9916754,19,nm8349149,editor,supervising editor,\N,Chico Albuquerque - Revelações,Vinicius Augusto Bozzo
