In [2]:
import os
import json
import pandas as pd
import pandas as pd

In [9]:
json_path = '../../data/dataset/movienet/images.json'

In [10]:
with open(json_path, 'r') as fp:
    img_meta_data = json.load(fp)

In [14]:
df = pd.DataFrame(img_meta_data)

In [32]:
df['num_people_image'] = df.cast.map(len)

In [81]:
df.type.value_counts()

still_frame          1391143
publicity            1066695
event                 354859
poster                169417
behind_the_scenes      97626
product                41333
production_art         34326
unknown                  113
user_avatar               20
Name: type, dtype: int64

# Let's prioritize the images

## Poster images of a single persone

In [75]:
df1 = df[(df.type == 'poster') & (df.num_people_image == 1)]

In [76]:
df1 = df1[['cast', 'img']].explode(['cast']).drop_duplicates('cast').set_index('cast').sort_index()
df1.columns = ['img_id']
df1['img_id'] = df1['img_id'].str.split('.jpg').str[0]

In [78]:
df1.to_csv('imdb_to_movinet.csv')

In [79]:
df1.head()

Unnamed: 0_level_0,img_id
cast,Unnamed: 1_level_1
nm0000001,1402/rm3023015424
nm0000002,1759/rm430181120
nm0000003,1905/rm4250693888
nm0000004,1933/rm232917504
nm0000006,1482/rm1706780928


## Product images of a single person

In [85]:
df2 = df[(df.type == 'product') & (df.num_people_image == 1)]
df2 = df2[['cast', 'img']].explode(['cast']).drop_duplicates('cast').set_index('cast').sort_index()

In [93]:
df2 = df2.loc[df2.index.difference(df1.index)]

In [95]:
df2.columns = ['img_id']
df2['img_id'] = df2['img_id'].str.split('.jpg').str[0]

In [100]:
df_out = pd.concat([df1, df2], axis=0).sort_index()

In [102]:
df_out.to_csv('imdb_to_movinet.csv')

## Publicity images of a single person

In [111]:
df_out = pd.read_csv('imdb_to_movinet.csv', index_col='cast')

In [114]:
df_out.head()

Unnamed: 0_level_0,img_id
cast,Unnamed: 1_level_1
nm0000001,1402/rm3023015424
nm0000002,1759/rm430181120
nm0000003,1905/rm4250693888
nm0000004,1933/rm232917504
nm0000005,3068/rm746130176


Unnamed: 0_level_0,img_id
cast,Unnamed: 1_level_1
nm0000001,1402/rm3023015424
nm0000002,1759/rm430181120
nm0000003,1905/rm4250693888
nm0000004,1933/rm232917504
nm0000005,3068/rm746130176


In [118]:
df3 = df[(df.type == 'publicity') & (df.num_people_image == 1)]
df3 = df3[['cast', 'img']].explode(['cast']).drop_duplicates('cast').set_index('cast').sort_index()
df3.shape

(84345, 1)

In [123]:
df3 = df3.loc[df3.index.difference(df_out.index)]

In [125]:
df3.shape

(72666, 1)

In [126]:
df3.columns = ['img_id']
df3['img_id'] = df3['img_id'].str.split('.jpg').str[0]

In [127]:
df_out = pd.concat([df_out, df3], axis=0).sort_index()
df_out.to_csv('imdb_to_movinet.csv')

In [130]:
df_out.to_csv('imdb_to_movinet.csv')

## Repeat it for the other categories

In [134]:
def add_more_ids(category, df_out_path='imdb_to_movinet.csv', output_path='imdb_to_movinet.csv'):
    df_out_ = pd.read_csv(df_out_path, index_col='cast')
    df_new = df[(df.type == category) & (df.num_people_image == 1)]
    df_new = df_new[['cast', 'img']].explode(['cast']).drop_duplicates('cast').set_index('cast').sort_index()
    print(df_new.shape)

    # Remove overlap
    df_new = df_new.loc[df_new.index.difference(df_out.index)]
    print(df_new.shape)

    # Rename and organize add_more_idsdf3.columns = ['img_id']
    df_new.columns = ['img_id']
    df_new['img_id'] = df_new['img_id'].str.split('.jpg').str[0]
    df_out_ = pd.concat([df_out_, df_new], axis=0).sort_index()
    print(f'new_number_of_rows: {df_out_.shape}')
    df_out_.to_csv(output_path)

In [132]:
add_more_ids(category='event', df_out_path='imdb_to_movinet.csv', output_path='imdb_to_movinet_2.csv')

(24547, 1)
(9590, 1)


In [133]:
add_more_ids(category='still_frame', df_out_path='imdb_to_movinet_2.csv', output_path='imdb_to_movinet_2.csv')

(77628, 1)
(36414, 1)


In [135]:
add_more_ids(category='behind_the_scenes', df_out_path='imdb_to_movinet_2.csv', output_path='imdb_to_movinet_2.csv')

(16464, 1)
(6643, 1)
new_number_of_rows: (148464, 1)


In [136]:
add_more_ids(category='production_art', df_out_path='imdb_to_movinet_2.csv', output_path='imdb_to_movinet_2.csv')

(5747, 1)
(1347, 1)
new_number_of_rows: (149811, 1)


# Check total number of entities left

In [182]:
df_out = pd.read_csv('imdb_to_movinet_2.csv', index_col='cast')

In [183]:
df_out

Unnamed: 0_level_0,img_id
cast,Unnamed: 1_level_1
nm0000001,1402/rm3023015424
nm0000002,1759/rm430181120
nm0000003,1905/rm4250693888
nm0000004,1933/rm232917504
nm0000005,3068/rm746130176
...,...
nm9989523,2731/rm2416002048
nm9990433,3661/rm1863679232
nm9992386,2714/rm1937731840
nm9993278,1682/rm1570599936


In [184]:
total_imdb_ids = pd.Index(df.cast.explode('cast').unique())

In [185]:
ids_left = total_imdb_ids.difference(df_out.index)

In [187]:
df_left = df[['cast', 'img', 'num_people_image']].explode('cast').set_index('cast').sort_index().loc[ids_left]

In [188]:
imgs_left_df = df_left.reset_index().sort_values(['index', 'num_people_image']).drop_duplicates('index')

In [189]:
imgs_left_df.drop('num_people_image', axis=1, inplace=True)

In [190]:
imgs_left_df.columns = ['cast', 'img_id']

In [191]:
imgs_left_df

Unnamed: 0,cast,img_id
1,nm0000055,0762/rm1617664000.jpg
2,nm0000065,2246/rm3357233152.jpg
4,nm0000085,0563/rm120690688.jpg
5,nm0000390,2037/rm442014464.jpg
7,nm0000540,2090/rm3340849664.jpg
...,...,...
303984,nm9993276,1510/rm523008768.jpg
303985,nm9993277,2339/rm774194432.jpg
303986,nm9993571,3903/rm1495494656.jpg
303988,nm9993616,3590/rm3214113792.jpg


In [192]:
imgs_left_df['img_id'] = imgs_left_df['img_id'].str.split('.jpg').str[0]

In [193]:
df_out = pd.concat([df_out, imgs_left_df.set_index('cast')], axis=0).sort_index()

In [202]:
df_out.reset_index().drop_duplicates('cast').set_index('cast').sort_index().to_csv('imdb_to_movinet_final.csv')

In [194]:
len(set(df_out.index.get_level_values(0)))

291742

In [195]:
len(set(df_out.index.get_level_values(0)))

291742

Dump to json

In [205]:
df_out.img_id.to_dict()

{'nm0000001': '1402/rm3023015424',
 'nm0000002': '1759/rm430181120',
 'nm0000003': '1905/rm4250693888',
 'nm0000004': '1933/rm232917504',
 'nm0000005': '3068/rm746130176',
 'nm0000006': '1482/rm1706780928',
 'nm0000007': '2227/rm64341504',
 'nm0000008': '1996/rm1206482688',
 'nm0000009': '1182/rm1325218816',
 'nm0000010': '1623/rm955532800',
 'nm0000011': '2200/rm1965298944',
 'nm0000012': '2681/rm487323904',
 'nm0000013': '1154/rm1889614592',
 'nm0000014': '2339/rm3770354176',
 'nm0000015': '3185/rm2650344960',
 'nm0000017': '2629/rm3058499072',
 'nm0000018': '3869/rm1403519488',
 'nm0000019': '0002/rm225638912',
 'nm0000020': '1451/rm64349184',
 'nm0000021': '2009/rm3192075776',
 'nm0000022': '2419/rm887452416',
 'nm0000023': '0497/rm259522304',
 'nm0000024': '1543/rm447749632',
 'nm0000025': '2616/rm2263023616',
 'nm0000026': '0865/rm2154371584',
 'nm0000027': '3585/rm3637217792',
 'nm0000028': '1288/rm322393856',
 'nm0000029': '2702/rm2814257920',
 'nm0000030': '3723/rm1157143040',

In [206]:
import json
with open('imdb2movienet.json', 'w') as fp:
    json.dump(df_out.img_id.to_dict(), fp)

In [4]:
a = json.load(open('imdb2movienet.json', 'r'))

In [5]:
a['nm0000770']

'3710/rm4122590464'