## Structure des fichiers tsv

Remarque : la colonne knownForTitles (name.basics) contient une liste d'identifiants tconst.

- name.basics :         nconst / primaryName / birthYear / deathYear / primaryProfession / knownForTitles
- title.akas :          titleId / ordering / title / region / language / types / attributes / isOriginalTitle
- title.basics :        tconst / titleType / primaryTitle / originalTitle / isAdult / startYear / endYear / runtimeMinutes / genres
- title.principals :    tconst / ordering / nconst / category / job / characters
- title.ratings :       tconst / averageRating / numVotes


## Nombre de lignes par échantillon

Nous utiliserons, pour chaque dataframe, un maximum de 1 000 lignes afin d'obtenir un échantillon représentatif tout en évitant la latence lors du traitement des données.

Projet Intermovie
Ce notebook analyse un dataset de films afin d'en extraire plusieurs informations :

La liste des acteurs par film.

La liste des films américains (en gardant leur nom en français) et leur note moyenne.

Les notes moyennes des différents genres.

La note moyenne de chaque acteur par rapport aux films dans lesquels il apparaît.

In [2]:
# IPython extension that reloads modules before user-entered code is executed.
%load_ext autoreload
%autoreload 2

# Import des librairies.
import pandas as pd
import numpy as np

In [16]:
# Load the name.basics table (tab-separated file) into a DataFrame.
df = pd.read_csv("./data/name.basics.tsv", sep="\t")

In [17]:
# Preview the first five rows of the people table.
df.head(n=5)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0050419,tt0053137,tt0072308,tt0043044"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0071877,tt0117057,tt0038355,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0054452,tt0049189,tt0059956,tt0057345"
3,nm0000004,John Belushi,1949,1982,"actor,writer,soundtrack","tt0077975,tt0072562,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0069467,tt0050976,tt0083922,tt0050986"


In [24]:
# Remove the columns that are not needed downstream, then discard rows with
# missing values and exact duplicate rows.
df = (
    df.drop(columns=['birthYear', 'deathYear', 'primaryProfession', 'knownForTitles'])
    .dropna()
    .drop_duplicates()
)

In [25]:
# Preview the first five rows after cleanup.
df.head(n=5)

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman


In [26]:
# Save the cleaned people table; the row index is written as the first column.
df.to_csv('csvactorsname.csv', index=True)

In [10]:
# Load the title.principals table (tab-separated file) into a DataFrame.
df2 = pd.read_csv("./data/title.principals.tsv", sep="\t")

In [11]:
# Preview the first five rows of the principals table.
df2.head(n=5)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Herself""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [13]:
# Discard rows with missing values, then keep only the rows whose category
# matches actor / actress / self, restricted to the three columns used later.
df2 = df2.dropna()
df2 = df2.loc[
    df2['category'].str.contains('actor|actress|self', regex=True),
    ["tconst", "nconst", "category"],
]

In [14]:
# Preview the first five rows after filtering on category.
df2.head(n=5)

Unnamed: 0,tconst,nconst,category
0,tt0000001,nm1588970,self
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor


In [27]:
# Save the title/person pairs; the row index is written as the first column.
df2.to_csv('csvtconstnconst.csv', index=True)

In [35]:
# Load the title.basics table (tab-separated file) into a DataFrame.
df3 = pd.read_csv("./data/title.basics.tsv", sep="\t")


In [41]:
# Preview the first five rows of the titles table.
df3.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [42]:
# Discard rows with missing values, then keep only the rows whose titleType
# matches 'movie', restricted to the three columns used later.
df3 = df3.dropna()
df3 = df3.loc[
    df3['titleType'].str.contains('movie', regex=True),
    ["tconst", "titleType", "originalTitle"],
]

In [43]:
# Preview the first five rows after filtering on titleType.
df3.head(n=5)

Unnamed: 0,tconst,titleType,originalTitle
8,tt0000009,movie,Miss Jerry
145,tt0000147,movie,The Corbett-Fitzsimmons Fight
332,tt0000335,movie,Soldiers of the Cross
499,tt0000502,movie,Bohemios
571,tt0000574,movie,The Story of the Kelly Gang


In [44]:
# Discard rows with missing values and exact duplicate rows in one pass.
df3 = df3.dropna().drop_duplicates()

In [45]:
# Print the (truncated) text rendering of the filtered titles table.
print(df3)

tconst titleType  \
8        tt0000009     movie   
145      tt0000147     movie   
332      tt0000335     movie   
499      tt0000502     movie   
571      tt0000574     movie   
...            ...       ...   
6321191  tt9916622     movie   
6321218  tt9916680     movie   
6321230  tt9916706     movie   
6321241  tt9916730     movie   
6321252  tt9916754     movie   

                                             originalTitle  
8                                               Miss Jerry  
145                          The Corbett-Fitzsimmons Fight  
332                                  Soldiers of the Cross  
499                                               Bohemios  
571                            The Story of the Kelly Gang  
...                                                    ...  
6321191        Rodolpho Teóphilo - O Legado de um Pioneiro  
6321218  De la ilusión al desconcierto: cine colombiano...  
6321230                                    Dankyavar Danka  
6321241          

In [32]:
# Attach to every person the titles they appear in; a left join keeps people
# without any matching row in df2.
dfmergeone = df.merge(df2, on='nconst', how='left')

In [47]:
# Preview the first five rows of the person/title join.
dfmergeone.head(n=5)

Unnamed: 0,nconst,primaryName,tconst,category
0,nm0000001,Fred Astaire,tt0025164,actor
1,nm0000001,Fred Astaire,tt0026942,actor
2,nm0000001,Fred Astaire,tt0027125,actor
3,nm0000001,Fred Astaire,tt0027630,actor
4,nm0000001,Fred Astaire,tt0028333,actor


In [48]:
# Add the title information from df3; a left join keeps rows whose tconst has
# no match in df3.
dfmergetwo = dfmergeone.merge(df3, on='tconst', how='left')

In [53]:
# Display the merged table without the 'category' and 'titleType' columns.
# The result is not assigned, so dfmergetwo itself still contains them.
dfmergetwo.drop(columns=['category', 'titleType'])

Unnamed: 0,nconst,primaryName,tconst,originalTitle
0,nm0000001,Fred Astaire,tt0025164,The Gay Divorcee
1,nm0000001,Fred Astaire,tt0026942,Roberta
2,nm0000001,Fred Astaire,tt0027125,Top Hat
3,nm0000001,Fred Astaire,tt0027630,Follow the Fleet
4,nm0000001,Fred Astaire,tt0028333,Swing Time
...,...,...,...,...
24905987,nm9993709,Lu Bevins,tt10484296,
24905988,nm9993713,Sambit Mishra,,
24905989,nm9993714,Romeo del Rosario,,
24905990,nm9993717,Harikrishnan Rajan,,


In [55]:
# Discard rows with any missing value (this removes the unmatched rows left by
# the left joins), then drop exact duplicate rows.
dfmergetwo = dfmergetwo.dropna().drop_duplicates()

In [57]:
# Print the (truncated) text rendering of the cleaned join.
print(dfmergetwo)

nconst                primaryName      tconst category titleType  \
0         nm0000001               Fred Astaire   tt0025164    actor     movie   
1         nm0000001               Fred Astaire   tt0026942    actor     movie   
2         nm0000001               Fred Astaire   tt0027125    actor     movie   
3         nm0000001               Fred Astaire   tt0027630    actor     movie   
4         nm0000001               Fred Astaire   tt0028333    actor     movie   
...             ...                        ...         ...      ...       ...   
24905929  nm9993636                Adam French   tt8983162    actor     movie   
24905932  nm9993636                Adam French   tt9668514    actor     movie   
24905943  nm9993650            Marcin Balcerak   tt8739208    actor     movie   
24905964  nm9993680  Christopher-Lawson Palmer  tt10427366    actor     movie   
24905965  nm9993680  Christopher-Lawson Palmer   tt8295580    actor     movie   

                originalTitle  
0       

In [58]:
# Build one row per title holding the comma-separated names of the people
# attached to it.
# Only the per-column function is passed to agg(): extra keyword arguments can
# be forwarded to that function, and str.join accepts none, so a stray
# `axis=0` raises TypeError on recent pandas versions.
# NOTE(review): grouping on 'originalTitle' merges distinct films that share
# the same title; grouping on 'tconst' would avoid that — confirm intended key.
df_final = dfmergetwo.groupby('originalTitle').agg({'primaryName': ','.join})


In [61]:
# Build one row per person name holding the comma-separated titles they
# appear in.
# Only the per-column function is passed to agg(): extra keyword arguments can
# be forwarded to that function, and str.join accepts none, so a stray
# `axis=0` raises TypeError on recent pandas versions.
# NOTE(review): grouping on 'primaryName' merges distinct people who share the
# same name; grouping on 'nconst' would avoid that — confirm intended key.
df_final2 = dfmergetwo.groupby('primaryName').agg({'originalTitle': ','.join})

In [62]:
# Export both aggregations to CSV.
# to_csv() returns None when it is given a file path, so its result is not
# assigned back: rebinding df_final / df_final2 to None would discard the
# DataFrames and make this cell fail if it were run a second time.
df_final.to_csv('ArtistesParFilm.csv')
df_final2.to_csv('FilmsParArtistes.csv')