# Intermovie - projet 6 
Utilisation de dataframes pour manipuler des données. Exportation en CSV pour chaque dataframe créé. 


## Import des librairies

In [3]:
# pandas supplies the DataFrame type used for every dataset in this notebook.
import pandas as pd

# Project-local timing helper; the load cell below brackets each read with
# timer.start() / timer.stop(label).
from my_timer import MyTimer
timer = MyTimer()

# IPython autoreload in mode 2: imported modules are reloaded before each cell
# executes, so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import et nettoyage des datasets

In [4]:
def _timed_read(path, index_col, usecols, label):
    """Read one tab-separated dataset and report how long the read took.

    Only the columns listed in `usecols` are read; `index_col` becomes the
    index. `label` is passed to timer.stop() once the read has finished.
    """
    timer.start()
    frame = pd.read_csv(path, sep='\t', index_col=index_col, usecols=usecols, encoding='utf-8')
    timer.stop(label)
    return frame


title_principals = _timed_read(
    "../movies_dataset/title.principals.tsv",
    'tconst',
    ['nconst', 'tconst', 'category'],
    "Chargement de title.principals.tsv :",
)

title_ratings = _timed_read(
    "../movies_dataset/title.ratings.tsv",
    'tconst',
    ['tconst', 'averageRating'],
    "Chargement de title.ratings.tsv :",
)

title_basics = _timed_read(
    "../movies_dataset/title.basics.tsv",
    'tconst',
    ['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'genres'],
    "Chargement de title.basics.tsv :",
)

title_akas = _timed_read(
    "../movies_dataset/title.akas.tsv",
    'titleId',
    ['titleId', 'region'],
    "Chargement title.akas.tsv :",
)

name_basics = _timed_read(
    "../movies_dataset/name.basics.tsv",
    'nconst',
    ['nconst', 'primaryName', 'primaryProfession'],
    "Chargement name.basics.tsv :",
)


Chargement de title.principals.tsv :
Elapsed time: 77.3380 seconds
Chargement de title.ratings.tsv :
Elapsed time: 2.0967 seconds
Chargement de title.basics.tsv :
Elapsed time: 58.4367 seconds
Chargement title.akas.tsv :
Elapsed time: 26.3381 seconds
Chargement name.basics.tsv :
Elapsed time: 31.9359 seconds


In [5]:
# NOTE(review): every frame below is overwritten in place and loses the column it
# is filtered on, so running this cell a second time without reloading the data
# raises KeyError (e.g. on 'category' or 'region').

# title.principals: complete rows only, acting credits only, reduced to the person id.
title_principals = title_principals.dropna()
acting_credit = title_principals['category'].str.contains("actor|actress", regex=True)
title_principals = title_principals.loc[acting_credit, ['nconst']]

# title.ratings: complete rows only, rating column only.
title_ratings = title_ratings.dropna()[['averageRating']]

# title.basics: discard rows whose genres field is the literal '\N' string, then
# incomplete rows, then anything whose titleType is not movie / tvMovie.
has_genre = title_basics.genres != '\\N'
title_basics = title_basics[has_genre].dropna()
is_film = title_basics.titleType.str.contains('movie|tvMovie', regex=True)
title_basics = title_basics[is_film]

# title.akas: keep the ids having at least one entry whose region contains 'FR'.
# The result is a column-less frame whose index holds each such id once.
title_akas = title_akas.dropna()
french_entry = title_akas['region'].str.contains('FR')
title_akas = pd.DataFrame(index=title_akas[french_entry].index.drop_duplicates())

# name.basics: complete rows only, people whose professions mention acting,
# reduced to the display name.
name_basics = name_basics.dropna()
acting_profession = name_basics.primaryProfession.str.contains('actor|actress', regex=True)
name_basics = name_basics.loc[acting_profession, ['primaryName']]


## 1. Sélection de la liste des acteurs par film

In [6]:
# Attach each acting credit's name, then the title of the film it belongs to.
# Both merges are left joins; the dropna() that follows keeps only credits whose
# person exists in name_basics and whose title exists in title_basics.
actors_movies = title_principals.merge(name_basics, right_index=True, left_on=['nconst'], how='left')
actors_movies = actors_movies.merge(title_basics[['primaryTitle']], right_index=True, left_on=['tconst'], how='left')

actors_movies = actors_movies.dropna()
actors_movies = actors_movies[['primaryTitle', 'primaryName']]

# Group on the title id (the 'tconst' index level) together with the title text.
# A title string is not a unique key: grouping on primaryTitle alone would
# concatenate the casts of different films that share a name into one row.
# The id level is dropped afterwards so the exported layout stays
# (primaryTitle, primaryName), and rows are ordered by title as before.
actors_movies = (
    actors_movies
    .groupby(['tconst', 'primaryTitle'])
    .agg({'primaryName': ', '.join})
    .reset_index(level=0, drop=True)
    .sort_index()
)

# Export to CSV
actors_movies.to_csv('actors_movies.csv')

## 2. La liste des films américains (en gardant leur nom en français) et leur note moyenne

In [7]:
# Start from the ids kept in title_akas, attach the original title, then the
# average rating. Each left merge followed by dropna() discards the ids that are
# missing from the right-hand table.
# NOTE(review): nothing in this cell restricts the list to American productions,
# and the exported title is originalTitle rather than a French title — confirm
# against the section heading.
with_titles = title_akas.merge(
    title_basics[['originalTitle']],
    how='left',
    left_index=True,
    right_index=True,
).dropna()

american_movies = with_titles.merge(
    title_ratings,
    how='left',
    left_index=True,
    right_index=True,
).dropna()

# Export to CSV (index=False: the title id index is not written):
american_movies.to_csv('american_movies.csv', index=False)


## 3. La note moyenne des différents genres

In [8]:
# Pair every title's genre string with its rating; titles without a rating are
# removed by the dropna().
temporary = title_basics[['genres']].merge(title_ratings, how='left', left_index=True, right_index=True)
temporary = temporary.dropna()

# Turn the comma-separated genre string into one row per (title, genre).
# split(expand=True) pads shorter lists with missing values and stack() produces a
# Series indexed by (tconst, position). The position level is dropped explicitly
# by number: passing ['tconst', 0] to reset_index would resolve both entries to
# level 0 and only yield the wanted columns as a side effect.
mean_genre = (
    temporary['genres']
    .str.split(',', expand=True)
    .stack()
    .dropna()  # explicit, in case stack() keeps the padding values
    .reset_index(level=1, drop=True)
    .to_frame('genre')
)

temporary = temporary.drop(columns=['genres'])

# Bring the rating back next to each (title, genre) row.
mean_genre = mean_genre.merge(temporary, how='left', left_index=True, right_index=True)
# Single guard against the literal '\N' placeholder; it covers both a whole-field
# '\N' and a '\N' element inside a list.
mean_genre = mean_genre[mean_genre.genre != '\\N']

mean_genre = mean_genre.groupby('genre').mean()
mean_genre = mean_genre.sort_values(by=['averageRating'], ascending=False)

# Format after sorting: this turns the column into strings with two decimals.
mean_genre['averageRating'] = mean_genre['averageRating'].map('{:,.2f}'.format)

# Export to CSV:
mean_genre.to_csv('mean_genre.csv')

## 4. La note moyenne de chaque acteur par rapport aux films dans lesquels il apparaît

In [9]:
# Restrict the acting credits to titles present in title_basics. The left merge
# leaves titleType missing for every other title, so dropna() must run BEFORE the
# helper column is dropped; in the opposite order nothing is filtered out.
actors_ranking = (
    title_principals[['nconst']]
    .merge(title_basics[['titleType']], left_index=True, right_index=True, how='left')
    .dropna()
    .drop(columns='titleType')
)

# Attach the person's name; credits whose person is absent from name_basics drop out.
actors_ranking = actors_ranking.merge(name_basics, left_on='nconst', right_index=True, how='left').dropna()

# Attach the title's rating (both frames are indexed by title id); unrated titles drop out.
actors_ranking = actors_ranking.merge(title_ratings, left_index=True, right_index=True, how='left').dropna()

# Average per person. Grouping on the person id as well as the name keeps
# different people who share a name from being averaged together; the id level is
# then dropped so the exported layout stays (primaryName, averageRating).
actors_ranking = (
    actors_ranking
    .groupby(['nconst', 'primaryName'])
    .agg({'averageRating': 'mean'})
    .reset_index(level=0, drop=True)
    .sort_index()
)

# Format as strings with two decimals for the export.
actors_ranking['averageRating'] = actors_ranking['averageRating'].map('{:,.2f}'.format)

# Export to CSV:
actors_ranking.to_csv('rating_actors.csv')