# Intermovie - projet 6 
Utilisation de dataframes pour manipuler des données. Exportation en CSV pour chaque dataframe créé. 


## Import des librairies

In [2]:
import pandas as pd
import numpy as np

from MODULES.loader import IntermovieDataLoader
from MODULES.prediction import IntermoviePrediction
from MODULES.my_timer import MyTimer
timer = MyTimer()

%load_ext autoreload
%autoreload 2

## Paramètrage général

In [None]:
DATASET_PATH = '../datas/movies_dataset/'
CURATED_PATH = '../datas/CURATED/'
WORKS = 'WORKS/'
FORMATS = 'FORMATS/'
REGIONS = 'REGIONS/'
nb_votes = 500

data_loader = IntermovieDataLoader()

## Import et nettoyage des datasets

In [None]:
# Téléchargement et dezip du dataset
data_loader.ensure_data_loaded()

In [15]:
# Import des datasets originaux.
timer.start()
title_principals = pd.read_csv(DATASET_PATH + "title.principals.tsv", sep='\t', index_col='tconst', usecols=['nconst', 'tconst', 'category'], encoding='utf-8')
timer.stop("Title.principals.tsv importé.")

timer.start()
title_ratings = pd.read_csv(DATASET_PATH + "title.ratings.tsv", sep='\t', index_col='tconst', encoding='utf-8')
timer.stop("\nTitle.ratings.tsv importé.")

timer.start()
title_basics = pd.read_csv(DATASET_PATH + "title.basics.tsv", sep='\t', index_col='tconst', usecols=['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'genres'], encoding='utf-8')
timer.stop("\nTitle.basics.tsv importé.")

timer.start()
title_akas = pd.read_csv(DATASET_PATH + "title.akas.tsv", sep='\t', index_col='titleId', usecols=['titleId', 'title', 'region', 'isOriginalTitle'], encoding='utf-8')
timer.stop("\nTitle.akas.tsv importé.")

timer.start()
name_basics = pd.read_csv(DATASET_PATH + "name.basics.tsv", sep='\t', index_col='nconst', usecols=['nconst', 'primaryName', 'primaryProfession'], encoding='utf-8')
timer.stop("\nName.basics.tsv importé.")

# Nettoyage des datasets importés.
timer.start()
title_principals = title_principals.dropna()
title_principals = title_principals[['nconst']][title_principals['category'].str.contains("actor|actress", regex=True)]

title_ratings = title_ratings.dropna()

title_basics = title_basics[title_basics.genres != '\\N'].dropna()
title_basics = title_basics[title_basics.titleType.str.contains('movie|tvMovie', regex=True)]

title_akas = title_akas.dropna()
title_akas = title_akas.replace('\\N', np.nan)

name_basics = name_basics.dropna()
name_basics = name_basics[['primaryName']][name_basics.primaryProfession.str.contains('actor|actress', regex=True)]
timer.stop("\nNettoyage du dataset :")

# Création d'un dataframe comportant les régions originales des films (et export en CSV).
original_titles = title_akas[title_akas.isOriginalTitle==1]['title'].to_frame()
original_titles = original_titles.merge(title_akas[['title', 'region']], how="left", right_on=['titleId', 'title'], left_on=['titleId', 'title'])
original_titles = original_titles.dropna()
original_titles = original_titles.reset_index().drop_duplicates(subset="titleId", keep = "first").set_index('titleId')
original_titles.to_csv(CURATED_PATH + 'original_titles_regions.csv')

# Splits des datasets (utilisés dans les bonus):
data_loader.split_data_role()
data_loader.split_data_type()
data_loader.split_data_origine()


Chargement de title.principals.tsv :
Elapsed time: 56.8991 seconds
Chargement de title.ratings.tsv :
Elapsed time: 1.7872 seconds
Chargement de title.basics.tsv :
Elapsed time: 63.5592 seconds
Chargement title.akas.tsv :
Elapsed time: 36.2197 seconds
Chargement name.basics.tsv :
Elapsed time: 59.0370 seconds
Nettoyage du dataset :
Elapsed time: 156.6974 seconds


## 1. Sélection de la liste des acteurs par films

In [10]:
actors_movies = title_principals.merge(name_basics, right_index=True, left_on=['nconst'], how='left')
actors_movies = actors_movies.merge(title_basics[['primaryTitle']], right_index=True, left_on=['tconst'], how='left')
actors_movies = actors_movies.dropna()
actors_movies = actors_movies[['primaryTitle', 'primaryName']]
actors_movies = actors_movies.groupby('primaryTitle').agg({'primaryName': ', '.join})

# Export en CSV
actors_movies.to_csv(CURATED_PATH + 'actors_movies.csv')

## 2. La liste des films Américains (en gardant leur nom en français) et leur note moyenne

In [11]:
title_akas_fr = title_akas[title_akas['region'].str.contains('FR')].drop(columns=['region'])

american_movies = title_akas_fr.merge(title_basics[['originalTitle']], how='left', left_index=True, right_index=True)
american_movies = american_movies.dropna()

american_movies = american_movies.merge(title_ratings[['averageRating']], how='left', left_index=True, right_index=True)
american_movies = american_movies.dropna()

# Export en CSV :
american_movies.to_csv(CURATED_PATH + 'american_movies.csv', index=False)


## 3. La note moyenne des différents genres

In [14]:
temporary = title_basics[['genres']].merge(title_ratings, how='left', left_index=True, right_index=True)
temporary = temporary.dropna()
temporary = temporary[temporary.genres != '\\N']

mean_genre = pd.DataFrame(temporary.genres.str.split(',').tolist(), index=temporary.index).stack()
mean_genre = mean_genre.reset_index(['tconst', 0])

temporary = temporary.drop(columns=['genres'])

mean_genre = mean_genre.set_index('tconst')
mean_genre.columns = ['genre']

mean_genre = mean_genre.merge(temporary, how='left', left_index=True, right_index=True)
mean_genre = mean_genre[mean_genre.genre != '\\N']

mean_genre = mean_genre.groupby('genre').mean()

mean_genre = mean_genre.sort_values(by=['averageRating'], ascending=False)
mean_genre['averageRating'] = mean_genre['averageRating'].map('{:,.2f}'.format)

# Export en CSV :
mean_genre.to_csv(CURATED_PATH + 'mean_genre.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'CSV/mean_genre.csv'

## 4. La note moyenne de chaque acteur par rapport aux films dans lesquels il apparaît

In [13]:
actors_ranking = title_principals[['nconst']].merge(title_basics[['titleType']], left_index=True, right_index=True, how='left').drop(columns='titleType').dropna()
actors_ranking = actors_ranking.merge(name_basics, left_on='nconst', right_index=True, how='left').dropna()
actors_ranking = actors_ranking.merge(title_ratings, left_index=True, right_on=['tconst'], how='left')[['averageRating', 'primaryName']].dropna()
actors_ranking = actors_ranking.groupby('primaryName').agg({'averageRating': 'mean'})
actors_ranking['averageRating'] = actors_ranking['averageRating'].map('{:,.2f}'.format)

# Export en CSV :
actors_ranking.to_csv(CURATED_PATH + 'rating_actors.csv')

# Partie Bonus

## 3. Prediction des notes moyennes de films basées sur diverses paramètres

In [None]:
# Création d'un dataset comportant le scaractéristiques de tous les films avec leur note moyenne.

actors = pd.concat([pd.read_csv(CURATED_PATH + WORKS + 'actor.csv', delimiter=',', usecols=['tconst', 'nconst'], index_col = ['tconst']), pd.read_csv(CURATED_PATH + WORKS + 'actress.csv', delimiter=',', usecols=['tconst', 'nconst'], index_col = ['tconst'])])
actors = actors.rename(columns={"nconst": "actors"})

movies = pd.read_csv(CURATED_PATH + FORMATS + 'movie.csv', delimiter=',', usecols=['tconst', 'isAdult', 'startYear', 'runtimeMinutes', 'genres'], index_col = ['tconst'])
movies = movies[movies.runtimeMinutes != '\\N'][movies.genres != '\\N']

dataset = actors.merge(movies, how='right', left_index=True, right_index=True)
dataset = dataset.dropna()
dataset = dataset.assign(genres=dataset['genres'].str.split(',')).explode('genres')

producers = pd.read_csv(CURATED_PATH + WORKS + 'producer.csv', delimiter=',', usecols=['tconst', 'nconst'], index_col = ['tconst'])
producers = producers.rename(columns={"nconst": "producer"})

writers = pd.read_csv(CURATED_PATH + WORKS + 'writer.csv', delimiter=',', usecols=['tconst', 'nconst'], index_col = ['tconst'])
writers = writers.rename(columns={"nconst": "writer"})

composers = pd.read_csv(CURATED_PATH + WORKS + 'composer.csv', delimiter=',', usecols=['tconst', 'nconst'], index_col = ['tconst'])
composers = composers.rename(columns={"nconst": "composer"})

producers = producers.merge(writers, how="inner", left_index=True, right_index=True)
producers = producers.merge(composers, how="inner", left_index=True, right_index=True)
dataset = dataset.merge(producers, how="left", left_index=True, right_index=True)
dataset = dataset.dropna()

dataset = dataset.merge(original_titles[['region']], how="left", right_index=True, left_index=True)

rated_movies = title_ratings[title_ratings.numVotes  > nb_votes]
rated_movies = rated_movies[['averageRating']]
dataset = dataset.merge(rated_movies, how='left', left_index=True, right_index=True)
dataset = dataset.dropna()

# Export en CSV :
dataset.to_csv(CURATED_PATH + 'movies_infos.csv')