# Projet Intermovie

Ce notebook a pour but d'analyser un dataset de films afin d'en extraire plusieurs informations :

- La liste des acteurs par film.

- La liste des films américains (en gardant leur nom en français) et leur note moyenne.

- Les notes moyennes des différents genres.    

- La note moyenne de chaque acteur par rapport aux films dans lesquels il apparaît.


## Structure des fichiers tsv

- name.basics :         nconst / primaryName / birthYear / deathYear / primaryProfession / knownForTitles
- title.akas :          titleId / ordering / title / region / language / types / attributes / isOriginalTitle
- title.basics :        tconst / titleType / primaryTitle / originalTitle / isAdult / startYear / endYear / runtimeMinutes / genres
- title.principals :    tconst / ordering / nconst / category / job / characters
- title.ratings :       tconst / averageRating / numVotes





In [None]:
# Extension IPython rechargeant automatiquement les modules modifiés avant chaque exécution de code.
%load_ext autoreload
%autoreload 2

# Import des librairies.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from src.credentials import Credentials as cr
import src.split_datas as sd

# 1- Liste des acteurs par film

In [None]:
# Split the dataset referenced by cr.TITLE_BASICS on its 'titleType' column.
# NOTE(review): presumably this writes one CSV per distinct value under
# ./data/CURATED/titleType/ (the next cell reads movie.csv from there) —
# confirm against src.split_datas.
sd.split_datas(cr.TITLE_BASICS, 'titleType')

In [None]:
# Movie identifiers and original titles, keeping only complete, unique rows.
global_title_basics = (
    pd.read_csv(
        "./data/CURATED/titleType/movie.csv",
        usecols=['tconst', 'originalTitle'],
    )
    .dropna(axis=0)
    .drop_duplicates()
)
# Last expression of the cell, so the notebook displays the summary.
global_title_basics.describe()

In [None]:
# Split the dataset referenced by cr.TITLE_PRINCIPALS on its 'category' column.
# NOTE(review): presumably this writes one CSV per distinct value under
# ./data/CURATED/category/ (the next cell reads actor.csv, actress.csv and
# self.csv from there) — confirm against src.split_datas.
sd.split_datas(cr.TITLE_PRINCIPALS, 'category')

In [None]:
# Stack the (title, person) pairs of the three on-screen categories into a
# single frame; the per-category frames are never bound to a name, so there
# is nothing to delete afterwards.
global_title_principals = pd.concat([
    pd.read_csv(category_path, usecols=['tconst', 'nconst'])
    for category_path in (
        "./data/CURATED/category/actor.csv",
        "./data/CURATED/category/actress.csv",
        "./data/CURATED/category/self.csv",
    )
])
# Keep only complete, unique (tconst, nconst) pairs.
global_title_principals = (
    global_title_principals
    .dropna(axis=0)
    .drop_duplicates()
)
# Last expression of the cell, so the notebook displays the summary.
global_title_principals.describe()

In [None]:
# Person identifiers and display names, keeping only complete, unique rows.
global_name_basics = (
    pd.read_csv(
        "./data/RAW/name.basics.tsv",
        usecols=['nconst', 'primaryName'],
        delimiter='\t',
    )
    .dropna(axis=0)
    .drop_duplicates()
)
# Last expression of the cell, so the notebook displays the summary.
global_name_basics.describe()

In [None]:
# Request 1: comma-separated list of cast names for every movie.
#
# Inner joins keep only cast rows that match both a movie and a known person.
# The kept rows are the same as with a left join followed by dropna (the
# joined frames were already stripped of missing values), but the large
# unmatched intermediate is never built.
local_request_1 = global_title_principals.merge(global_title_basics, how='inner', on='tconst')
local_request_1 = local_request_1.merge(global_name_basics, how='inner', on='nconst')
del global_title_basics
del global_title_principals
del global_name_basics
# tconst is kept until after the aggregation: grouping on the title alone
# would merge the casts of distinct films that share the same originalTitle.
local_request_1 = local_request_1.drop(columns=['nconst'])
local_request_1 = local_request_1.dropna(axis=0)
# No extra keyword is passed to agg(): pandas forwards such keywords to the
# aggregation function, and str.join accepts none.
local_request_1_final = (
    local_request_1
    .groupby(['tconst', 'originalTitle'])
    .agg({'primaryName': ','.join})
    # Keep the original output layout: index = originalTitle, one column
    # 'primaryName'. Films sharing a title now appear as separate rows.
    .reset_index(level='tconst', drop=True)
    .sort_index()
)
del local_request_1

In [None]:
# Write request 1 to disk; the frame's index is written as the first CSV
# column (to_csv default).
local_request_1_final.to_csv('./data/REQUESTS/request_1.csv')
# Drop the reference to the exported frame.
del local_request_1_final

# 2- Liste des films américains (en gardant leur nom en français) et leur note moyenne

In [None]:
# Split the dataset referenced by cr.TITLE_AKAS on its 'region' column.
# NOTE(review): presumably this writes one CSV per distinct value under
# ./data/CURATED/region/ (the next cell reads US.csv from there) — confirm
# against src.split_datas.
sd.split_datas(cr.TITLE_AKAS, 'region')

In [None]:
# Title identifiers listed in the US region file, complete and unique, with
# the key renamed to 'tconst' so it lines up with the other tables.
global_title_akas = (
    pd.read_csv("./data/CURATED/region/US.csv", usecols=['titleId'])
    .dropna(axis=0)
    .drop_duplicates()
    .rename(columns={'titleId': 'tconst'})
)
# Last expression of the cell, so the notebook displays the summary.
global_title_akas.describe()

In [None]:
# Movie identifiers and original titles, keeping only complete, unique rows.
global_title_basics = (
    pd.read_csv(
        "./data/CURATED/titleType/movie.csv",
        usecols=['tconst', 'originalTitle'],
    )
    .dropna(axis=0)
    .drop_duplicates()
)
# Last expression of the cell, so the notebook displays the summary.
global_title_basics.describe()

In [None]:
# Title identifiers with their average rating, complete and unique rows only.
global_title_ratings = (
    pd.read_csv(
        "./data/RAW/title.ratings.tsv",
        usecols=['tconst', 'averageRating'],
        delimiter='\t',
    )
    .dropna(axis=0)
    .drop_duplicates()
)
# Last expression of the cell, so the notebook displays the summary.
global_title_ratings.describe()

In [None]:
# Attach the original title, then the average rating, to every US-region
# title identifier.
local_request_2 = (
    global_title_akas
    .merge(global_title_basics, how='left', on='tconst')
    .merge(global_title_ratings, how='left', on='tconst')
)
# The three source frames are not needed once the joins are done.
del global_title_akas, global_title_basics, global_title_ratings
# Drop the join key, then discard rows with a missing title or rating and
# any repeated (originalTitle, averageRating) pair.
local_request_2 = (
    local_request_2
    .drop(columns=['tconst'])
    .dropna(axis=0)
    .drop_duplicates()
)

In [None]:
# Write request 2 to disk; the frame's index is written as the first CSV
# column (to_csv default).
local_request_2.to_csv('./data/REQUESTS/request_2.csv')

In [None]:
# Mean of the 'averageRating' column over every row kept for request 2.
mean_averageRating = local_request_2['averageRating'].mean()
print(mean_averageRating)
# Drop the reference to the frame once the mean has been printed.
del local_request_2

# 3- Les notes moyennes des différents genres

In [None]:
# Request 3, step 1: one row per title with its genres spread over separate
# columns and its average rating attached.
#
# IMDb marks a missing field with the literal string "\N", which pandas does
# not treat as NA by default. Declaring it in na_values keeps "\N" from
# showing up as a genre of its own in the per-genre means.
global_title_basics = pd.read_csv(
    "./data/RAW/title.basics.tsv",
    usecols=['tconst', 'genres'],
    delimiter='\t',
    na_values=['\\N'],
)
# 'genres' is a comma-separated list; expand=True yields one integer-named
# column per position (0, 1, 2, ...), padded with missing values.
global_title_basics_split = global_title_basics['genres'].str.split(",", expand=True)
# Bring tconst back next to the split columns (index-aligned join) and drop
# the now redundant raw 'genres' string.
global_title_basics_split = global_title_basics_split.join(global_title_basics).drop(['genres'], axis=1)
del global_title_basics
global_title_ratings = pd.read_csv(
    "./data/RAW/title.ratings.tsv",
    usecols=['tconst', 'averageRating'],
    delimiter='\t',
)
# Left join: titles without a rating stay in the frame with a missing
# averageRating.
local_request_3 = global_title_basics_split.merge(global_title_ratings, how='left', on='tconst')

In [None]:
# Request 3, step 2: average rating of each genre.
#
# Averaging the three per-position means (genre listed 1st / 2nd / 3rd) gives
# every position the same weight regardless of how many titles it holds, so
# it is not the mean rating of the genre. Instead, every (genre, rating)
# pair is stacked into one long table and averaged once per genre.

# Every column other than the two known ones is a genre position produced by
# the split; detecting them avoids a KeyError when there are fewer than 3.
genre_columns = [
    column for column in local_request_3.columns
    if column not in ('tconst', 'averageRating')
]
local_genre_ratings = pd.concat(
    [
        local_request_3[[column, 'averageRating']].rename(columns={column: 'genre'})
        for column in genre_columns
    ],
    ignore_index=True,
)
del global_title_basics_split
del global_title_ratings
# groupby drops rows whose genre is missing and mean() skips missing ratings.
# The result keeps the original layout: unnamed index holding the genre, and
# a single 'mean' column.
local_request_3 = (
    local_genre_ratings
    .groupby('genre')['averageRating']
    .mean()
    .to_frame('mean')
    .rename_axis(None)
)
del local_genre_ratings
del genre_columns

In [None]:
# Write request 3 to disk; the frame's index is written as the first CSV
# column (to_csv default).
local_request_3.to_csv('./data/REQUESTS/request_3.csv')