In [83]:
import pandas as pd
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import CountVectorizer
from module import NlpSpacy
# Single shared instance of the project's NlpSpacy helper; its `token` method
# is applied to every plot when the text features are built further down.
nlp = NlpSpacy()

In [48]:
# Read title.basics.tsv (tab-separated) using `tconst` as the index.
# The literal "\N" marker is parsed as NaN.
basics = pd.read_csv(
    "data/title.basics.tsv",
    sep="\t",
    index_col='tconst',
    na_values=['\\N'],
    low_memory=False
)

In [49]:
# Keep only the rows whose titleType is "movie" and whose isAdult flag is 0.
# .copy() detaches the selection from `basics` so it can be modified freely.
is_regular_movie = (basics["titleType"] == "movie") & (basics["isAdult"] == 0)
movies = basics.loc[is_regular_movie].copy()

In [50]:
# Discard the columns that the rest of this pipeline does not read.
movies = movies.drop(columns=["titleType", "isAdult", "endYear", "primaryTitle"])

In [51]:
# Read title.ratings.tsv (tab-separated) using `tconst` as the index.
# The literal "\N" marker is parsed as NaN.
ratings = pd.read_csv(
    "data/title.ratings.tsv",
    sep="\t",
    index_col='tconst',
    na_values=['\\N'],
    low_memory=False
)

In [52]:
# Left-join the ratings onto the movies by index (tconst): every movie row is
# kept, and titles without a ratings row get NaN in the rating columns.
movies = movies.merge(
    ratings,
    how='left',
    left_index=True,
    right_index=True
)

In [53]:
# Popularity cut-off: keep titles with strictly more than 800 votes.
# Rows whose numVotes is NaN fail the comparison and are dropped as well.
movies = movies.loc[movies["numVotes"] > 800]

In [54]:
# OMDb API export (plot, country and poster), indexed by tconst.
request_api = pd.read_csv(
    "data/request_api.csv",
    index_col='tconst'
)


In [55]:
# Left-join the OMDb columns onto the movies by index (tconst); movies that
# are absent from request_api keep NaN in those columns.
movies = movies.merge(
    request_api,
    how='left',
    left_index=True,
    right_index=True
)

In [56]:
# Replace missing countries with the "unknow" placeholder so the string
# filters on `country` never see NaN.
# The result is assigned back rather than using fillna(inplace=True) on the
# attribute-accessed column: that form is chained assignment, which emits a
# FutureWarning in pandas 2.x and leaves `movies` unchanged once
# Copy-on-Write is enabled.
movies["country"] = movies["country"].fillna("unknow")

In [57]:
# Popularity thresholds that depend on the country of origin. Each filter is a
# boolean mask aligned on `movies`: the title must list the country and have
# more votes than that country's threshold.
filter_usa = (movies.country.str.contains('United States')) & (movies.numVotes > 90000)
filter_uk = (movies.country.str.contains('United Kingdom')) & (movies.numVotes > 90000)
filter_fr = (movies.country.str.contains('France')) & (movies.numVotes > 3000)
# Fixed: this mask used to match 'United States', which silently lowered the
# US threshold to 17000 and left Indian titles matched by no filter at all
# (filter_other explicitly excludes India).
filter_india = (movies.country.str.contains('India')) & (movies.numVotes > 17000)

# Any title listing none of the four countries above uses a generic threshold.
filter_other = (~movies.country.str.contains('United States|United Kingdom|France|India')) & (movies.numVotes > 9000)

In [58]:
# A title is kept when it satisfies at least one of the per-country rules.
passes_any_rule = filter_usa | filter_uk | filter_fr | filter_india | filter_other
movies = movies.loc[passes_any_rule]

In [59]:
# Cast the three count/year columns to integers in one pass.
# Fixed: numVotes used to be overwritten with runtimeMinutes (copy-paste
# error), which destroyed the vote counts used downstream.
# astype returns a new frame, so nothing is assigned onto a filtered slice
# (no SettingWithCopyWarning). As before, this raises if any of these columns
# still contains NaN.
movies = movies.astype(
    {
        "startYear": int,
        "runtimeMinutes": int,
        "numVotes": int,
    }
)

In [60]:
# Read only the title id, person id and credit category from
# title.principals.tsv (tab-separated); "\N" is parsed as NaN.
principals = pd.read_csv(
    "data/title.principals.tsv",
    sep="\t",
    usecols=["tconst","nconst","category"],
    na_values=['\\N'],
    low_memory=False
)

In [61]:
# Keep only the credits whose title is still present in `movies`.
principals = principals.loc[principals["tconst"].isin(movies.index)]

In [62]:
# Read only the person id and display name from name.basics.tsv
# (tab-separated); "\N" is parsed as NaN.
name_basics = pd.read_csv(
    "data/name.basics.tsv",
    sep="\t",
    usecols=["nconst","primaryName"],
    na_values=['\\N'],
    low_memory=False
)

In [63]:
# Attach each person's name to their credit rows (left join on nconst), then
# index the result by title id.
castings = (
    principals
    .merge(name_basics, on='nconst', how='left')
    .set_index('tconst')
)

In [64]:
# Add an `n_movies` column holding, for every credit row, how many credit rows
# share the same primaryName. The helper `counter` column only exists so that
# groupby(...).count() has something to count.
castings["counter"] = 0
rows_per_name = castings.groupby("primaryName").counter.count()
castings = (
    castings
    .reset_index()
    # The left frame's `counter` becomes counter_x; the per-name count becomes counter_y.
    .merge(rows_per_name, on='primaryName')
    .drop(columns='counter_x')
    .rename(columns={'counter_y':'n_movies'})
    .set_index("tconst")
)

In [65]:
# Keep only the people whose name appears on more than two credit rows.
castings = castings.loc[castings["n_movies"] > 2]

In [66]:
# One row per title and one column per credit category, each cell holding the
# list of names for that category. The join/split round-trip produces that
# list (a name containing a comma would be split into several entries).
names_by_category = castings.pivot_table(
    values='primaryName',
    index='tconst',
    columns='category',
    aggfunc=lambda x : ",".join(x).split(',')
)

# Left-join those per-category name lists onto the movies by index (tconst).
infos_movies = movies.merge(
    names_by_category,
    how='left',
    left_index=True,
    right_index=True
)

In [67]:
# Persist the human-readable movie table (movie columns plus per-category
# name lists) as a pickle file for later use.
infos_movies.to_pickle("data/infos_movies.p")

In [68]:
# Build the one-hot feature blocks, each aggregated to one row per title
# (tconst), and place them side by side:
#   * Genre_*   - one column per genre found in the comma-separated `genres`
#   * Country_* - one column per country found in the comma-separated `country`
#   * Casting_* - one boolean column per remaining person in `castings`
dummies = pd.concat(
    [
        pd.get_dummies(movies['genres'].str.split(',').explode(), prefix='Genre').groupby("tconst").sum(),
        # Strip whitespace around each country: a ", "-separated list would
        # otherwise create distinct columns such as "Country_France" and
        # "Country_ France" for the same country. Stripping is a no-op when
        # the values carry no surrounding spaces.
        pd.get_dummies(
            movies['country'].str.split(',').explode().str.strip(),
            prefix='Country'
        ).groupby("tconst").sum(),
        # `columns=` was dropped here: get_dummies ignores it for a Series.
        pd.get_dummies(castings['primaryName'], prefix='Casting').groupby('tconst').sum().astype(bool)
    ],
    axis=1
)

In [250]:
# Start the model matrix from the four numeric movie columns and left-join the
# one-hot columns onto them by index (tconst).
data_encoded = movies[['startYear','runtimeMinutes','averageRating','numVotes']].merge(
    dummies,
    how='left',
    left_index=True,
    right_index=True
)

In [251]:
# Fit a RobustScaler on the four numeric columns and overwrite them with
# their scaled values; the fitted scaler stays available as `scaler`.
scaled_columns = ['startYear','runtimeMinutes','averageRating','numVotes']
scaler = RobustScaler()
data_encoded[scaled_columns] = scaler.fit_transform(data_encoded[scaled_columns])

In [252]:
# Replace every remaining NaN in data_encoded with False, in place.
# NOTE(review): the NaNs presumably come from the left join with `dummies`
# (titles with no row in one of the one-hot blocks) - confirm. The fill is
# applied to all columns, including the scaled numeric ones.
data_encoded.fillna(False, inplace=True)

In [253]:
# Tokenise each plot with the project's NlpSpacy helper and store the tokens
# as one comma-separated string per title.
# NOTE(review): a missing plot (NaN) is passed to nlp.token unchanged -
# confirm the helper copes with that.
token_plot = infos_movies['plot'].apply(
    lambda plot_text: ",".join(nlp.token(plot_text))
)

In [254]:
# Bag-of-n-grams vectoriser for the tokenised plots:
#   * unigrams up to trigrams
#   * min_df=2    - ignore terms found in fewer than 2 documents
#   * max_df=0.01 - ignore terms found in more than 1% of the documents
cv = CountVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.01
)

In [255]:
# Learn the n-gram vocabulary from the tokenised plots and build the
# document-term count matrix (one row per entry of token_plot).
sparse = cv.fit_transform(token_plot)

In [257]:
# Wrap the document-term matrix in a sparse DataFrame that shares the index of
# token_plot and whose columns are the vocabulary terms prefixed with "Nlp_".
plot_features = pd.DataFrame.sparse.from_spmatrix(
    sparse,
    index=token_plot.index,
    columns="Nlp_"+cv.get_feature_names_out()
)

# Left-join the text features onto the encoded matrix by index (tconst).
data_encoded = data_encoded.merge(
    plot_features,
    how='left',
    left_index=True,
    right_index=True
)

In [259]:
# Persist the fully encoded feature matrix (scaled numerics, one-hot blocks
# and "Nlp_" text features) as a pickle file.
data_encoded.to_pickle("data/data_encoded.p")