In [1]:
import warnings
warnings.simplefilter(action='ignore',category=FutureWarning)

In [2]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt 
import seaborn as sns
from scipy import stats

%matplotlib inline

In [None]:
# Load the raw catalogue CSV from the working directory, using the
# `show_id` column as the row index.
df = pd.read_csv(
    './netflix.csv',
    sep=',',
    encoding='utf-8',
    index_col='show_id',
)

# Print the full per-column schema (dtype, non-null count) plus memory usage.
df.info(verbose=True, memory_usage=True)

In [None]:
# Rows whose `duration` is missing. In these rows the duration text is
# stored in the `rating` column instead, so the value is moved back below.
na_duration_index = df.loc[df['duration'].isna()].index
print("Duration Column NaN value indexes",na_duration_index)

# Before: rating holds the duration value and duration is NaN.
print(df.loc[na_duration_index].to_string())

# Move the misplaced value into `duration`, then blank out `rating`.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df.loc[na_duration_index, "duration"] = df.loc[na_duration_index, "rating"]
df.loc[na_duration_index, "rating"] = np.nan

# After: duration is populated and rating is NaN for the repaired rows.
print(df.loc[na_duration_index].to_string())

In [None]:
def tweak_netflix(df_:pd.DataFrame)->pd.DataFrame:
    """Return a typed, de-duplicated copy of the raw Netflix frame.

    The input frame is not modified. The returned frame has:

    * ``release_year`` cast to ``int64``
    * ``rating`` cast to an ordered categorical (see ``maturity`` below)
    * ``type`` cast to an unordered categorical
    * ``date_added`` parsed to datetime; unparseable values become ``NaT``
    * ``duration`` replaced by its leading number (text before the first space)
    * ``category`` (new column) holding the unit that follows that number
    * ``director`` cast to ``str``
    * fully duplicated rows removed

    Parameters
    ----------
    df_ : pd.DataFrame
        Frame with at least the columns ``release_year``, ``rating``,
        ``type``, ``date_added``, ``duration`` and ``director``.

    Returns
    -------
    pd.DataFrame
        A new frame with the conversions above applied.
    """
    # Ordered scale of maturity ratings. The order is kept exactly as
    # originally written.
    # NOTE(review): 'NC-17' is placed before 'R' although its description is
    # the stricter one - confirm this ordering is intended.
    maturity = pd.CategoricalDtype([
            'TV-Y',             # suitable for all children
            'TV-Y7',            # for children aged 7 or older
            'TV-Y7-FV',         # for children aged 7 or older, with fantasy violence
            'TV-G',             # general audience
            'G',                # general audience (movie)
            'TV-PG',            # not suitable for younger children; parental guidance
            'PG',               # some material may not suit children; parental guidance (movie)
            'PG-13',            # parents strongly cautioned
            'TV-14',            # parents strongly cautioned; unsuitable under 14
            'TV-MA',            # mature audience
            'NC-17',            # no one 17 and under admitted
            'R',                # restricted
            'NR',               # not rated
            'UR'                # unrated
        ],ordered=True)

    # Split "<number> <unit>" once: column 0 is the number, column 1 the unit.
    duration_parts = df_['duration'].str.split(' ',expand=True)

    return(
        df_.assign(
            release_year = df_['release_year'].astype('int64'),
            rating       = df_['rating'].astype(maturity),
            type = df_['type'].astype('category'),
            date_added = pd.to_datetime(df_['date_added'],errors='coerce'),
            category = duration_parts[1].astype('category'),
            # Numeric part of the duration. Previously this repeated the unit
            # (part [1]), which made `duration` a copy of `category`.
            # Missing or non-numeric values become NaN instead of raising.
            duration = pd.to_numeric(duration_parts[0],errors='coerce'),
            # NOTE(review): astype('str') stringifies missing directors
            # rather than leaving them as NaN - confirm this is wanted.
            director = df_['director'].astype('str')
        ).drop_duplicates()
    )

In [None]:
# Run the dtype clean-up on the loaded frame, then print the resulting
# column dtypes and memory usage.
dd = df.pipe(tweak_netflix)
dd.info(memory_usage=True)