In [1]:
import warnings
warnings.simplefilter(action='ignore',category=FutureWarning)

In [2]:
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt 
import seaborn as sns
from scipy import stats

%matplotlib inline

In [3]:
df = pd.read_csv(filepath_or_buffer='./netflix.csv',sep=',',encoding='utf-8',index_col='show_id')
df.info(memory_usage=True,verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, s1 to s8807
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8807 non-null   object
 1   title         8807 non-null   object
 2   director      6173 non-null   object
 3   cast          7982 non-null   object
 4   country       7976 non-null   object
 5   date_added    8797 non-null   object
 6   release_year  8807 non-null   int64 
 7   rating        8803 non-null   object
 8   duration      8804 non-null   object
 9   listed_in     8807 non-null   object
 10  description   8807 non-null   object
dtypes: int64(1), object(10)
memory usage: 825.7+ KB


In [4]:
na_duration_index = df.loc[df['duration'].isna()].index
print("Duration Column NaN value indexes",na_duration_index)

# ratning and duration value swaped
print(df.loc[na_duration_index].to_string())

df.loc[ na_duration_index , "duration" ] = df.loc[ na_duration_index , "rating" ]
df.loc[ na_duration_index , "rating" ]   = np.NaN  

# make it proper
print(df.loc[na_duration_index].to_string())

Duration Column NaN value indexes Index(['s5542', 's5795', 's5814'], dtype='object', name='show_id')
          type                                 title    director        cast        country          date_added  release_year  rating duration listed_in                                                                                                                                           description
show_id                                                                                                                                                                                                                                                                                                       
s5542    Movie                       Louis C.K. 2017  Louis C.K.  Louis C.K.  United States       April 4, 2017          2017  74 min      NaN    Movies            Louis C.K. muses on religion, eternal love, giving dogs drugs, email fights, teachers and more in a live performance from Washing

In [5]:
def tweak_netflix(df_:pd.DataFrame)->pd.DataFrame:
    maturity = pd.CategoricalDtype([
            'TV-Y',             # suitable for all child
            'TV-Y7',            # for child age 7 or older
            'TV-Y7-FV',         # for child age or older with Fancy Violence
            'TV-G',             # General Audience
            'G',                # Geneal Audience   # MOVIE
            'TV-PG',            # not suitable for younger child. 'parents guided" 
            'PG',               # some may not suitable for child. "parents guided"        #MOVIE
            'PG-13',            # parents storngly cautioned
            'TV-14',            # parents strongly cautioned. inappropriate for 14 yr child
            'TV-MA',            # Matured Audience
            'NC-17',            # no one 17 and under admitted 
            'R',                # restricted
            'NR',               # not rated
            'UR'                # Unrated
        ],ordered=True)
    return(
        df_.assign(
            release_year = df_['release_year'].astype('int64'),
            rating       = df_['rating'].astype(maturity),
            type = df_['type'].astype('category'),
            date_added = pd.to_datetime(df_['date_added'],errors='coerce'),
            category = df_['duration'].str.split(' ',expand=True)[1].astype('category'),
            duration = df_['duration'].str.split(' ',expand=True)[1].astype('category'),
            director = df_['director'].astype('str')
        ).drop_duplicates()
    )

In [6]:
dd = tweak_netflix(df)
dd.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Index: 8807 entries, s1 to s8807
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   type          8807 non-null   category      
 1   title         8807 non-null   object        
 2   director      8807 non-null   object        
 3   cast          7982 non-null   object        
 4   country       7976 non-null   object        
 5   date_added    8709 non-null   datetime64[ns]
 6   release_year  8807 non-null   int64         
 7   rating        8800 non-null   category      
 8   duration      8807 non-null   category      
 9   listed_in     8807 non-null   object        
 10  description   8807 non-null   object        
 11  category      8807 non-null   category      
dtypes: category(4), datetime64[ns](1), int64(1), object(6)
memory usage: 912.7+ KB
