### Netflix movies and TV shows

##### Libraries and notebook customization

In [None]:
import re

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm

# Notebook-wide pandas display limits.
pd.options.display.max_rows = 25       # print at most 25 rows
pd.options.display.max_columns = 25    # print at most 25 columns
pd.options.display.max_colwidth = 50   # truncate cell text beyond 50 characters
pd.options.display.width = 150         # wrap printed output at 150 characters

##### Dataset paths

In [None]:
# Relative path of the CSV file that is loaded into `df_netflix`.
data_file_path = 'DataSets/netflix_titles.csv'

##### Functions

In [None]:
def detect_implicit_duplicates(df, include=None, exclude=None):
    """Print single-word values that occur inside other values of the same column.

    For every selected object-dtype column, the non-empty string values that
    contain none of the separators (space, '-', '_', apostrophe) are collected.
    Each distinct value is then searched, case-insensitively and as a
    substring, inside the other distinct single-word values of that column,
    and every non-empty list of hits is printed. Nothing is returned.

    Args:
        df: DataFrame to inspect.
        include: Column names to inspect; ``None`` means every column of ``df``.
        exclude: Column names to skip; ``None`` means skip nothing.
    """
    if exclude is None:
        exclude = []

    candidate_columns = df.columns if include is None else include
    available_columns = [col for col in candidate_columns if col not in exclude]

    # Compiled once: any of these characters marks a multi-word value.
    word_separator = re.compile(r"[ \-\_']")

    for column in available_columns:

        if df[column].dtype != 'object':
            continue

        column_values = df[column]

        # 1. Keep non-empty strings only. Object columns may also hold
        #    NaN/None or other non-text cells, and running a regex on those
        #    raises TypeError.
        is_text = column_values.map(lambda value: isinstance(value, str) and value != '')
        column_values = column_values[is_text.astype(bool)]

        # 2. Get base unique values (single-word values only). Distinct values
        #    are enough: repeated cells would only repeat the same hit in the
        #    report and multiply the number of comparisons.
        base_unique_values = [
            value
            for value in column_values.unique().tolist()
            if word_separator.search(value) is None
        ]
        if not base_unique_values:
            continue

        print(f"\nColumn: '{column}'")

        # 3. Compare every base value against the other single-word values.
        # NOTE(review): multi-word values are never searched because they were
        # filtered out before this step - confirm that this is intended.
        for base in tqdm(base_unique_values, desc=f"Searching implicit values for: '{column}'"):
            pattern = re.compile(re.escape(base), re.IGNORECASE)

            matches = [val for val in base_unique_values if val != base and pattern.search(val)]

            if matches:
                print(f"  '{base}' → {matches}")

##### DataFrame features

In [None]:
# Load the CSV. keep_default_na=False stops pandas from turning empty cells
# (and strings such as 'NA') into NaN, so they are kept as plain text.
df_netflix = pd.read_csv(data_file_path, sep=',', header='infer', keep_default_na=False)

# Column/dtype overview, a blank separator line, then five random rows.
df_netflix.info()
print()
print("DataFrame sample: \n", df_netflix.sample(5))

##### Explicit duplicates

In [None]:
# Number of rows that repeat an earlier row in every column.
print("Explicit duplicated values: ", df_netflix.duplicated().sum())


##### String format normalization

In [None]:
for column in df_netflix.columns:

    # 'release_year' is left untouched by this step.
    if column == 'release_year':
        continue

    # Every other column is lower-cased and trimmed.
    normalized = df_netflix[column].str.lower().str.strip()

    if column != 'date_added':
        # Cells that hold a ", " separated list keep their inner spaces;
        # all other cells get their spaces replaced with underscores.
        holds_list = normalized.astype(str).str.contains(", ", na=False)
        normalized = normalized.where(holds_list, normalized.str.replace(" ", "_"))

    df_netflix[column] = normalized


In [None]:
# Fixed random_state: the same 25 rows are shown on every run.
print(df_netflix.sample(25, random_state=333))

##### Date-time format normalization

In [None]:
# Parse text shaped like "<full month name> <day>, <4-digit year>";
# errors='coerce' turns every value that does not fit the format into NaT.
df_netflix['date_added'] = pd.to_datetime(df_netflix['date_added'], format='%B %d, %Y', errors='coerce')


In [None]:
# Fixed random_state: the same 25 'date_added' values are shown on every run.
print(df_netflix['date_added'].sample(25, random_state=333))

##### Cells with list values processing

In [None]:
# A column qualifies when at least one of its cells, read as text, contains ', '.
columns_to_explode = [
    column
    for column in df_netflix.columns
    if df_netflix[column].astype(str).str.contains(', ').any()
]

print(f"Columns to be processed with explode: {columns_to_explode}")


In [None]:
# Hand-picked list columns; this assignment replaces any earlier value of
# `columns_to_explode`.
columns_to_explode = ['director', 'cast', 'country', 'listed_in']

for column in columns_to_explode:
    # Split each cell on ', ' and give every list item its own row. The index
    # is renumbered after each column so the next explode sees unique labels.
    df_netflix[column] = df_netflix[column].astype(str).str.split(', ')
    df_netflix = df_netflix.explode(column, ignore_index=True)

df_netflix.info()
print()
print(df_netflix.sample(25))

##### Missing values processing

In [None]:
# Placeholder strings treated as "no value". Text columns are lower-cased by
# the string normalization step, so the lower-case spellings 'n/a' and 'none'
# are listed next to the original mixed-case ones; without them those
# placeholders could never match.
missing_values = ['', ' ', 'N/A', 'n/a', 'None', 'none', 'null', 'NULL', 'NaN', 'nan', 'NAN']

for column in df_netflix.columns:

    # Only report columns that actually contain a placeholder.
    if not df_netflix[column].isin(missing_values).any():
        continue

    value_counts = df_netflix[column].value_counts()

    # List the placeholder entries first, followed by the real values.
    is_placeholder = value_counts.index.isin(missing_values)
    sorted_counts = pd.concat([value_counts[is_placeholder], value_counts[~is_placeholder]])

    print(f"Values on column '{column}': \n{sorted_counts}\n")


In [None]:
for column in df_netflix.columns:

    # Columns without any placeholder value are left exactly as they are.
    if not df_netflix[column].isin(missing_values).any():
        continue

    # Swap every placeholder string for a real missing-value marker.
    df_netflix[column] = df_netflix[column].replace(missing_values, pd.NA)

In [None]:
for column in df_netflix.columns:

    # Only report columns that contain at least one missing value.
    if not df_netflix[column].isna().any():
        continue

    # Both output columns are named explicitly. The names produced by a bare
    # value_counts().reset_index() differ between pandas versions, and on older
    # versions sorting by `column` would order by the counts, not the values.
    value_counts = (
        df_netflix[column]
        .value_counts(dropna=False)
        .rename_axis(column)
        .reset_index(name='count')
        .sort_values(by=column, na_position='first')
    )

    print(f"Values on column '{column}': \n{value_counts}\n")

##### Which country does most of Netflix's content come from?

In [None]:
# Row count per country, missing values included (dropna=False).
# NOTE(review): 'director', 'cast', 'country' and 'listed_in' were exploded
# earlier, so a title appears once per combination of those values; these are
# row counts, not title counts - confirm this is the intended measure.
print(df_netflix['country'].value_counts(dropna=False))
      