In [28]:
import pandas as pd

In [29]:
# Load the Netflix daily top-10 CSV; the path is relative to the notebook's working directory.
df_netflix = pd.read_csv('./datasets/netflix_daily_top_10.csv')

In [30]:
# Preview the first five rows to see the columns and raw value formats.
df_netflix.head(5)

Unnamed: 0,As of,Rank,Year to Date Rank,Last Week Rank,Title,Type,Netflix Exclusive,Netflix Release Date,Days In Top 10,Viewership Score
0,2020-04-01,1,1,1,"Tiger King: Murder, Mayhem …",TV Show,Yes,"Mar 20, 2020",9,90
1,2020-04-01,2,2,-,Ozark,TV Show,Yes,"Jul 21, 2017",5,45
2,2020-04-01,3,3,2,All American,TV Show,,"Mar 28, 2019",9,76
3,2020-04-01,4,4,-,Blood Father,Movie,,"Mar 26, 2020",5,30
4,2020-04-01,5,5,4,The Platform,Movie,Yes,"Mar 20, 2020",9,55


## Dessa base, analise e extraia:

- Tipos de dados disponíveis [ x ]
- Período da análise feita   [ x ]
- Tamanho da base de dados   [ x ]
- Verificar dados nulos      [ x ]
- Outliers                   [ x ]

### Converter dados

In [31]:
# Rank columns use '-' where there is no rank; errors='coerce' turns every
# entry that cannot be parsed as a number into NaN.
for rank_col in ('Year to Date Rank', 'Last Week Rank'):
    df_netflix[rank_col] = pd.to_numeric(df_netflix[rank_col], errors='coerce')

# Parse the date columns into datetime values.
for date_col in ('As of', 'Netflix Release Date'):
    df_netflix[date_col] = pd.to_datetime(df_netflix[date_col])

### Tipos de dados disponíveis

In [52]:
# Show each column's dtype and non-null count.
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7100 entries, 0 to 7099
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   As of                 7100 non-null   datetime64[ns]
 1   Rank                  7100 non-null   int64         
 2   Year to Date Rank     7100 non-null   float64       
 3   Last Week Rank        7100 non-null   float64       
 4   Title                 7100 non-null   object        
 5   Type                  7100 non-null   object        
 6   Netflix Exclusive     7100 non-null   object        
 7   Netflix Release Date  7100 non-null   datetime64[ns]
 8   Days In Top 10        7100 non-null   int64         
 9   Viewership Score      7100 non-null   int64         
dtypes: datetime64[ns](2), float64(2), int64(3), object(3)
memory usage: 554.8+ KB


### Tamanho da base de dados

In [32]:
# Number of rows in the dataset.
len(df_netflix)

7100

### Verificar dados nulos

In [33]:
# Count missing values per column.
df_netflix.isna().sum()

As of                      0
Rank                       0
Year to Date Rank        859
Last Week Rank          3968
Title                      0
Type                       0
Netflix Exclusive       2501
Netflix Release Date       0
Days In Top 10             0
Viewership Score           0
dtype: int64

In [34]:
# Inspect the distribution of 'Year to Date Rank' values (value_counts skips NaN by default).
df_netflix['Year to Date Rank'].value_counts()

Year to Date Rank
1.0     708
2.0     708
3.0     706
4.0     705
5.0     699
6.0     675
7.0     633
8.0     554
9.0     481
10.0    372
Name: count, dtype: int64

In [63]:
# Replace every missing 'Year to Date Rank' with 0.
# NOTE(review): 0 is not a real rank, so it acts as a "not ranked" marker and
# will take part in any later numeric statistic on this column — confirm intended.
df_netflix['Year to Date Rank'] = df_netflix['Year to Date Rank'].fillna(0)

In [40]:
# Inspect the distribution of 'Last Week Rank' values (value_counts skips NaN by default).
df_netflix['Last Week Rank'].value_counts()

Last Week Rank
1.0     653
2.0     564
3.0     452
4.0     336
5.0     282
6.0     240
7.0     193
8.0     167
9.0     133
10.0    112
Name: count, dtype: int64

In [64]:
# Replace every missing 'Last Week Rank' with 0.
# NOTE(review): 0 is not a real rank, so it acts as a "not ranked" marker and
# will take part in any later numeric statistic on this column — confirm intended.
df_netflix['Last Week Rank'] = df_netflix['Last Week Rank'].fillna(0)


In [49]:
# Inspect the distinct values of 'Netflix Exclusive' and how often each occurs.
df_netflix['Netflix Exclusive'].value_counts()

Netflix Exclusive
Yes        4599
Unknown    2501
Name: count, dtype: int64

In [65]:
# Replace missing 'Netflix Exclusive' values with the label 'Unknown'.
# The fill value must be a string: filling with the integer 0 would mix ints
# into a text column and would not produce the 'Unknown' category.
df_netflix['Netflix Exclusive'] = df_netflix['Netflix Exclusive'].fillna('Unknown')


In [47]:
# Re-check the missing-value counts per column after the fills above.
df_netflix.isna().sum()

As of                   0
Rank                    0
Year to Date Rank       0
Last Week Rank          0
Title                   0
Type                    0
Netflix Exclusive       0
Netflix Release Date    0
Days In Top 10          0
Viewership Score        0
dtype: int64

In [51]:
# Show dtypes and non-null counts again after the missing values were filled.
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7100 entries, 0 to 7099
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   As of                 7100 non-null   datetime64[ns]
 1   Rank                  7100 non-null   int64         
 2   Year to Date Rank     7100 non-null   float64       
 3   Last Week Rank        7100 non-null   float64       
 4   Title                 7100 non-null   object        
 5   Type                  7100 non-null   object        
 6   Netflix Exclusive     7100 non-null   object        
 7   Netflix Release Date  7100 non-null   datetime64[ns]
 8   Days In Top 10        7100 non-null   int64         
 9   Viewership Score      7100 non-null   int64         
dtypes: datetime64[ns](2), float64(2), int64(3), object(3)
memory usage: 554.8+ KB


### Período da análise feita

In [53]:
# Earliest and latest 'As of' dates give the period covered by the data.
as_of_dates = df_netflix['As of']
start_date, end_date = as_of_dates.min(), as_of_dates.max()

print(f"O período da análise vai de {start_date} até {end_date}.")


O período da análise vai de 2020-04-01 00:00:00 até 2022-03-11 00:00:00.


### Outliers

In [56]:
# Função para detectar outliers usando o IQR
def detect_outliers_iqr(data, column, factor=1.5):
    """Return the rows of ``data`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``data[column]`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect; it is not modified.
    column : label
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR to build the fences. The default keeps
        the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows strictly below the lower fence or strictly above the upper
        fence, with their original index. Rows where ``column`` is missing are
        never returned, because comparisons against NaN are False.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    KeyError
        If ``column`` is not present in ``data``.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    # Index the column once instead of on every use.
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    is_outlier = (values < lower_bound) | (values > upper_bound)
    return data[is_outlier]

In [57]:
# Rows whose 'Rank' falls outside the IQR fences.
outliers_rank = detect_outliers_iqr(df_netflix, 'Rank')
# End the cell with the frame so the notebook renders it as a table
# (print() gives wrapped plain text).
outliers_rank

Empty DataFrame
Columns: [As of, Rank, Year to Date Rank, Last Week Rank, Title, Type, Netflix Exclusive, Netflix Release Date, Days In Top 10, Viewership Score]
Index: []


In [58]:
# Rows whose 'Year to Date Rank' falls outside the IQR fences.
outliers_year_to_date_rank = detect_outliers_iqr(df_netflix, 'Year to Date Rank')
# End the cell with the frame so the notebook renders it as a table
# (print() gives wrapped plain text).
outliers_year_to_date_rank

Empty DataFrame
Columns: [As of, Rank, Year to Date Rank, Last Week Rank, Title, Type, Netflix Exclusive, Netflix Release Date, Days In Top 10, Viewership Score]
Index: []


In [59]:
# Rows whose 'Days In Top 10' falls outside the IQR fences.
outliers_days_in_top_10 = detect_outliers_iqr(df_netflix, 'Days In Top 10')
# End the cell with the frame so the notebook renders it as a table
# (print() splits the columns across wrapped text blocks).
outliers_days_in_top_10

          As of  Rank  Year to Date Rank  Last Week Rank          Title  \
438  2020-05-14     9                9.0             0.0          Ozark   
447  2020-05-15     8                9.0            10.0          Ozark   
458  2020-05-16     9                8.0             0.0          Ozark   
489  2020-05-19    10                0.0             7.0          Ozark   
629  2020-06-02    10                0.0             0.0    Outer Banks   
...         ...   ...                ...             ...            ...   
7053 2022-03-07     4                3.0             4.0  Love is Blind   
7063 2022-03-08     4                4.0             4.0  Love is Blind   
7074 2022-03-09     5                4.0             5.0  Love is Blind   
7085 2022-03-10     6                5.0             5.0  Love is Blind   
7094 2022-03-11     5                6.0             6.0  Love is Blind   

         Type Netflix Exclusive Netflix Release Date  Days In Top 10  \
438   TV Show              

In [60]:
# Rows whose 'Viewership Score' falls outside the IQR fences.
outliers_viewership_score = detect_outliers_iqr(df_netflix, 'Viewership Score')
# End the cell with the frame so the notebook renders it as a table
# (print() splits the columns across wrapped text blocks).
outliers_viewership_score

          As of  Rank  Year to Date Rank  Last Week Rank  \
236  2020-04-24     7                7.0             2.0   
247  2020-04-25     8                7.0             2.0   
255  2020-04-26     6                8.0             3.0   
269  2020-04-27    10                6.0             4.0   
2382 2020-11-25     3                2.0             3.0   
...         ...   ...                ...             ...   
7008 2022-03-02     9                6.0             3.0   
7063 2022-03-08     4                4.0             4.0   
7074 2022-03-09     5                4.0             5.0   
7085 2022-03-10     6                5.0             5.0   
7094 2022-03-11     5                6.0             6.0   

                             Title     Type Netflix Exclusive  \
236   Tiger King: Murder, Mayhem …  TV Show               Yes   
247   Tiger King: Murder, Mayhem …  TV Show               Yes   
255   Tiger King: Murder, Mayhem …  TV Show               Yes   
269   Tiger King: M

#### Usando o IQR, foram encontrados outliers em `Days In Top 10` e `Viewership Score`; `Rank` e `Year to Date Rank` não apresentaram outliers.