# Working with 'Data Types' and NA Values

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [7]:
titanic = pd.read_csv("titanic.csv")

In [3]:
# To convert fare to float64 type
titanic["fare"].value_counts()

8.05       60
13         59
7.75       55
26         50
7.8958     49
           ..
15.05       1
14          1
15.5792     1
12          1
7.875       1
Name: fare, Length: 282, dtype: int64

In [9]:
titanic["fare"] = titanic["fare"].replace(["?"],np.nan)

## dropna=True

In [6]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pclass     1309 non-null   int64 
 1   survived   1309 non-null   int64 
 2   name       1309 non-null   object
 3   sex        1309 non-null   object
 4   age        1309 non-null   object
 5   sibsp      1309 non-null   int64 
 6   parch      1309 non-null   int64 
 7   ticket     1309 non-null   object
 8   fare       1309 non-null   object
 9   cabin      1309 non-null   object
 10  embarked   1309 non-null   object
 11  boat       1309 non-null   object
 12  body       1309 non-null   object
 13  home.dest  1309 non-null   object
dtypes: int64(4), object(10)
memory usage: 143.3+ KB


In [10]:
titanic["fare"].astype("float")

0       211.3375
1       151.5500
2       151.5500
3       151.5500
4       151.5500
          ...   
1304     14.4542
1305     14.4542
1306      7.2250
1307      7.2250
1308      7.8750
Name: fare, Length: 1309, dtype: float64

In [11]:
len(titanic["fare"])

1309

## Categorical Type

In [16]:
titanic["sex"] = titanic["sex"].astype("category")
titanic.info()
# Now compare the memory usage from above 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   int64   
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1309 non-null   object  
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1308 non-null   object  
 9   cabin      1309 non-null   object  
 10  embarked   1309 non-null   object  
 11  boat       1309 non-null   object  
 12  body       1309 non-null   object  
 13  home.dest  1309 non-null   object  
dtypes: category(1), int64(4), object(9)
memory usage: 134.5+ KB


## Casting pd.to_numeric()

In [17]:
# To convert 'age' column from object to float
titanic["age"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: age
Non-Null Count  Dtype 
--------------  ----- 
1309 non-null   object
dtypes: object(1)
memory usage: 10.4+ KB


In [18]:
pd.to_numeric(titanic["age"])
# This shows an error, but we can coerce value if any error occurs.
# i.e if any error occur, convert any value other than numeric to NaN.

ValueError: Unable to parse string "?" at position 15

In [19]:
pd.to_numeric(titanic["age"],errors="coerce")

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
         ...   
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, Length: 1309, dtype: float64

## isna() and dropna()

### isna()

In [22]:
sales = pd.read_csv("sales.csv")
sales

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,,81220.0
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,,92660.0
5,,,
6,,60007.0,60007.0


In [23]:
sales.isna()

Unnamed: 0,rating,shipping_zip,billing_zip
0,False,True,False
1,False,False,False
2,True,False,False
3,False,False,False
4,False,True,False
5,True,True,True
6,True,False,False


In [24]:
sales["shipping_zip"].isna()

0     True
1    False
2    False
3    False
4     True
5     True
6    False
Name: shipping_zip, dtype: bool

In [25]:
sales[sales["shipping_zip"].isna()]

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,,81220.0
4,4.0,,92660.0
5,,,


### dropna()

In [26]:
sales.dropna()

Unnamed: 0,rating,shipping_zip,billing_zip
1,4.5,94931.0,94931.0
3,4.5,10003.0,10003.0


In [27]:
sales["billing_zip"].dropna()

0    81220.0
1    94931.0
2    92625.0
3    10003.0
4    92660.0
6    60007.0
Name: billing_zip, dtype: float64

## how(any,all)

### how="all"

drops row if all values are null.

### how="any"

In [None]:
drops row if any values are null

In [28]:
sales.dropna(how="all")
# Drops the final row where all values are null

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,,81220.0
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,,92660.0
6,,60007.0,60007.0


In [29]:
sales.dropna(how="any")

Unnamed: 0,rating,shipping_zip,billing_zip
1,4.5,94931.0,94931.0
3,4.5,10003.0,10003.0


## subset=["column"]

In [30]:
sales.dropna(subset=["shipping_zip"])
# Drops every row inside points columns with value NaN

Unnamed: 0,rating,shipping_zip,billing_zip
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
6,,60007.0,60007.0


## fillna()

In [31]:
sales.fillna(0)

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,0.0,81220.0
1,4.5,94931.0,94931.0
2,0.0,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,0.0,92660.0
5,0.0,0.0,0.0
6,0.0,60007.0,60007.0


In [32]:
sales["shipping_zip"].fillna("X")
# This prints a series
# inplace=True to set it permanent.

0          X
1    94931.0
2    92625.0
3    10003.0
4          X
5          X
6    60007.0
Name: shipping_zip, dtype: object

In [36]:
sales["shipping_zip"].fillna("None",inplace=True)

In [38]:
sales["shipping_zip"]

0       None
1    94931.0
2    92625.0
3    10003.0
4       None
5       None
6    60007.0
Name: shipping_zip, dtype: object

In [40]:
sales["shipping_zip"].fillna(sales["billing_zip"],inplace=True)

In [41]:
sales

Unnamed: 0,rating,shipping_zip,billing_zip
0,5.0,,81220.0
1,4.5,94931.0,94931.0
2,,92625.0,92625.0
3,4.5,10003.0,10003.0
4,4.0,,92660.0
5,,,
6,,60007.0,60007.0
