In [8]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [9]:
# Read the sample CSV into a DataFrame (the path is relative to the working directory)
sample_data = pd.read_csv('sample_data/sample.csv')

In [10]:
# Display the frame exactly as it was loaded from the CSV
sample_data

Unnamed: 0,A,B,C
0,-0.166919,0.979728,-0.632955
1,-0.297953,,-1.365463
2,-0.120211,-0.540679,
3,,-2.027325,1.533582
4,,,0.461821
5,-0.788073,,
6,-0.91608,-0.612343,
7,-0.887858,1.033826,
8,1.94843,1.025011,-2.982224
9,0.019698,-0.795876,-0.046431


In [11]:
# Summarise column dtypes and non-null counts before any conversion
sample_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 A    10 non-null object
B     9 non-null object
C     9 non-null object
dtypes: object(3)
memory usage: 368.0+ bytes


## Tool box for checking data sanity

### check data type

* Convert the data type if needed
    * A `'NaN'` string stored in an object-dtype column is not treated as a missing value, which causes problems when handling missing data
    * Alternatively, set the types at load time with the `dtype` parameter of `read_csv()`

In [12]:
def change_dtypes(col_obj, df, dtype='float'):
    """Cast one column of ``df`` to ``dtype``, modifying ``df`` in place.

    Parameters
    ----------
    col_obj : column label
        Name of the column to convert.
    df : pandas.DataFrame
        Frame whose column is replaced by the converted version.
    dtype : str or type, default 'float'
        Target dtype handed to ``astype``. The default preserves the
        original behaviour of converting an object column to float.

    Returns
    -------
    None
        The conversion is written back into ``df``.

    Notes
    -----
    ``astype`` raises (typically ``ValueError``) when a value in the column
    cannot be converted to the requested dtype.
    """
    df[col_obj] = df[col_obj].astype(dtype)

In [13]:
# Cast every column of sample_data to float; the frame is modified in place
for column_name in sample_data.columns:
    change_dtypes(column_name, sample_data)

In [14]:
# Re-check dtypes and non-null counts after the float conversion
sample_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 A    8 non-null float64
B     7 non-null float64
C     6 non-null float64
dtypes: float64(3)
memory usage: 368.0 bytes


### check missing data

In [15]:
def check_missing_data(df):
    """Count the missing values in each column of ``df``.

    Only cells that pandas treats as null are counted; text such as the
    literal string 'NaN' held in an object column is not.

    Returns a Series indexed by column name, ordered from the column with
    the most missing values to the one with the fewest.
    """
    null_mask = df.isnull()
    missing_per_column = null_mask.sum()
    return missing_per_column.sort_values(ascending=False)

In [16]:
# Count the missing values per column, largest count first
check_missing_data(sample_data)

C     4
B     3
 A    2
dtype: int64

### How to handle missing values?
- Discard them
   - tradeoffs
       - losing information
       - creating bias
  
- Substitute missing values with something else (e.g. the mean or 0)
    - tradeoffs
        - modifies the variance of the data
- Substitute missing values with approximate values (e.g. estimated by regression or EM)

Discard missing values

In [17]:
# Drop every row that contains at least one NaN (the whole row is removed,
# not just the missing cell); sample_data itself is left untouched
sample_data2 = sample_data.dropna(how='any')

In [18]:
# Only rows without any missing value remain; they keep their original index labels
sample_data2

Unnamed: 0,A,B,C
0,-0.166919,0.979728,-0.632955
8,1.94843,1.025011,-2.982224
9,0.019698,-0.795876,-0.046431


In [28]:
# drop=True discards the previous index labels instead of keeping them as a column.
# The renumbered frame is only displayed here, not assigned, so sample_data2 keeps its old index.
sample_data2.reset_index(drop=True)

Unnamed: 0,A,B,C
0,-0.166919,0.979728,-0.632955
1,1.94843,1.025011,-2.982224
2,0.019698,-0.795876,-0.046431


Replace missing values with the column mean or with 0

In [29]:
# Fill each column's missing cells with that column's mean (computed over the
# values that are present); the result is a new frame
sample_data3 = sample_data.fillna(value=sample_data.mean())
sample_data3

Unnamed: 0,A,B,C
0,-0.166919,0.979728,-0.632955
1,-0.297953,-0.133951,-1.365463
2,-0.120211,-0.540679,-0.505278
3,-0.151121,-2.027325,1.533582
4,-0.151121,-0.133951,0.461821
5,-0.788073,-0.133951,-0.505278
6,-0.91608,-0.612343,-0.505278
7,-0.887858,1.033826,-0.505278
8,1.94843,1.025011,-2.982224
9,0.019698,-0.795876,-0.046431


In [31]:
# Alternative: replace every missing cell with the constant 0
sample_data4 = sample_data.fillna(value=0)
sample_data4

Unnamed: 0,A,B,C
0,-0.166919,0.979728,-0.632955
1,-0.297953,0.0,-1.365463
2,-0.120211,-0.540679,0.0
3,0.0,-2.027325,1.533582
4,0.0,0.0,0.461821
5,-0.788073,0.0,0.0
6,-0.91608,-0.612343,0.0
7,-0.887858,1.033826,0.0
8,1.94843,1.025011,-2.982224
9,0.019698,-0.795876,-0.046431


### Reference
-  [An awesome link](https://honingds.com/blog/pandas-read_csv/) to learn about how to use read_csv
- https://www.kdnuggets.com/2019/12/essential-toolbox-data-cleaning.html