In [1]:
import pandas as pd
from io import StringIO
import sys
  
# Small in-memory CSV used as sample data: columns A-D, with one empty
# field in column C (second data row) and one in column D (third data row).
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Python 2.7 only: the text has to be converted to unicode there.
# On Python 3 this branch is skipped.
if sys.version_info[0] < 3:
    csv_data = unicode(csv_data)

In [2]:
# Step 1: Read the CSV data (held in an in-memory string) into a pandas DataFrame

In [3]:
# StringIO wraps the CSV text in a file-like buffer that read_csv can parse.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))

In [4]:
# Step 2: Check the number of missing values for the columns

In [5]:
# isnull() flags each missing cell; summing down the rows (axis=0)
# gives the number of missing values per column.
df.isnull().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [6]:
# Step 3: Access the underlying NumPy array via the values attribute

In [7]:
# The DataFrame's contents as a 2-D NumPy array; the missing
# entries show up as nan in the displayed result.
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [8]:
# Step 4: Remove rows from df that contain missing values

In [9]:
# axis=0 (the default) works row-wise: any row containing a NaN is dropped.
# A new DataFrame is returned; df itself is left unchanged.
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [10]:
# Step 5: Remove columns from df that contain missing values

In [11]:
# Column-wise variant: any column containing a NaN is dropped.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [12]:
# Step 6: Only drop rows where all columns are NaN

In [13]:
# how='all' drops a row only when every one of its columns is NaN;
# rows with at least one real value are kept. This replaces the
# hand-built boolean mask with the built-in dropna option.
df.dropna(how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [14]:
# Step 7: Drop rows that have fewer than 3 non-missing values

In [15]:
# thresh=3 keeps only the rows that have at least 3 non-NaN values,
# i.e. rows with fewer than 3 real values are dropped. This replaces
# the hand-built null-count comparison with the built-in dropna option.
df.dropna(thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [16]:
# Step 8: Only drop rows where NaN appears in specific columns (here: 'C')


In [17]:
# Only column 'C' is inspected: rows whose 'C' value is NaN are dropped,
# while NaNs in other columns are ignored.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [18]:
# B. Imputing missing values

In [19]:
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of the values
# present in its column (e.g. column C: (3 + 12) / 2 = 7.5).
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)
imputed_data = imputer.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])