In [100]:
%config IPCompleter.greedy=True
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [101]:
# Tiny in-memory CSV with two deliberately missing cells:
# column C is empty in the second data row, column D in the third.
csv_data = "\n".join((
    "A,B,C,D",
    "1.0,2.0,3.0,4.0",
    "5.0,6.0,,8.0",
    "10.0,11.0,12.0,",
))
# StringIO lets read_csv treat the string like a file; empty fields become NaN.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [102]:
# Count the missing (NaN) entries in each column
df.isna().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

In [103]:
# Keep only the rows that contain no missing value at all
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [104]:
# Keep only the columns that contain no missing value at all
df.dropna(axis='columns', how='any')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [105]:
# Remove only the rows in which every single column is NaN
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [106]:
# Keep rows that have at least 4 non-NaN values; rows with fewer are removed
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [107]:
# Remove rows whose value in column 'C' is NaN; NaNs in other columns are ignored
df.dropna(axis='index', how='any', subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [108]:
# Instead of discarding incomplete rows or columns, replace each NaN with the
# mean of its column (mean imputation).
# SimpleImputer and np are imported in the notebook's first cell.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the per-column means and fills the gaps in a single
# call; `imr` is fitted in place, so it can be reused on new data afterwards.
imputed_data = imr.fit_transform(df.to_numpy())
# print (not a bare expression) keeps the plain-text array formatting
print(imputed_data)

[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]


In [109]:
# The same mean imputation done directly in pandas: df.mean() yields one mean
# per column, and fillna fills each NaN with the mean of its own column
df.fillna(value=df.mean(axis=0))

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [110]:
# Toy data set: one row per item with a color, a size label, a price and a
# class label. Column names are supplied directly to the constructor.
df2 = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [111]:
# Ordinal sizes are encoded as integers that keep their order: M < L < XL
size_mapping = dict(XL=3, L=2, M=1)

In [112]:
# Replace the size labels in df2 with their integer codes from size_mapping.
# A plain .map(size_mapping) turns every value that is not a key into NaN, so
# running this cell a second time (when the column already holds the integer
# codes) would wipe the column. Values without a mapping are therefore passed
# through unchanged, which makes the cell safe to re-run.
df2['size'] = df2['size'].map(lambda label: size_mapping.get(label, label))
# Fail loudly if some label had no entry in size_mapping instead of silently
# leaving it unencoded.
assert df2['size'].isin(list(size_mapping.values())).all(), 'unmapped size label in df2'
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [113]:
# Build the reverse lookup (integer code -> size label) and use it to show the
# original labels; the result is only displayed, df2 itself is not modified
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df2['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [114]:
# Number the distinct class labels 0, 1, ... in the sorted order that
# np.unique returns them
class_mapping = dict(
    (label, code) for code, label in enumerate(np.unique(df2['classlabel']))
)
class_mapping

{'class1': 0, 'class2': 1}

In [115]:
# Encode the class labels in df2 as the integers defined by class_mapping.
# With a plain .map(class_mapping) any value that is not a key becomes NaN, so
# re-running this cell on already-encoded labels would destroy the column.
# Values without a mapping are passed through unchanged to keep the cell
# re-runnable.
df2['classlabel'] = df2['classlabel'].map(lambda label: class_mapping.get(label, label))
# Fail loudly if some label had no entry in class_mapping.
assert df2['classlabel'].isin(list(class_mapping.values())).all(), 'unmapped class label in df2'
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [116]:
# Restore the original string class labels from their integer codes.
inv_class_mapping = {code: label for label, code in class_mapping.items()}
# Codes without an entry are passed through unchanged (a plain dict .map would
# turn them into NaN), so re-running this cell on already-decoded labels
# leaves the column intact.
df2['classlabel'] = df2['classlabel'].map(lambda code: inv_class_mapping.get(code, code))
# Fail loudly if some value could not be decoded.
assert df2['classlabel'].isin(list(inv_class_mapping.values())).all(), 'undecodable class label in df2'
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [118]:
# scikit-learn's LabelEncoder performs the same integer encoding of the class
# labels without a hand-written mapping.
# LabelEncoder is imported in the notebook's first cell.
class_1e = LabelEncoder()
# fit_transform learns the distinct labels and returns their integer codes;
# the fitted encoder is kept in class_1e so the codes can be decoded later.
y = class_1e.fit_transform(df2['classlabel'].values)
y

array([1, 0, 1])

In [119]:
# Map the integer codes back to the original class label strings
class_1e.inverse_transform(y=y)

array(['class2', 'class1', 'class2'], dtype=object)