In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Location of the visa dataset. The default is the original local path; set
# the VISA_CSV_PATH environment variable to point at another copy so the
# notebook can be re-run on a different machine without editing this cell.
VISA_CSV_PATH = os.environ.get(
    "VISA_CSV_PATH",
    r"C:\Users\Acer\OneDrive\Data Science\Data Analysis\Numpy\Visadataset.csv",
)
visa_df = pd.read_csv(VISA_CSV_PATH)

# Column labels split by dtype: object columns vs. all remaining columns.
cat = visa_df.select_dtypes(include='object').columns
num = visa_df.select_dtypes(exclude='object').columns

In [4]:
# Toy table with one missing Age and one missing City, both marked with np.nan.
Data = dict(
    Names=['Ramesh', 'Suresh', 'Mahesh', 'Satish'],
    Age=[np.nan, 22, 25, 27],
    City=['Pune', 'Pune', np.nan, 'Hyd'],
)
df = pd.DataFrame(data=Data)
df

Unnamed: 0,Names,Age,City
0,Ramesh,,Pune
1,Suresh,22.0,Pune
2,Mahesh,25.0,
3,Satish,27.0,Hyd


In [4]:
# Write df to 'data_missing.csv'. No index argument is passed, so the CSV also
# gets the row index as a leading column (pandas default).
df.to_csv('data_missing.csv')

In [5]:
# Same kind of toy table, but missing entries are marked with None instead of
# np.nan, and one Name is missing as well.
Data = {'Names': ['Ramesh', 'Suresh', 'Mahesh', None],
        'Age': [None, 22, 25, 27],
        'City': ['Pune', 'Pune', None, 'Hyd']
       }
df1 = pd.DataFrame(Data)
df1.to_csv('data_missing1.csv')
# Only the last expression of a cell is rendered, so display df1 after writing
# the file rather than before it.
df1

In [6]:
# Column dtypes of df: Age holds np.nan next to numbers, so it is stored as float64.
df.dtypes

Names     object
Age      float64
City      object
dtype: object

In [7]:
# Same check for df1 (built with None instead of np.nan): Age is float64 here too.
df1.dtypes

Names     object
Age      float64
City      object
dtype: object

In [8]:
# Boolean mask with the same shape as df: True marks a missing cell.
df.isnull()

Unnamed: 0,Names,Age,City
0,False,True,False
1,False,False,False
2,False,False,True
3,False,False,False


In [9]:
# Same mask for df1: the None entries are reported as missing as well.
df1.isnull()

Unnamed: 0,Names,Age,City
0,False,True,False
1,False,False,False
2,False,False,True
3,True,False,False


In [10]:
# Count the missing cells per column by summing the boolean mask.
df.isnull().sum()

Names    0
Age      1
City     1
dtype: int64

# fillna

In [11]:
# Not recommended: one value for the whole frame puts the number 30 into the
# text column City as well as into Age.
df.fillna(value=30)

Unnamed: 0,Names,Age,City
0,Ramesh,30.0,Pune
1,Suresh,22.0,Pune
2,Mahesh,25.0,30
3,Satish,27.0,Hyd


# What is the difference between np.nan and None

In [2]:
# Arithmetic with np.nan does not raise; the result is nan.
np.nan + 5

nan

In [3]:
# None does not support arithmetic. The TypeError is the point of this cell,
# so catch it and print the message; that way the notebook still runs from
# top to bottom.
try:
    None + 5
except TypeError as exc:
    print(f"TypeError: {exc}")

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

In [13]:
# Same values with two different "missing" markers. The dtypes are spelled out
# to show what NumPy infers: np.nan keeps the array float, None forces object.
arr1 = np.array([1, 2, np.nan, 3], dtype=float)
arr2 = np.array([1, 2, None, 3], dtype=object)


In [21]:
# The np.nan version is displayed as a float array.
arr1

array([ 1.,  2., nan,  3.])

In [22]:
# The None version is stored with dtype=object.
arr2

array([1, 2, None, 3], dtype=object)

In [25]:
# Both entries are the same call, np.sum(arr1); the NaN in arr1 makes each total nan.
np.sum(arr1), np.sum(arr1)

(nan, nan)

In [26]:
# np.nansum skips the NaN and gives 6.0, while np.sum on the same array gives nan.
np.nansum(arr1), np.sum(arr1)

(6.0, nan)

In [27]:
# Plain np.sum on its own: the NaN in arr1 makes the total nan.
np.sum(arr1)

nan

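    - `np.nan` is a **float** value, so an array that contains it keeps a numeric (float) dtype

    - `None` is a Python **object**, so an array that contains it gets `dtype=object`

    - For mathematical operations on the data, `np.nan` is the better way to mark missing values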

# Fill the data based on column

In [14]:
# Fill a single column: only Names is touched, and the result is a new Series.
df1['Names'].fillna(value='Manish')

0    Ramesh
1    Suresh
2    Mahesh
3    Manish
Name: Names, dtype: object

In [15]:
# df1 still has its missing Name: the fillna result on df1['Names'] was not assigned back.
df1

Unnamed: 0,Names,Age,City
0,Ramesh,,Pune
1,Suresh,22.0,Pune
2,Mahesh,25.0,
3,,27.0,Hyd


# Methods

    - pad

    - bfill

    - ffill

    - backfill

In [16]:
# Forward fill ("pad"): each missing cell takes the value from the row above it
# in the same column. Age in row 0 stays NaN because there is no row above.
# DataFrame.ffill() replaces fillna(method='pad'), whose `method` argument is
# deprecated, so the blanket warnings filter is no longer needed here.
df1.ffill()

Unnamed: 0,Names,Age,City
0,Ramesh,,Pune
1,Suresh,22.0,Pune
2,Mahesh,25.0,Pune
3,Mahesh,27.0,Hyd


In [36]:
# Backward fill: each missing cell takes the value from the row below it in the
# same column. Names in the last row stays missing because no row follows it.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated, so the blanket warnings filter is no longer needed here.
df1.bfill()

Unnamed: 0,Names,Age,City
0,Ramesh,22.0,Pune
1,Suresh,22.0,Pune
2,Mahesh,25.0,Hyd
3,,27.0,Hyd


In [21]:
# Forward fill again: 'ffill' is the same operation as 'pad', so each missing
# cell takes the value from the row ABOVE it (not below) in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated, so the blanket warnings filter is no longer needed here.
df1.ffill()

Unnamed: 0,Names,Age,City
0,Ramesh,,Pune
1,Suresh,22.0,Pune
2,Mahesh,25.0,Pune
3,Mahesh,27.0,Hyd


In [None]:
# idea-1: filling with a random (arbitrary) value = df.fillna(<random value>)
# idea-2: filling with a random (arbitrary) value based on column = df[<col>].fillna(<random value>)
# idea-3: filling with some pattern using a method = df.fillna(method=<method>), i.e. df.ffill() / df.bfill()
# idea-4: filling with mean-median-mode based on column = df[<column>].fillna(<mean> or <median> or <mode>)
# idea-5: filling with the average value of only selected neighbours
# idea-6: filling with a value based on correlation with other columns

In [18]:
# Forward fill along axis=1, i.e. across the columns of each row, left to right.
# This mixes column types: Age in row 0 receives the name 'Ramesh'. Names in
# row 3 stays missing because it is the first column and has nothing to its left.
# DataFrame.ffill(axis=1) replaces the deprecated fillna(method='pad', axis=1).
df1.ffill(axis=1)

Unnamed: 0,Names,Age,City
0,Ramesh,Ramesh,Pune
1,Suresh,22.0,Pune
2,Mahesh,25.0,25.0
3,,27.0,Hyd


# Mean-Median-Mode

In [10]:
# Small example frame: Age has two missing values and City has one.
data1 = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [30, np.nan, 33, np.nan],
    # A real missing value is needed here: the string 'nan' is ordinary text,
    # so isnull()/fillna() would never treat it as missing.
    'City': ['Hyd', np.nan, 'Hyd', 'Blr'],
})
age_mean = data1['Age'].mean()  # a single number; mean() skips the NaN entries
print('age_mean', age_mean)
data1['Age'].fillna(age_mean)  # returns a new Series; data1 is not modified

age_mean 31.5


0    30.0
1    31.5
2    33.0
3    31.5
Name: Age, dtype: float64

In [11]:
# mode() returns a Series rather than a single value.
city_mode = data1['City'].mode() # Series
# The Series is passed straight to fillna. A Series value is matched on index
# labels (per the pandas docs), so rows with other labels are not filled from it.
data1['City'].fillna(city_mode)

0    Hyd
1    nan
2    Hyd
3    Blr
Name: City, dtype: object

- If we impute with the result of `mode()` directly, it will not work as expected and may raise an error in the future

- `mode()` returns a Series

- `mean()` returns a single fixed (scalar) value

- So first convert the Series to a single value, then pass that value to `fillna`

In [12]:
# Pull the first element out of the mode Series to get a plain value ('Hyd').
city_mode.values[0]

'Hyd'

In [14]:
# Recompute the mode and display it: a one-element Series, not a scalar.
city_mode = data1['City'].mode()
city_mode

0    Hyd
Name: City, dtype: object

# impute module
- under sklearn (`sklearn.impute`)

In [15]:
from sklearn import impute

In [16]:
# List the names the impute module exposes; the public ones are
# KNNImputer, MissingIndicator and SimpleImputer.
dir(impute)

['KNNImputer',
 'MissingIndicator',
 'SimpleImputer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_knn',
 'typing']

# KNN imputer

- K - Nearest Neighbours

- where k = hyperparameter, which means the user can change it

- idea: Instead of taking the average of all samples

        - first fix the number of neighbours, i.e. K

        - then find the neighbours using a distance metric

        - then take the average of those samples to fill the missing values

In [18]:
import numpy as np
from sklearn.impute import KNNImputer
# Four samples with three features; each of the first three rows has one NaN.
X = [
    [1, 2, np.nan],
    [3, np.nan, 3],
    [np.nan, 60, 5],
    [8, 8, 7],
]
# Render the nested list as a table to make the gaps easy to see.
pd.DataFrame(data=X)

Unnamed: 0,0,1,2
0,1.0,2.0,
1,3.0,,3.0
2,,60.0,5.0
3,8.0,8.0,7.0


In [28]:
import numpy as np
from sklearn.impute import KNNImputer
# 4x3 sample matrix with one NaN in each of the first three rows.
X = [
    [1, 2, np.nan],
    [3, np.nan, 3],
    [np.nan, 60, 5],
    [8, 8, 7],
]
# Impute each gap from the 2 nearest rows, using weights='distance'.
imputer = KNNImputer(weights='distance', n_neighbors=2)
imputer.fit_transform(X)

array([[ 1.        ,  2.        ,  3.93905505],
       [ 3.        , 31.        ,  3.        ],
       [ 3.25775362, 60.        ,  5.        ],
       [ 8.        ,  8.        ,  7.        ]])

In [None]:
# weights = 'uniform': every neighbour's value has equal importance

In [None]:
# Learn the following
from sklearn.preprocessing import LabelEncoder
# LabelEncoder has to be instantiated, and fit_transform needs the labels to
# encode; a small inline list is used so this cell runs on its own.
le = LabelEncoder()
le.fit_transform(['Pune', 'Pune', 'Hyd'])

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.impute import KNNImputer