In [2]:
#Missing Data & Its Handling

import pandas as pd
import numpy as np

#DataFrame
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', None],
    'Age': [25, None, 30, 22],
    'Score': [85, 90, None, 88]
})

print("Original DataFrame:")
print(df)

Original DataFrame:
      Name   Age  Score
0    Alice  25.0   85.0
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
3     None  22.0   88.0


In [7]:
#  step#01 (checking missing data)

print("False where value exist:\n",df.isnull()) # Boolean DataFrame showing True where NaN(not a number) exists

print("\ncount number of NaNs:\n",df.isnull().sum())   # Count NaNs in each column

print("\ntrue where value exist:\n",df.notnull())      # Opposite of isnull()

False where value exist:
     Name    Age  Score
0  False  False  False
1  False   True  False
2  False  False   True
3   True  False  False

count number of NaNs:
 Name     1
Age      1
Score    1
dtype: int64

true where value exist:
     Name    Age  Score
0   True   True   True
1   True  False   True
2   True   True  False
3  False   True   True


In [11]:
# step#02 (Dropping Missing Data)

print(df.dropna())         # Drop rows have NaN

print("\n\n",df.dropna(axis=1))   # Drop columns have NaN

print("\n\n",df.dropna(thresh=2)) # Keep rows with at least 2 non-NaN values


    Name   Age  Score
0  Alice  25.0   85.0


 Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]


       Name   Age  Score
0    Alice  25.0   85.0
1      Bob   NaN   90.0
2  Charlie  30.0    NaN
3     None  22.0   88.0


In [16]:
# step#03 (Filling Missing Data)

print(df.fillna(0))             # Replace NaN with 0

print("\n\n",df.fillna(method='ffill'))  # Forward fill (fill whatever above it in table)

print("\n\n",df.fillna(method='bfill'))  # Backward fill (fill whatever below it in table)

      Name   Age  Score
0    Alice  25.0   85.0
1      Bob   0.0   90.0
2  Charlie  30.0    0.0
3        0  22.0   88.0


       Name   Age  Score
0    Alice  25.0   85.0
1      Bob  25.0   90.0
2  Charlie  30.0   90.0
3  Charlie  22.0   88.0


       Name   Age  Score
0    Alice  25.0   85.0
1      Bob  30.0   90.0
2  Charlie  30.0   88.0
3     None  22.0   88.0


  print("\n\n",df.fillna(method='ffill'))  # Forward fill (fill whatever above it in table)
  print("\n\n",df.fillna(method='bfill'))  # Backward fill (fill whatever below it in table)


In [17]:
#Concatenation
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})

vertical =pd.concat([df1, df2], axis=0)  # Vertical (default)
horizontal =pd.concat([df1, df2], axis=1)  # Horizontal

print(vertical)
print(horizontal)

   A  B
0  1  3
1  2  4
0  5  7
1  6  8
   A  B  A  B
0  1  3  5  7
1  2  4  6  8


In [3]:
#merging
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'val1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'val2': [4, 5, 6]})

# Inner join
iner =pd.merge(df1, df2, on='key', how='inner')
print("Inner join:\n",iner)

# Outer join
outer =pd.merge(df1, df2, on='key', how='outer')
print("\nOuter join: \n",outer)

# Left join
lj =pd.merge(df1, df2, on='key', how='left')
print("\nLeft join: \n",lj)

# Right join
Rj =pd.merge(df1, df2, on='key', how='right')
print("\nRight join: \n",Rj)

Inner join:
   key  val1  val2
0   A     1     4
1   B     2     5

Outer join: 
   key  val1  val2
0   A   1.0   4.0
1   B   2.0   5.0
2   C   3.0   NaN
3   D   NaN   6.0

Left join: 
   key  val1  val2
0   A     1   4.0
1   B     2   5.0
2   C     3   NaN

Right join: 
   key  val1  val2
0   A   1.0     4
1   B   2.0     5
2   D   NaN     6


In [4]:
#joining
#Join (uses index)

df1.join(df2, lsuffix='_left', rsuffix='_right', how='inner')

Unnamed: 0,key_left,val1,key_right,val2
0,A,1,A,4
1,B,2,B,5
2,C,3,D,6


In [7]:
#GroupBy ---------- used to split data into groups, apply a function, and combine results.

df = pd.DataFrame({
    'Category': ['A', 'A', 'B', 'B'],
    'Data': [10, 15, 20, 25]
})

# Group by Category and get mean
cat=df.groupby('Category').mean()
print("Group by Category\n",cat)

# Multiple aggregations
multi=df.groupby('Category').agg({'Data': ['sum', 'mean']})
print("\nMultiple aggregations:\n",multi)

# Apply custom function
cust=df.groupby('Category').apply(lambda x: x['Data'].max() - x['Data'].min())
print("\ncustom function:\n",cust)

Group by Category
           Data
Category      
A         12.5
B         22.5

Multiple aggregations:
          Data      
          sum  mean
Category           
A          25  12.5
B          45  22.5

custom function:
 Category
A    5
B    5
dtype: int64


  cust=df.groupby('Category').apply(lambda x: x['Data'].max() - x['Data'].min())
