Handling Missing Data

In [None]:
# Missing-data cheat sheet for a pandas DataFrame named `df`.
# The column names used here ('important_column', 'column', 'numeric_column')
# are placeholders -- substitute real column names before running.
# The snippets are independent alternatives, not a pipeline to run in order.
# In a notebook only the last expression of a cell is rendered; assign the
# other results to a name (or display them) if you want to see them.

# Identifying missing data
df.isnull().sum()  # Count nulls per column
df.isnull().sum() / len(df) * 100  # Percentage of nulls

# SQL: SELECT * FROM table WHERE column IS NOT NULL
# Returns a new DataFrame; `df` itself is left unchanged.
df.dropna(subset=['important_column'])

# SQL: UPDATE table SET column = 'Default' WHERE column IS NULL
# Assign the result back instead of `df['column'].fillna(..., inplace=True)`:
# the in-place form operates on a temporary selection, triggers a
# chained-assignment warning in pandas 2.x and does not update `df` once
# Copy-on-Write is enabled.
df['column'] = df['column'].fillna('Default Value')

# Fill with statistical measures (pick one: after the first fill there are
# no nulls left, so the second line has nothing to replace)
df['numeric_column'] = df['numeric_column'].fillna(df['numeric_column'].mean())
df['numeric_column'] = df['numeric_column'].fillna(df['numeric_column'].median())

# Forward fill and backward fill
# `fillna(method=...)` is deprecated; the dedicated methods are the
# supported spelling. Both return a new Series.
df['column'].ffill()  # Use previous valid value
df['column'].bfill()  # Use next valid value

# Interpolation for numeric data (returns a new Series)
df['numeric_column'].interpolate()

In [11]:
import pandas as pd
import seaborn as sns

# Fetch one of seaborn's bundled example datasets by name and print a quick
# overview of it: dimensions, column names, and a preview of the first rows.
dataset = 'taxis'
df = sns.load_dataset(dataset)

# Summary header and dimensions
print(f"\nLoaded '{dataset}' dataset:")
print("Shape:", df.shape)

# Column names as a plain Python list
print("\nColumns:", list(df.columns))

# Preview of the first five rows
print("\nFirst few rows:")
print(df.head(5))




Loaded 'taxis' dataset:
Shape: (6433, 14)

Columns: ['pickup', 'dropoff', 'passengers', 'distance', 'fare', 'tip', 'tolls', 'total', 'color', 'payment', 'pickup_zone', 'dropoff_zone', 'pickup_borough', 'dropoff_borough']

First few rows:
               pickup             dropoff  passengers  distance  fare   tip  \
0 2019-03-23 20:21:09 2019-03-23 20:27:24           1      1.60   7.0  2.15   
1 2019-03-04 16:11:55 2019-03-04 16:19:00           1      0.79   5.0  0.00   
2 2019-03-27 17:53:01 2019-03-27 18:00:25           1      1.37   7.5  2.36   
3 2019-03-10 01:23:59 2019-03-10 01:49:51           1      7.70  27.0  6.15   
4 2019-03-30 13:27:42 2019-03-30 13:37:14           3      2.16   9.0  1.10   

   tolls  total   color      payment            pickup_zone  \
0    0.0  12.95  yellow  credit card        Lenox Hill West   
1    0.0   9.30  yellow         cash  Upper West Side South   
2    0.0  14.16  yellow  credit card          Alphabet City   
3    0.0  36.95  yellow  credit ca