In [1]:
import numpy as np
import pandas as pd

In [2]:
# NaN ("Not a Number") stands for an undefined entry; it is also the usual
# placeholder for a missing value in a dataset.
missing = np.nan

# Eight-element series whose entries at positions 2 and 6 are missing.
series_values = ['row 1', 'row 2', missing, 'row 4', 'row 5', 'row 6', missing, 'row 8']
series_obj = pd.Series(series_values)
print(series_obj)

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object


In [3]:
# Element-wise mask: True wherever the series entry is missing, False otherwise
series_obj.isna()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [4]:
# Fix the seed so the draw is repeatable, then generate 36 uniform random
# values arranged as a 6x6 matrix and wrap them in a DataFrame.
np.random.seed(25)
random_matrix = np.random.rand(6, 6)
DF_obj = pd.DataFrame(random_matrix)
print(DF_obj)

          0         1         2         3         4         5
0  0.870124  0.582277  0.278839  0.185911  0.411100  0.117376
1  0.684969  0.437611  0.556229  0.367080  0.402366  0.113041
2  0.447031  0.585445  0.161985  0.520719  0.326051  0.699186
3  0.366395  0.836375  0.481343  0.516502  0.383048  0.997541
4  0.514244  0.559053  0.034450  0.719930  0.421004  0.436935
5  0.281701  0.900274  0.669612  0.456069  0.289804  0.525819


In [5]:
# Punch holes in the frame: rows 3 through 5 of column 0 and rows 1 through 4
# of column 5 become missing (.loc label slices include both endpoints).
for row_span, col_label in ((slice(3, 5), 0), (slice(1, 4), 5)):
    DF_obj.loc[row_span, col_label] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [None]:
import pandas as pd
import numpy as np

# Small 3x3 frame with gaps in every column, indexed by row label.
row_labels = ['row 1', 'row 2', 'row 3']
data = {
    'column 1': [0.5, np.nan, 0.9],
    'column 2': [np.nan, 0.7, np.nan],
    'column 3': [0.3, np.nan, 0.1],
}
DF_obj = pd.DataFrame(data, index=row_labels)
print("Original DataFrame:", DF_obj, sep="\n")

# fillna with a scalar returns a new frame; DF_obj itself is left untouched.
filled_DF_zero = DF_obj.fillna(value=0)
print("\nDataFrame with missing values filled with 0:", filled_DF_zero, sep="\n")

filled_DF_half = DF_obj.fillna(value=0.5)
print("\nDataFrame with missing values filled with 0.5:", filled_DF_half, sep="\n")

# Fill a single column (as a Series) ...
filled_column_1 = DF_obj['column 1'].fillna(value=0)
print("\n'column 1' with missing values filled with 0:", filled_column_1, sep="\n")

# ... and write it back, so only 'column 1' of DF_obj loses its gaps.
DF_obj['column 1'] = filled_column_1
print("\nUpdated DataFrame after filling 'column 1':", DF_obj, sep="\n")

Original DataFrame:
       column 1  column 2  column 3
row 1       0.5       NaN       0.3
row 2       NaN       0.7       NaN
row 3       0.9       NaN       0.1

DataFrame with missing values filled with 0:
       column 1  column 2  column 3
row 1       0.5       0.0       0.3
row 2       0.0       0.7       0.0
row 3       0.9       0.0       0.1

DataFrame with missing values filled with 0.5:
       column 1  column 2  column 3
row 1       0.5       0.5       0.3
row 2       0.5       0.7       0.5
row 3       0.9       0.5       0.1

'column 1' with missing values filled with 0:
row 1    0.5
row 2    0.0
row 3    0.9
Name: column 1, dtype: float64

Updated DataFrame after filling 'column 1':
       column 1  column 2  column 3
row 1       0.5       NaN       0.3
row 2       0.0       0.7       NaN
row 3       0.9       NaN       0.1


In [None]:
import pandas as pd
import numpy as np

# Frame with gaps whose columns are labelled with the integers 0, 1 and 5
row_labels = ['row 1', 'row 2', 'row 3']
data = {
    0: [0.5, np.nan, 0.9],
    1: [np.nan, 0.7, np.nan],
    5: [0.3, np.nan, np.nan],
}
DF_obj = pd.DataFrame(data, index=row_labels)
print("Original DataFrame:", DF_obj, sep="\n")

# A dict passed to fillna maps column label -> replacement value:
# column 0 gets 0.1, column 5 gets 1.25, and column 1 (not listed) keeps its NaNs.
fill_values = {0: 0.1, 5: 1.25}
filled_DF = DF_obj.fillna(value=fill_values)
print("\nDataFrame after filling missing values in specific columns:", filled_DF, sep="\n")

Original DataFrame:
         0    1    5
row 1  0.5  NaN  0.3
row 2  NaN  0.7  NaN
row 3  0.9  NaN  NaN

DataFrame after filling missing values in specific columns:
         0    1     5
row 1  0.5  NaN  0.30
row 2  0.1  0.7  1.25
row 3  0.9  NaN  1.25


In [8]:
# Rebuild the seeded 6x6 random frame from scratch and re-insert the gaps:
# rows 3 through 5 of column 0 and rows 1 through 4 of column 5.
np.random.seed(25)
random_matrix = np.random.rand(6, 6)
DF_obj = pd.DataFrame(random_matrix)
for row_span, col_label in ((slice(3, 5), 0), (slice(1, 4), 5)):
    DF_obj.loc[row_span, col_label] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [9]:
# Number of missing entries in each column (True counts as 1 when summed)
DF_obj.isna().sum(axis=0)

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

In [10]:
# Drop every row that contains at least one missing value
DF_no_NaN = DF_obj.dropna(axis=0, how='any')
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [None]:
# Drop every column that contains at least one missing value.
# axis='columns' is the named form of axis=1: it tells dropna to remove
# columns. The default, axis=0 (or 'index'), removes rows instead.
DF_no_NaN = DF_obj.dropna(axis='columns')
DF_no_NaN

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804


In [12]:
import pandas as pd
import numpy as np

# Task 1: build the sales table -- five consecutive dates, two of which
# have no recorded revenue.
data = {
    'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'],
    'Revenue': [1000, 1200, np.nan, np.nan, 1500],
}
sales_data = pd.DataFrame(data)
print("Original DataFrame:", sales_data, sep="\n")

# Task 2: replace the missing 'Revenue' entries with 800 and write the
# filled column back into the frame.
revenue_filled = sales_data['Revenue'].fillna(value=800)
sales_data['Revenue'] = revenue_filled
print("\nDataFrame after filling NaN values:", sales_data, sep="\n")


Original DataFrame:
         Date  Revenue
0  2023-01-01   1000.0
1  2023-01-02   1200.0
2  2023-01-03      NaN
3  2023-01-04      NaN
4  2023-01-05   1500.0

DataFrame after filling NaN values:
         Date  Revenue
0  2023-01-01   1000.0
1  2023-01-02   1200.0
2  2023-01-03    800.0
3  2023-01-04    800.0
4  2023-01-05   1500.0
