# **Day-10 : Pandas Data Cleaning**

In [17]:
import pandas as pd

data = {'A': [10, 20, None, 30, 40],
        'B': [None, 'chennai', 'coimbatore', 'london', 'america']}

df = pd.DataFrame(data)
print(df)

      A           B
0  10.0         NaN
1  20.0     chennai
2   NaN  coimbatore
3  30.0      london
4  40.0     america


In [18]:
# Handling Missing Values : Dropping rows or columns with missing values.
clean_df = df.dropna()
print(clean_df)

      A        B
1  20.0  chennai
3  30.0   london
4  40.0  america


In [19]:
# filling missing value of A with the mean of the columns
df['A'].fillna(df['A'].mean(), inplace=True)
print(df)

      A           B
0  10.0         NaN
1  20.0     chennai
2   NaN  coimbatore
3  30.0      london
4  40.0     america


C:\Users\Lenovo\AppData\Local\Temp\ipykernel_33152\1792550960.py:2: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df['A'].fillna(df['A'].mean(), inplace=True)


In [20]:
# Removing Duplicates : Removing duplicate rows.
x1 = df.drop_duplicates()
print(x1)

      A           B
0  10.0         NaN
1  20.0     chennai
2   NaN  coimbatore
3  30.0      london
4  40.0     america


In [24]:
# Sample data
data = {'A' : [10,20,30,40,50]}
df = pd.DataFrame(data)
data1 = {'B' : ['Chennai', 'Coimbatore', 'Madurai', 'Trichy', 'Salem']}
df1 = pd.DataFrame(data1)

print(df)
print(df1)

    A
0  10
1  20
2  30
3  40
4  50
            B
0     Chennai
1  Coimbatore
2     Madurai
3      Trichy
4       Salem


In [26]:
# Data Type Conversion : Converting data types.
df['A'] = df['A'].astype(float)
print(df)

      A
0  10.0
1  20.0
2  30.0
3  40.0
4  50.0


In [29]:
# Convert 'B' column to uppercase
df1['B'] = df1['B'].str.lower()
print(df1)

            B
0     chennai
1  coimbatore
2     madurai
3      trichy
4       salem


In [37]:
# Removing Irrelevant Columns
# Sample
data = {'A': [1, 2, 3, 4, 5],
        'B': ['apple', 'banana', 'cherry', 'date', 'chocolate'],
        'C': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
print(df)

# Remove the 'C' column
print("After removing")
df.drop('C', axis=1, inplace=True)
print(df)


   A          B   C
0  1      apple  10
1  2     banana  20
2  3     cherry  30
3  4       date  40
4  5  chocolate  50
After removing
   A          B
0  1      apple
1  2     banana
2  3     cherry
3  4       date
4  5  chocolate


In [38]:
# Data transformation
#apply()
data = {'A': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

def double_value(x):
    return x * 2

df['A_doubled'] = df['A'].apply(double_value)
print(df)

    A  A_doubled
0  10         20
1  20         40
2  30         60
3  40         80
4  50        100


In [32]:
# map()
data = {'Category': ['A', 'B', 'A', 'C', 'B']}
df = pd.DataFrame(data)

category_mapping = {'A': 1, 'B': 2, 'C': 3}

df['Category_Num'] = df['Category'].map(category_mapping)
print(df)

  Category  Category_Num
0        A             1
1        B             2
2        A             1
3        C             3
4        B             2


In [None]:
# applymap()
data = {'A': [1, 2, 3],
        'B': [4, 5, 6]}
df = pd.DataFrame(data)

def square(x):
    return x ** 2

df_squared = df.applymap(square)
print(df_squared)