<a href="https://colab.research.google.com/github/Mujthaba-GM/Student-new1234/blob/main/Data_Cleani.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Missing-value cleanup: discard every row that contains at least one NaN
import pandas as pd
import numpy as np

# Step 1: Assemble sample employee records; np.nan marks the missing entries
employee_records = {
    'Emp_ID': [101, 102, 103, 104, 105],
    'Name': ['Alice', 'Bob', np.nan, 'David', 'Eve'],
    'Age': [25, np.nan, 30, 22, np.nan],
    'Salary': [50000, 60000, 55000, np.nan, 52000],
}
df = pd.DataFrame(employee_records)

# sep="\n" reproduces the blank line that separate print() calls would emit
print("📋 Original DataFrame (with NaN values):\n", df, sep="\n")

# Step 2: Keep only the fully populated rows. dropna returns a new frame, so
# `df` itself is left untouched; the arguments spelled out here are the defaults.
df_dropped = df.dropna(axis='index', how='any')

print("\n✅ DataFrame after Dropping Rows with NaN:\n", df_dropped, sep="\n")


📋 Original DataFrame (with NaN values):

   Emp_ID   Name   Age   Salary
0     101  Alice  25.0  50000.0
1     102    Bob   NaN  60000.0
2     103    NaN  30.0  55000.0
3     104  David  22.0      NaN
4     105    Eve   NaN  52000.0

✅ DataFrame after Dropping Rows with NaN:

   Emp_ID   Name   Age   Salary
0     101  Alice  25.0  50000.0


In [None]:
# De-duplication: keep a single copy of each fully repeated row
import pandas as pd

# Step 1: Sample records in which rows 3 and 5 repeat rows 0 and 1 exactly
employee_rows = {
    'Emp_ID': [101, 102, 103, 101, 104, 102],
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Bob'],
    'Department': ['HR', 'Finance', 'IT', 'HR', 'IT', 'Finance'],
}
df = pd.DataFrame(employee_rows)

# sep="\n" reproduces the blank line that separate print() calls would emit
print("📋 Original DataFrame (with duplicates):\n", df, sep="\n")

# Step 2: Flag every row that repeats an earlier one (all columns compared),
# then keep the unflagged rows. The surviving rows keep their original index
# labels, so the result's index has gaps where repeats were removed.
is_repeat = df.duplicated(keep='first')
df_no_duplicates = df[~is_repeat]

print("\n✅ DataFrame after Removing Duplicates:\n", df_no_duplicates, sep="\n")


📋 Original DataFrame (with duplicates):

   Emp_ID     Name Department
0     101    Alice         HR
1     102      Bob    Finance
2     103  Charlie         IT
3     101    Alice         HR
4     104    David         IT
5     102      Bob    Finance

✅ DataFrame after Removing Duplicates:

   Emp_ID     Name Department
0     101    Alice         HR
1     102      Bob    Finance
2     103  Charlie         IT
4     104    David         IT


In [None]:
# Fixing incorrect data types: parse text columns into datetime and integer dtypes
import pandas as pd

# Step 1: Create a DataFrame whose columns are all stored as strings,
# including one unparseable entry per column
df = pd.DataFrame({
    'JoinDate': ['2023-01-01', '2022-05-20', 'invalid_date'],
    'Emp_Age': ['30', '25', 'not_available']
})

print("📋 Original DataFrame with String Types:\n")
print(df)
# sep="" stops print() from inserting a space between the header and the
# dtype listing, which would otherwise indent (misalign) the first dtype row.
print("\nData Types Before Correction:\n", df.dtypes, sep="")

# Step 2: Convert the strings to proper dtypes on a copy, leaving `df` untouched
df_corrected = df.copy()

# JoinDate -> datetime; errors='coerce' turns unparseable text into NaT
df_corrected['JoinDate'] = pd.to_datetime(df_corrected['JoinDate'], errors='coerce')

# Emp_Age -> integer. to_numeric with errors='coerce' alone produces float64
# here (ages would show as 30.0 / 25.0) because the coerced NaN cannot be held
# in a NumPy integer column. Casting to pandas' nullable 'Int64' dtype keeps
# the ages as whole numbers and represents the invalid entry as <NA>.
# Note: the cast raises if a parsed value is not a whole number (e.g. '30.5').
df_corrected['Emp_Age'] = (
    pd.to_numeric(df_corrected['Emp_Age'], errors='coerce').astype('Int64')
)

print("\n✅ DataFrame After Type Conversion:\n")
print(df_corrected)
print("\nData Types After Correction:\n", df_corrected.dtypes, sep="")


📋 Original DataFrame with String Types:

       JoinDate        Emp_Age
0    2023-01-01             30
1    2022-05-20             25
2  invalid_date  not_available

Data Types Before Correction:
 JoinDate    object
Emp_Age     object
dtype: object

✅ DataFrame After Type Conversion:

    JoinDate  Emp_Age
0 2023-01-01     30.0
1 2022-05-20     25.0
2        NaT      NaN

Data Types After Correction:
 JoinDate    datetime64[ns]
Emp_Age            float64
dtype: object


In [None]:
# Text standardization: collapse inconsistent capitalisation to lowercase
import pandas as pd

# Step 1: Sample city names written with mixed upper/lower case
city_names = ['LONDON', 'London', 'london', 'LoNDoN', 'NEW YORK']
df = pd.DataFrame({'City': city_names})

# sep="\n" reproduces the blank line that separate print() calls would emit
print("📋 Original DataFrame:\n", df, sep="\n")

# Step 2: Build a new frame whose City column is lowercased. assign() returns
# a new DataFrame, so the original `df` keeps its mixed-case values.
df_standardized = df.assign(City=df['City'].str.lower())

print("\n✅ After Standardizing Text Data (lowercase):\n", df_standardized, sep="\n")


📋 Original DataFrame:

       City
0    LONDON
1    London
2    london
3    LoNDoN
4  NEW YORK

✅ After Standardizing Text Data (lowercase):

       City
0    london
1    london
2    london
3    london
4  new york


In [None]:
# Outlier removal with the interquartile-range (IQR) rule
import pandas as pd

# Step 1: Sample salaries; the final value (100000) sits far above the rest
salary_values = [50000, 52000, 51000, 49000, 50500, 100000]
df = pd.DataFrame({'Salary': salary_values})

# sep="\n" reproduces the blank line that separate print() calls would emit
print("📋 Original DataFrame:\n", df, sep="\n")

# Step 2: First and third quartiles, and the spread between them
salaries = df['Salary']
Q1 = salaries.quantile(0.25)
Q3 = salaries.quantile(0.75)
IQR = Q3 - Q1

# Step 3: Fences placed 1.5 * IQR beyond each quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Step 4: Keep the rows whose salary lies within the fences; between() is
# inclusive on both ends by default, matching a >= / <= comparison pair.
df_iqr_cleaned = df[salaries.between(lower_bound, upper_bound)]

print("\n✅ DataFrame after Removing Outliers using IQR:\n", df_iqr_cleaned, sep="\n")


📋 Original DataFrame:

   Salary
0   50000
1   52000
2   51000
3   49000
4   50500
5  100000

✅ DataFrame after Removing Outliers using IQR:

   Salary
0   50000
1   52000
2   51000
3   49000
4   50500
