<a href="https://colab.research.google.com/github/Mujthaba-GM/AIML-NLPProjects/blob/main/Data_Cleani.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Dropping NaN values with data frames
import pandas as pd
import numpy as np

# Step 1: Build a small employee table in which several cells are missing (NaN).
# Every row except the first has at least one missing value.
df = pd.DataFrame(
    dict(
        Emp_ID=[101, 102, 103, 104, 105],
        Name=['Alice', 'Bob', np.nan, 'David', 'Eve'],
        Age=[25, np.nan, 30, 22, np.nan],
        Salary=[50000, 60000, 55000, np.nan, 52000],
    )
)

print("📋 Original DataFrame (with NaN values):\n")
print(df)

# Step 2: Keep only the rows in which no column is NaN.
# axis=0 / how='any' are the dropna defaults, spelled out for the reader.
# A new frame is returned; `df` itself is left untouched.
df_dropped = df.dropna(axis=0, how='any')

print("\n✅ DataFrame after Dropping Rows with NaN:\n")
print(df_dropped)


📋 Original DataFrame (with NaN values):

   Emp_ID   Name   Age   Salary
0     101  Alice  25.0  50000.0
1     102    Bob   NaN  60000.0
2     103    NaN  30.0  55000.0
3     104  David  22.0      NaN
4     105    Eve   NaN  52000.0

✅ DataFrame after Dropping Rows with NaN:

   Emp_ID   Name   Age   Salary
0     101  Alice  25.0  50000.0


In [2]:
import pandas as pd

# Remote location of the Titanic passenger CSV (requires network access).
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"

# Fetch the CSV and parse it straight into a DataFrame.
# read_csv raises if the download or parsing fails, so the success
# message below is only reached when the data really was loaded.
df = pd.read_csv(filepath_or_buffer=url)

print("✅ Titanic Dataset downloaded and loaded successfully!\n")
# Preview the first five rows only.
print(df.head(n=5))


✅ Titanic Dataset downloaded and loaded successfully!

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [None]:
#Remove duplicates
import pandas as pd

# Step 1: Build an employee table where rows 3 and 5 repeat rows 0 and 1 exactly.
df = pd.DataFrame(
    data=dict(
        Emp_ID=[101, 102, 103, 101, 104, 102],
        Name=['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Bob'],
        Department=['HR', 'Finance', 'IT', 'HR', 'IT', 'Finance'],
    )
)

print("📋 Original DataFrame (with duplicates):\n")
print(df)

# Step 2: Drop rows that are identical across all columns.
# keep='first' (the default) retains the first occurrence of each row, and the
# surviving rows keep their original index labels (0, 1, 2, 4).
df_no_duplicates = df.drop_duplicates(keep='first')

print("\n✅ DataFrame after Removing Duplicates:\n")
print(df_no_duplicates)


📋 Original DataFrame (with duplicates):

   Emp_ID     Name Department
0     101    Alice         HR
1     102      Bob    Finance
2     103  Charlie         IT
3     101    Alice         HR
4     104    David         IT
5     102      Bob    Finance

✅ DataFrame after Removing Duplicates:

   Emp_ID     Name Department
0     101    Alice         HR
1     102      Bob    Finance
2     103  Charlie         IT
4     104    David         IT


In [1]:
#Fix incorrect data types
import pandas as pd

# Step 1️⃣ Create a simple DataFrame
# Both columns hold plain strings, and each contains one entry
# ('oops', 'thirty') that cannot be read as the intended type.
df = pd.DataFrame({
    'Date': ['2024-01-01', '2024-02-15', 'oops'],
    'Age': ['25', 'thirty', '40']
})

print("📋 Original DataFrame:\n")
print(df)
print("\nData Types Before Correction:\n", df.dtypes)

# Step 2️⃣ Fix incorrect types
# errors='coerce' turns unparseable entries into NaT / NaN instead of raising.
# The date format is stated explicitly so parsing does not depend on pandas
# inferring it from the first value in the column; a malformed first entry
# would otherwise defeat the inference.
# NOTE: this overwrites the string columns of `df` in place, so the cell
# must be re-run from Step 1 to get the original strings back.
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')
# Age becomes a float column because the coerced 'thirty' is stored as NaN.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

print("\n✅ DataFrame After Fixing Types:\n")
print(df)
print("\nData Types After Correction:\n", df.dtypes)



📋 Original DataFrame:

         Date     Age
0  2024-01-01      25
1  2024-02-15  thirty
2        oops      40

Data Types Before Correction:
 Date    object
Age     object
dtype: object

✅ DataFrame After Fixing Types:

        Date   Age
0 2024-01-01  25.0
1 2024-02-15   NaN
2        NaT  40.0

Data Types After Correction:
 Date    datetime64[ns]
Age            float64
dtype: object


In [None]:
#standardize
import pandas as pd

# Step 1: Build a one-column table in which the same city appears in several casings.
df = pd.DataFrame(dict(City=['LONDON', 'London', 'london', 'LoNDoN', 'NEW YORK']))

print("📋 Original DataFrame:\n")
print(df)

# Step 2: Produce a lower-cased copy of the table.
# assign() returns a new DataFrame, so the original `df` keeps its mixed casing.
df_standardized = df.assign(City=df['City'].str.lower())

print("\n✅ After Standardizing Text Data (lowercase):\n")
print(df_standardized)


📋 Original DataFrame:

       City
0    LONDON
1    London
2    london
3    LoNDoN
4  NEW YORK

✅ After Standardizing Text Data (lowercase):

       City
0    london
1    london
2    london
3    london
4  new york


In [None]:
#IQR method outliers
import pandas as pd

# Step 1: Build a salary column where the last value sits far above the rest.
df = pd.DataFrame(dict(Salary=[50000, 52000, 51000, 49000, 50500, 100000]))  # 100000 is an outlier

print("📋 Original DataFrame:\n")
print(df)

# Step 2: First and third quartiles, and the interquartile range between them.
Q1, Q3 = (df['Salary'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Step 3: Fences placed 1.5 * IQR below Q1 and above Q3.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Step 4: Keep the rows whose salary lies within the fences.
# between() includes both end points by default, matching >= / <=.
df_iqr_cleaned = df.loc[df['Salary'].between(lower_bound, upper_bound)]

print("\n✅ DataFrame after Removing Outliers using IQR:\n")
print(df_iqr_cleaned)


📋 Original DataFrame:

   Salary
0   50000
1   52000
2   51000
3   49000
4   50500
5  100000

✅ DataFrame after Removing Outliers using IQR:

   Salary
0   50000
1   52000
2   51000
3   49000
4   50500
