# In [10]:
import pandas as pd
import os

# Step 1: Define the file path for the dataset (Replace with your actual file path)
# Example: Replace 'your_dataset.csv' with the actual path to your dataset file
file_path = 'path/to/your/dataset.csv'  # Update this path accordingly


def iqr_outlier_summary(frame):
    """Count IQR-based outliers in the numeric columns of *frame*.

    Only numeric columns are considered: quantiles and ``<`` / ``>``
    comparisons are not defined for string/object columns, and calling
    ``DataFrame.quantile`` on a frame that contains them raises ``TypeError``
    when ``numeric_only`` is False.

    Args:
        frame: The pandas DataFrame to inspect.

    Returns:
        A tuple ``(q1, q3, iqr, counts)`` of Series indexed by numeric column
        name: the 25th percentile, the 75th percentile, their difference, and
        the number of values lying outside ``[q1 - 1.5*iqr, q3 + 1.5*iqr]``.
    """
    numeric = frame.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)  # 25th percentile
    q3 = numeric.quantile(0.75)  # 75th percentile
    iqr = q3 - q1
    below = numeric < (q1 - 1.5 * iqr)
    above = numeric > (q3 + 1.5 * iqr)
    # Comparisons involving NaN evaluate to False, so missing values are
    # never counted as outliers.
    counts = (below | above).sum()
    return q1, q3, iqr, counts


# Step 2: Check if the file exists at the provided location
if os.path.exists(file_path):
    print(f"✅ File found at: {os.path.abspath(file_path)}")

    # Step 3: Load the dataset into a pandas DataFrame
    df = pd.read_csv(file_path)
    print("✅ Dataset loaded successfully.")

    # Step 4: Perform Basic Data Quality Checks

    # (a) Basic Overview: Data types and non-null counts
    print("\n✅ Basic Data Overview:")
    # info() writes its report to stdout itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()

    # (b) Show the first 5 rows of the dataset
    print("\n✅ First 5 Rows of the Dataset:")
    print(df.head())

    # (c) Missing Values Check
    print("\n✅ Missing Values in Each Column:")
    print(df.isnull().sum())  # Number of missing values in each column

    # (d) Duplicates Check
    print("\n✅ Duplicates in the Dataset:")
    duplicate_count = df.duplicated().sum()  # Number of fully duplicated rows
    print(f"Total Duplicates: {duplicate_count}")

    if duplicate_count > 0:
        # The de-duplicated copy is kept in df_no_duplicates; df itself is
        # left untouched, so the checks below still run on the original data.
        df_no_duplicates = df.drop_duplicates()
        print("✅ Duplicates Removed. New Shape of the DataFrame:", df_no_duplicates.shape)
    else:
        print("✅ No duplicates found.")

    # (e) Basic Statistical Summary for Numerical Columns
    print("\n✅ Statistical Summary of the Dataset (Numerical Columns):")
    print(df.describe())  # count, mean, std, min, quartiles, max

    # (f) Checking for Outliers using IQR (Interquartile Range) method
    # Restricted to numeric columns so that text columns do not raise.
    print("\n✅ Outliers Detection using IQR:")
    Q1, Q3, IQR, outliers = iqr_outlier_summary(df)
    print(outliers)  # Number of outliers for each numeric column

else:
    # If the file is not found, print an error message
    print(f"❌ The dataset file was not found at the path: {os.path.abspath(file_path)}")
   

# Output:
# ❌ The dataset file was not found at the path: /workspaces/AI_DATA_ANALYSIS_/src/Module 4/Advanced Data Quality & Validation/path/to/your/dataset.csv
