In [9]:
# 1. Importing Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# 2. Loading the Dataset
# The input location lives in one named constant so it is obvious what has to
# be edited before the script can run.
DATASET_PATH = 'your_dataset.csv'  # Replace with your actual dataset path

try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError as err:
    # Re-raise the same exception type (errno and filename preserved) with a
    # message that tells the user what to change, instead of the bare OS error.
    raise FileNotFoundError(
        err.errno,
        "Dataset not found; set DATASET_PATH to the location of your CSV file",
        DATASET_PATH,
    ) from err

# 3. Exploratory Data Analysis (EDA)
# Print a quick textual overview of the raw data before any cleaning is done.
print("First five rows of the dataset:")
print(df.head())

print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
df.info()

print("\nStatistical Summary:")
print(df.describe())

print("\nMissing Values in Each Column:")
print(df.isnull().sum())

# Visualizing Missing Values
# The boolean null-mask is plotted directly, so missing cells show up as a
# contrasting colour against present ones.
sns.heatmap(df.isnull(), cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

# 4. Handling Missing Values
# Separating numerical and categorical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

# Imputing missing values: column mean for numerical data, most frequent value
# for categorical data.
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# scikit-learn estimators reject input with zero feature columns, so each
# imputer is only run when its column group is non-empty. This lets the script
# handle all-numerical or all-categorical datasets.
# NOTE(review): a column that is entirely missing may be dropped by
# SimpleImputer, which would break the assignment below - confirm against the
# installed scikit-learn version if such columns can occur.
if len(numerical_cols) > 0:
    df[numerical_cols] = numerical_imputer.fit_transform(df[numerical_cols])
if len(categorical_cols) > 0:
    df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

# 5. Removing Duplicates
# Rows are compared across all columns; the first occurrence of each
# duplicated row is the one that is kept.
df = df.drop_duplicates(subset=None, keep='first')

# 6. Data Type Conversion
# Example: Converting 'date_column' to datetime format
# df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce')

# 7. Outlier Detection and Removal using IQR
# A row is kept only if every numerical column lies within
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] for that column.
#
# The bounds for all columns are computed from the same, unfiltered data and
# the rows are dropped once at the end. Filtering `df` inside the loop would
# make each column's quartiles depend on which rows earlier columns had
# already removed, so the result would change with column order.
inlier_mask = np.ones(len(df), dtype=bool)
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # between() is inclusive on both ends, i.e. lower_bound <= x <= upper_bound.
    inlier_mask &= df[col].between(lower_bound, upper_bound).to_numpy()

# Copy so that later column assignments act on an independent frame rather
# than on a filtered selection of the previous one.
df = df[inlier_mask].copy()

# 8. Encoding Categorical Variables
# One-hot encode every categorical column; the first level of each column is
# dropped, so k categories produce k - 1 indicator columns.
df = pd.get_dummies(
    data=df,
    columns=list(categorical_cols),
    drop_first=True,
)

# 9. Feature Scaling
# Standardize the numerical columns with StandardScaler. The scaler is always
# created so the name `scaler` exists afterwards, but it is only fitted when
# there is at least one numerical column: scikit-learn estimators reject
# input with zero feature columns.
scaler = StandardScaler()
if len(numerical_cols) > 0:
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# 10. Saving the Cleaned Dataset
# Write the processed frame to disk without the row index, then confirm on
# stdout.
df.to_csv(path_or_buf='cleaned_dataset.csv', index=False)
print("\nData preprocessing completed. Cleaned dataset saved as 'cleaned_dataset.csv'.")

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'