3.Select a dataset with missing values and outliers. Apply techniques to clean and preprocess the data using Pandas. Impute missing values and handle outliers appropriately.



In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from scipy.stats import zscore

In [4]:
#load dataset
df = pd.read_csv("C:\Latest Covid-19 Data in Asia.csv")

In [5]:
# Display basic information about the dataset
print("Dataset Info:")
print(df.info())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country/Other      50 non-null     object 
 1   Total Cases        50 non-null     int64  
 2   Total Deaths       50 non-null     int64  
 3   Total Recovered    47 non-null     float64
 4   Active Cases       47 non-null     float64
 5   Tot Cases/ 1M pop  50 non-null     int64  
 6   Deaths/ 1M pop     50 non-null     int64  
 7   Total Tests        48 non-null     float64
 8   Tests/ 1M pop      48 non-null     float64
 9   Population         50 non-null     int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 4.0+ KB
None


In [6]:
# Check for missing values
print("\nMissing values in the dataset:")
print(df.isnull().sum())


Missing values in the dataset:
Country/Other        0
Total Cases          0
Total Deaths         0
Total Recovered      3
Active Cases         3
Tot Cases/ 1M pop    0
Deaths/ 1M pop       0
Total Tests          2
Tests/ 1M pop        2
Population           0
dtype: int64


In [7]:
# Handle missing values
# Assuming you want to fill missing values in numeric columns with the mean
numeric_cols = df.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [8]:
# Check again for missing values after imputation
print("\nMissing values after imputation:")
print(df.isnull().sum())


Missing values after imputation:
Country/Other        0
Total Cases          0
Total Deaths         0
Total Recovered      0
Active Cases         0
Tot Cases/ 1M pop    0
Deaths/ 1M pop       0
Total Tests          0
Tests/ 1M pop        0
Population           0
dtype: int64


In [9]:
# Handle outliers
# Assuming you want to use Z-score to identify and remove outliers in numeric columns
z_scores = np.abs(zscore(df[numeric_cols]))
threshold = 3
outliers_mask = (z_scores > threshold).any(axis=1)
df_no_outliers = df[~outliers_mask]

In [10]:
# Display the number of removed outliers
print(f"\nNumber of outliers removed: {df.shape[0] - df_no_outliers.shape[0]}")



Number of outliers removed: 7


In [13]:
# save the cleaned dataset to a new CSV file
df_no_outliers.to_csv('cleaned_dataset.csv', index=False)

In [16]:
df

Unnamed: 0,Country/Other,Total Cases,Total Deaths,Total Recovered,Active Cases,Tot Cases/ 1M pop,Deaths/ 1M pop,Total Tests,Tests/ 1M pop,Population
0,Afghanistan,225850.0,7946.0,206436.0,11468.0,5542.0,195.0,1305526.0,32034.0,40754390.0
1,Armenia,449357.0,8751.0,435162.0,5444.0,151199.0,2945.0,3242901.0,1091164.0,2971966.0
2,Azerbaijan,832580.0,10302.0,822131.0,147.0,80831.0,1000.0,7680488.0,745664.0,10300200.0
3,Bahrain,728053.0,1573.0,726036.0,444.0,408105.0,882.0,10919040.0,6120596.0,1783983.0
4,Bangladesh,2045517.0,29477.0,1998448.0,17592.0,12184.0,176.0,15254400.0,90862.0,167885700.0
5,Bhutan,62697.0,21.0,61564.0,1112.0,79571.0,27.0,2303734.0,2923739.0,787941.0
6,Brunei,310522.0,225.0,243601.0,66696.0,697127.0,505.0,717784.0,1611437.0,445431.0
7,Cambodia,138940.0,3056.0,135884.0,0.0,8093.0,178.0,3091420.0,180062.0,17168640.0
8,China,503302.0,5272.0,379053.0,118977.0,347.0,4.0,160000000.0,110461.0,1448471000.0
9,Cyprus,660854.0,1364.0,659490.0,0.0,540184.0,1115.0,9640118.0,7879860.0,1223387.0
