In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Load the dataset into a DataFrame
# Specify the file path
file_path = '1.scraped_airquality_data_stage1.csv'

# Load the scraped data from the CSV file
df = pd.read_csv(file_path)

In [3]:
# Display the first few rows of the DataFrame
print(df.head())

   Year  Country Air Pollutant  Population  Populated Area [km2]  \
0  2018  Austria           NO2     8822183                 43050   
1  2018  Austria            O3     8822183                 43050   
2  2018  Austria          PM10     8822183                 43050   
3  2018  Austria         PM2.5     8822183                 43050   
4  2019  Austria           NO2     8858695                 43050   

   Air Pollution Average [ug/m3]  Premature Deaths  Years Of Life Lost  
0                            9.5            1228.0             12796.0  
1                         7867.9             619.0              6756.0  
2                           15.4               NaN                 NaN  
3                           11.2            5001.0             52145.0  
4                            8.7            1038.0             10735.0  


In [4]:
# Extract and print the unique values from the 'Country' column.
unique_countries_df = print(sorted(df['Country'].unique()))

['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark', 'Estonia', 'European Union Countries', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg', 'Malta', 'Netherlands', 'Poland', 'Slovakia', 'Slovenia', 'Spain']


In [5]:
# Adding Impurities to the Dataset
# The following section of the code is designed to introduce some impurities into our data. It is added to simulate
# conditions where datasets may not be perfect or may contain various types of noise or errors:

In [8]:
# Impurity 1: Adding Duplicates

In [9]:
# Create a list of random row indices to duplicate
num_duplicates = 20  # Adjust the number of duplicates as needed
duplicate_indices = random.choices(range(len(df)), k=num_duplicates)

# Append the selected duplicate rows to the DataFrame
duplicate_rows = df.iloc[duplicate_indices]
df = pd.concat([df, duplicate_rows], ignore_index=True)

In [10]:
# Impurity 2: Changing Data Types Randomly

In [11]:
def change_data_types(df, num_columns_to_change=3):
    """
    Randomly selects a specified number of columns and changes their data types to int, float, or str.
    The function skips columns with non-numeric values when changing to int or float types.
    """ 
    columns_to_change = random.sample(df.columns.tolist(), num_columns_to_change)
    changed_columns = []

    for column_name in columns_to_change:
        new_data_type = random.choice([int, float, str])

        # Check if the current column contains non-numeric values
        if not pd.to_numeric(df[column_name], errors='coerce').notna().all():
            continue

        if new_data_type == int:
            df[column_name] = df[column_name].astype(int)
        elif new_data_type == float:
            df[column_name] = df[column_name].astype(float)
        elif new_data_type == str:
            df[column_name] = df[column_name].astype(str)
        
        changed_columns.append((column_name, new_data_type.__name__))

    return df, changed_columns

# Using the function on the DataFrame
df_with_changed_types, changed_columns_info = change_data_types(df.copy(), num_columns_to_change=5)

# Printing the columns that had their types changed
changed_columns_info

[('Populated Area [km2]', 'float'),
 ('Air Pollution Average [ug/m3]', 'str'),
 ('Population', 'str'),
 ('Year', 'float')]

In [12]:
# Impurity 3:Changing Year formate

In [13]:
# Adding impurity: Inconsistent Formats in 'Year'
year_indices = random.sample(range(len(df)), 10)  # Randomly select 10 indices
df.loc[year_indices, 'Year'] = df.loc[year_indices, 'Year'].apply(lambda x: f"{x} year")

# Displaying the modified DataFrame to verify the changes
df.head()

 '2018 year' '2019 year' '2018 year' '2018 year']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[year_indices, 'Year'] = df.loc[year_indices, 'Year'].apply(lambda x: f"{x} year")


Unnamed: 0,Year,Country,Air Pollutant,Population,Populated Area [km2],Air Pollution Average [ug/m3],Premature Deaths,Years Of Life Lost
0,2018,Austria,NO2,8822183,43050,9.5,1228.0,12796.0
1,2018,Austria,O3,8822183,43050,7867.9,619.0,6756.0
2,2018,Austria,PM10,8822183,43050,15.4,,
3,2018,Austria,PM2.5,8822183,43050,11.2,5001.0,52145.0
4,2019,Austria,NO2,8858695,43050,8.7,1038.0,10735.0


In [14]:
# Impurity 4 : Changing Country names to lowercase

In [15]:
# Introducing an impurity by changing some random country names to lowercase
lowercase_indices = random.sample(range(len(df)), 10)  # Randomly select 10 indices
df.loc[lowercase_indices, 'Country'] = df.loc[lowercase_indices, 'Country'].str.lower()

# Displaying the modified DataFrame to verify the changes
df.head()


Unnamed: 0,Year,Country,Air Pollutant,Population,Populated Area [km2],Air Pollution Average [ug/m3],Premature Deaths,Years Of Life Lost
0,2018,Austria,NO2,8822183,43050,9.5,1228.0,12796.0
1,2018,austria,O3,8822183,43050,7867.9,619.0,6756.0
2,2018,Austria,PM10,8822183,43050,15.4,,
3,2018,Austria,PM2.5,8822183,43050,11.2,5001.0,52145.0
4,2019,Austria,NO2,8858695,43050,8.7,1038.0,10735.0


In [16]:
# Impurity 5: Adding Text Suffixes to Numeric Columns

In [17]:
# Adding impurity: appending " km" to random values in 'Populated Area [km2]' and "ug/m" to 'Air Pollution Average [ug/m3]'
area_indices = random.sample(range(len(df)), 10)  # Randomly select 10 indices for area
pollution_indices = random.sample(range(len(df)), 10)  # Randomly select 10 indices for pollution

df.loc[area_indices, 'Populated Area [km2]'] = df.loc[area_indices, 'Populated Area [km2]'].astype(str) + " km"
df.loc[pollution_indices, 'Air Pollution Average [ug/m3]'] = df.loc[pollution_indices, 'Air Pollution Average [ug/m3]'].astype(str) + "ug/m"

# Displaying the modified DataFrame to verify the changes
df.head()

 '15868 km' '6504 km' '44196 km' '32912 km']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[area_indices, 'Populated Area [km2]'] = df.loc[area_indices, 'Populated Area [km2]'].astype(str) + " km"
 '11.8ug/m' '13.3ug/m' '10.9ug/m' '3515.7ug/m']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df.loc[pollution_indices, 'Air Pollution Average [ug/m3]'] = df.loc[pollution_indices, 'Air Pollution Average [ug/m3]'].astype(str) + "ug/m"


Unnamed: 0,Year,Country,Air Pollutant,Population,Populated Area [km2],Air Pollution Average [ug/m3],Premature Deaths,Years Of Life Lost
0,2018,Austria,NO2,8822183,43050,9.5,1228.0,12796.0
1,2018,austria,O3,8822183,43050,7867.9ug/m,619.0,6756.0
2,2018,Austria,PM10,8822183,43050,15.4,,
3,2018,Austria,PM2.5,8822183,43050,11.2,5001.0,52145.0
4,2019,Austria,NO2,8858695,43050,8.7,1038.0,10735.0


In [18]:
# Save the updated DataFrame to a new CSV file
output_file_path = '2.air_quality_data_with_impurities_stage2.csv'
df.to_csv(output_file_path, index=False)