In [12]:
import pandas as pd

# Step 1: Convert the CSV file into a DataFrame
df = pd.read_csv('air-pollution.csv')
print("Original DataFrame:")
print(df)



Original DataFrame:
            Entity Code  Year  Nitrogen oxide (NOx)  \
0      Afghanistan  AFG  1750             555.47860   
1      Afghanistan  AFG  1760             578.50757   
2      Afghanistan  AFG  1770             602.47980   
3      Afghanistan  AFG  1780             627.43220   
4      Afghanistan  AFG  1790             653.40310   
...            ...  ...   ...                   ...   
48220     Zimbabwe  ZWE  2018           82210.49000   
48221     Zimbabwe  ZWE  2019           79547.03000   
48222     Zimbabwe  ZWE  2020           68338.91400   
48223     Zimbabwe  ZWE  2021           71148.93000   
48224     Zimbabwe  ZWE  2022           73363.71000   

       Sulphur dioxide (SO₂) emissions  Carbon monoxide (CO) emissions  \
0                            174.87167                       142073.31   
1                            181.99332                       147859.23   
2                            189.38850                       153867.40   
3                      

In [13]:
# Step 2: Create a filter based on the country (India in this case)
filtered_df = df[df['Entity'] == 'India']
print("\nFiltered DataFrame for India:")
print(filtered_df)





Filtered DataFrame for India:
      Entity Code  Year  Nitrogen oxide (NOx)  \
19475  India  IND  1750             30488.506   
19476  India  IND  1760             31085.959   
19477  India  IND  1770             31692.695   
19478  India  IND  1780             32308.560   
19479  India  IND  1790             32933.355   
...      ...  ...   ...                   ...   
19698  India  IND  2018           9818858.000   
19699  India  IND  2019           9464296.000   
19700  India  IND  2020           8575110.000   
19701  India  IND  2021           9056502.000   
19702  India  IND  2022           9350050.000   

       Sulphur dioxide (SO₂) emissions  Carbon monoxide (CO) emissions  \
19475                     3.481388e+04                       8360133.5   
19476                     3.543608e+04                       8509232.0   
19477                     3.606059e+04                       8658843.0   
19478                     3.668625e+04                       8808680.0   
19479     

In [14]:
# Step 3: Ensure the 'Nitrogen oxide (NOx)' column is numeric
filtered_df['Nitrogen oxide (NOx)'] = pd.to_numeric(filtered_df['Nitrogen oxide (NOx)'], errors='coerce')

# Calculate median, mean, and standard deviation for 'Nitrogen oxide (NOx)' and add them as new columns
filtered_df['NOx_median'] = filtered_df['Nitrogen oxide (NOx)'].median()
filtered_df['NOx_mean'] = filtered_df['Nitrogen oxide (NOx)'].mean()
filtered_df['NOx_std'] = filtered_df['Nitrogen oxide (NOx)'].std()

print("\nDataFrame with median, mean, and std columns added:")
print(filtered_df)





DataFrame with median, mean, and std columns added:
      Entity Code  Year  Nitrogen oxide (NOx)  \
19475  India  IND  1750             30488.506   
19476  India  IND  1760             31085.959   
19477  India  IND  1770             31692.695   
19478  India  IND  1780             32308.560   
19479  India  IND  1790             32933.355   
...      ...  ...   ...                   ...   
19698  India  IND  2018           9818858.000   
19699  India  IND  2019           9464296.000   
19700  India  IND  2020           8575110.000   
19701  India  IND  2021           9056502.000   
19702  India  IND  2022           9350050.000   

       Sulphur dioxide (SO₂) emissions  Carbon monoxide (CO) emissions  \
19475                     3.481388e+04                       8360133.5   
19476                     3.543608e+04                       8509232.0   
19477                     3.606059e+04                       8658843.0   
19478                     3.668625e+04                       8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Nitrogen oxide (NOx)'] = pd.to_numeric(filtered_df['Nitrogen oxide (NOx)'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['NOx_median'] = filtered_df['Nitrogen oxide (NOx)'].median()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['NOx_mean'] = 

In [15]:
# Step 4: Delete the repeated entries
# Assuming 'Year' uniquely identifies each row for the same country
df_cleaned = filtered_df.drop_duplicates(subset='Year')
print("\nDataFrame after removing duplicate entries:")
print(df_cleaned)




DataFrame after removing duplicate entries:
      Entity Code  Year  Nitrogen oxide (NOx)  \
19475  India  IND  1750             30488.506   
19476  India  IND  1760             31085.959   
19477  India  IND  1770             31692.695   
19478  India  IND  1780             32308.560   
19479  India  IND  1790             32933.355   
...      ...  ...   ...                   ...   
19698  India  IND  2018           9818858.000   
19699  India  IND  2019           9464296.000   
19700  India  IND  2020           8575110.000   
19701  India  IND  2021           9056502.000   
19702  India  IND  2022           9350050.000   

       Sulphur dioxide (SO₂) emissions  Carbon monoxide (CO) emissions  \
19475                     3.481388e+04                       8360133.5   
19476                     3.543608e+04                       8509232.0   
19477                     3.606059e+04                       8658843.0   
19478                     3.668625e+04                       8808680.0

In [16]:
# Step 5: Change the null values into 0
df_cleaned.fillna(0, inplace=True)
print("\nDataFrame after replacing NaN with 0:")
print(df_cleaned)


DataFrame after replacing NaN with 0:
      Entity Code  Year  Nitrogen oxide (NOx)  \
19475  India  IND  1750             30488.506   
19476  India  IND  1760             31085.959   
19477  India  IND  1770             31692.695   
19478  India  IND  1780             32308.560   
19479  India  IND  1790             32933.355   
...      ...  ...   ...                   ...   
19698  India  IND  2018           9818858.000   
19699  India  IND  2019           9464296.000   
19700  India  IND  2020           8575110.000   
19701  India  IND  2021           9056502.000   
19702  India  IND  2022           9350050.000   

       Sulphur dioxide (SO₂) emissions  Carbon monoxide (CO) emissions  \
19475                     3.481388e+04                       8360133.5   
19476                     3.543608e+04                       8509232.0   
19477                     3.606059e+04                       8658843.0   
19478                     3.668625e+04                       8808680.0   
19