### **Mounting the google drive to the colab environment:**

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
%cd '/content/drive/MyDrive/Assessment_Final_Data'


/content/drive/MyDrive/Assessment_Final_Data


# TASK 1: DATA HANDLING

## LOADING THE DATASET


In [6]:
%ls

combined_output.csv
PRSA_Data_Dingling_20130301-20170228.csv
PRSA_Data_Shunyi_20130301-20170228.csv
PRSA_Data_Tiantan_20130301-20170228.csv
PRSA_Data_Wanliu_20130301-20170228.csv
PRSA_Data_Wanshouxigong_20130301-20170228.csv


In [7]:
import pandas as pd
import glob

# Get all CSV files in the folder
csv_files = glob.glob("/content/drive/MyDrive/Assessment_Final_Data/*.csv")
csv_files



['/content/drive/MyDrive/Assessment_Final_Data/PRSA_Data_Dingling_20130301-20170228.csv',
 '/content/drive/MyDrive/Assessment_Final_Data/PRSA_Data_Wanshouxigong_20130301-20170228.csv',
 '/content/drive/MyDrive/Assessment_Final_Data/PRSA_Data_Tiantan_20130301-20170228.csv',
 '/content/drive/MyDrive/Assessment_Final_Data/PRSA_Data_Shunyi_20130301-20170228.csv',
 '/content/drive/MyDrive/Assessment_Final_Data/PRSA_Data_Wanliu_20130301-20170228.csv',
 '/content/drive/MyDrive/Assessment_Final_Data/combined_output.csv']

## COMBINING ALL THE DATASETS TO ONE DATAFRAME

In [8]:
# Read and combine all CSV files
combined_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)
combined_df

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,4.0,4.0,3.0,,200.0,82.0,-2.3,1020.8,-19.7,0.0,E,0.5,Dingling
1,2,2013,3,1,1,7.0,7.0,3.0,,200.0,80.0,-2.5,1021.3,-19.0,0.0,ENE,0.7,Dingling
2,3,2013,3,1,2,5.0,5.0,3.0,2.0,200.0,79.0,-3.0,1021.3,-19.9,0.0,ENE,0.2,Dingling
3,4,2013,3,1,3,6.0,6.0,3.0,,200.0,79.0,-3.6,1021.8,-19.1,0.0,NNE,1.0,Dingling
4,5,2013,3,1,4,5.0,5.0,3.0,,200.0,81.0,-3.5,1022.3,-19.4,0.0,N,2.1,Dingling
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350635,35060,2017,2,28,19,11.0,32.0,3.0,24.0,400.0,72.0,12.5,1013.5,-16.2,0.0,NW,2.4,Wanshouxigong
350636,35061,2017,2,28,20,13.0,32.0,3.0,41.0,500.0,50.0,11.6,1013.6,-15.1,0.0,WNW,0.9,Wanshouxigong
350637,35062,2017,2,28,21,14.0,28.0,4.0,38.0,500.0,54.0,10.8,1014.2,-13.3,0.0,NW,1.1,Wanshouxigong
350638,35063,2017,2,28,22,12.0,23.0,4.0,30.0,400.0,59.0,10.5,1014.4,-12.9,0.0,NNW,1.2,Wanshouxigong


In [9]:

# Save to a new CSV file
combined_df.to_csv("combined_output.csv", index=False)

print("All CSV files have been merged into 'combined_output.csv'.")

All CSV files have been merged into 'combined_output.csv'.


# TASK2 : EDA(EXPLORATORY DATA ANALYSIS)

## A) FUNDAMENTAL DATA UNDERSTANDING

In [10]:
combined_df.shape

(350640, 18)

In [12]:
combined_df.columns

Index(['No', 'year', 'month', 'day', 'hour', 'PM2.5', 'PM10', 'SO2', 'NO2',
       'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station'],
      dtype='object')

In [11]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350640 entries, 0 to 350639
Data columns (total 18 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   No       350640 non-null  int64  
 1   year     350640 non-null  int64  
 2   month    350640 non-null  int64  
 3   day      350640 non-null  int64  
 4   hour     350640 non-null  int64  
 5   PM2.5    343746 non-null  float64
 6   PM10     345502 non-null  float64
 7   SO2      341864 non-null  float64
 8   NO2      340306 non-null  float64
 9   CO       333790 non-null  float64
 10  O3       337178 non-null  float64
 11  TEMP     350314 non-null  float64
 12  PRES     350320 non-null  float64
 13  DEWP     350308 non-null  float64
 14  RAIN     350318 non-null  float64
 15  wd       348834 non-null  object 
 16  WSPM     350384 non-null  float64
 17  station  350640 non-null  object 
dtypes: float64(11), int64(5), object(2)
memory usage: 48.2+ MB


In [13]:
selected_columns = ['No', 'year', 'month', 'day', 'hour', 'PM2.5', 'PM10', 'SO2', 'NO2','CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station']
df1 = combined_df[selected_columns]
df1

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,4.0,4.0,3.0,,200.0,82.0,-2.3,1020.8,-19.7,0.0,E,0.5,Dingling
1,2,2013,3,1,1,7.0,7.0,3.0,,200.0,80.0,-2.5,1021.3,-19.0,0.0,ENE,0.7,Dingling
2,3,2013,3,1,2,5.0,5.0,3.0,2.0,200.0,79.0,-3.0,1021.3,-19.9,0.0,ENE,0.2,Dingling
3,4,2013,3,1,3,6.0,6.0,3.0,,200.0,79.0,-3.6,1021.8,-19.1,0.0,NNE,1.0,Dingling
4,5,2013,3,1,4,5.0,5.0,3.0,,200.0,81.0,-3.5,1022.3,-19.4,0.0,N,2.1,Dingling
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350635,35060,2017,2,28,19,11.0,32.0,3.0,24.0,400.0,72.0,12.5,1013.5,-16.2,0.0,NW,2.4,Wanshouxigong
350636,35061,2017,2,28,20,13.0,32.0,3.0,41.0,500.0,50.0,11.6,1013.6,-15.1,0.0,WNW,0.9,Wanshouxigong
350637,35062,2017,2,28,21,14.0,28.0,4.0,38.0,500.0,54.0,10.8,1014.2,-13.3,0.0,NW,1.1,Wanshouxigong
350638,35063,2017,2,28,22,12.0,23.0,4.0,30.0,400.0,59.0,10.5,1014.4,-12.9,0.0,NNW,1.2,Wanshouxigong


In [19]:
# Missing values
def missing_values_table(df1):
        # Total missing values
        mis_val = df1.isnull().sum()

        # Percentage of missing values
        mis_val_percent = 100 * df1.isnull().sum() / len(df1)

        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        print(mis_val_table)

        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})

        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns.sort_values(
        '% of Total Values', ascending=False)

        # Return the dataframe with missing information
        return mis_val_table_ren_columns

missing_values= missing_values_table(df1)
missing_values.style.background_gradient(cmap='RdYlGn_r')

             0         1
No           0  0.000000
year         0  0.000000
month        0  0.000000
day          0  0.000000
hour         0  0.000000
PM2.5     6894  1.966119
PM10      5138  1.465321
SO2       8776  2.502852
NO2      10334  2.947182
CO       16850  4.805499
O3       13462  3.839265
TEMP       326  0.092973
PRES       320  0.091262
DEWP       332  0.094684
RAIN       322  0.091832
wd        1806  0.515058
WSPM       256  0.073009
station      0  0.000000


Unnamed: 0,Missing Values,% of Total Values
CO,16850,4.805499
O3,13462,3.839265
NO2,10334,2.947182
SO2,8776,2.502852
PM2.5,6894,1.966119
PM10,5138,1.465321
wd,1806,0.515058
DEWP,332,0.094684
TEMP,326,0.092973
RAIN,322,0.091832


In [20]:
df1.describe()

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,WSPM
count,350640.0,350640.0,350640.0,350640.0,350640.0,343746.0,345502.0,341864.0,340306.0,333790.0,337178.0,350314.0,350320.0,350308.0,350318.0,350384.0
mean,17532.5,2014.66256,6.52293,15.729637,11.5,79.222937,102.332299,15.053055,49.138538,1217.253369,57.010496,13.591802,1011.196055,2.478606,0.063614,1.753711
std,10122.119349,1.177198,3.448708,8.800105,6.922196,80.879994,90.558262,20.710854,35.251708,1163.806797,56.389803,11.408911,10.485373,13.760044,0.800576,1.247104
min,1.0,2013.0,1.0,1.0,0.0,2.0,2.0,0.2856,1.0265,100.0,0.2142,-16.8,982.4,-36.0,0.0,0.0
25%,8766.75,2014.0,4.0,8.0,5.75,20.0,35.0,2.0,21.0,500.0,10.0,3.2,1002.7,-8.9,0.0,0.9
50%,17532.5,2015.0,7.0,16.0,11.5,54.0,80.0,7.0,42.0,900.0,45.0,14.6,1010.9,3.1,0.0,1.4
75%,26298.25,2016.0,10.0,23.0,17.25,111.0,142.0,18.0,70.0,1500.0,82.0,23.3,1019.5,15.1,0.0,2.2
max,35064.0,2017.0,12.0,31.0,23.0,999.0,999.0,411.0,264.0,10000.0,674.0,41.4,1042.8,28.8,72.5,13.2
