# Proyek Analisis Data: Air Quality Dataset
## Nama: Malikus Syafaadi Nurfaza
## Email: malikussyafaadinurfaza@gmail.com

# Membuat Pertanyaan Bisnis
1. Bagaimana hubungan antara suhu, kelembapan, dan kualitas udara?
2. Bagaimana distribusi konsentrasi PM2.5 dan PM10 selama periode waktu tertentu (harian, bulanan, atau tahunan)?
3. Apakah ada variasi musiman dalam kualitas udara? Bulan apa yang cenderung memiliki tingkat polusi tertinggi dan terendah?
4. Bagaimana perbedaan dalam kualitas udara pada jam sibuk dibandingkan jam non-sibuk (pagi dan malam)?

# Menyiapkan Library


In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# Data Wrangling
## Gathering Data

In [2]:
# Read the Guanyuan station CSV into a DataFrame, then render it so the
# columns and a sample of rows can be inspected right after loading.
guanyuan_df = pd.read_csv(
    filepath_or_buffer='Data/PRSA_Data_Guanyuan_20130301-20170228.csv',
)
display(guanyuan_df)

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,4.0,4.0,14.0,20.0,300.0,69.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Guanyuan
1,2,2013,3,1,1,4.0,4.0,13.0,17.0,300.0,72.0,-1.1,1023.2,-18.2,0.0,N,4.7,Guanyuan
2,3,2013,3,1,2,3.0,3.0,10.0,19.0,300.0,69.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Guanyuan
3,4,2013,3,1,3,3.0,6.0,7.0,24.0,400.0,62.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Guanyuan
4,5,2013,3,1,4,3.0,6.0,5.0,14.0,400.0,71.0,-2.0,1025.2,-19.5,0.0,N,2.0,Guanyuan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35059,35060,2017,2,28,19,13.0,37.0,3.0,36.0,400.0,60.0,12.5,1013.5,-16.2,0.0,NW,2.4,Guanyuan
35060,35061,2017,2,28,20,20.0,43.0,4.0,48.0,500.0,43.0,11.6,1013.6,-15.1,0.0,WNW,0.9,Guanyuan
35061,35062,2017,2,28,21,16.0,33.0,5.0,39.0,500.0,50.0,10.8,1014.2,-13.3,0.0,NW,1.1,Guanyuan
35062,35063,2017,2,28,22,11.0,24.0,5.0,47.0,500.0,41.0,10.5,1014.4,-12.9,0.0,NNW,1.2,Guanyuan


# Assessing Data
## Menilai Data

In [4]:
# ANSI escape sequences used to emphasise labels in printed output:
# BOLD switches bold text on, END resets the terminal text attributes.
BOLD, END = '\033[1m', '\033[0m'

In [5]:
# Overview of guanyuan_df: a bold label, the per-column dtypes and non-null
# counts from info(), and summary statistics for every column.
overview_stats = guanyuan_df.describe(include='all')
print(f"\nDataframe: {BOLD}guanyuan_df{END}")
guanyuan_df.info()
# Left as the bare last expression so the notebook renders it as rich output.
overview_stats


Dataframe: [1mguanyuan_df[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35064 entries, 0 to 35063
Data columns (total 18 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   No       35064 non-null  int64  
 1   year     35064 non-null  int64  
 2   month    35064 non-null  int64  
 3   day      35064 non-null  int64  
 4   hour     35064 non-null  int64  
 5   PM2.5    34448 non-null  float64
 6   PM10     34635 non-null  float64
 7   SO2      34590 non-null  float64
 8   NO2      34405 non-null  float64
 9   CO       33311 non-null  float64
 10  O3       33891 non-null  float64
 11  TEMP     35044 non-null  float64
 12  PRES     35044 non-null  float64
 13  DEWP     35044 non-null  float64
 14  RAIN     35044 non-null  float64
 15  wd       34983 non-null  object 
 16  WSPM     35050 non-null  float64
 17  station  35064 non-null  object 
dtypes: float64(11), int64(5), object(2)
memory usage: 4.8+ MB


Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
count,35064.0,35064.0,35064.0,35064.0,35064.0,34448.0,34635.0,34590.0,34405.0,33311.0,33891.0,35044.0,35044.0,35044.0,35044.0,34983,35050.0,35064
unique,,,,,,,,,,,,,,,,16,,1
top,,,,,,,,,,,,,,,,NE,,Guanyuan
freq,,,,,,,,,,,,,,,,5140,,35064
mean,17532.5,2014.66256,6.52293,15.729637,11.5,82.933372,109.023303,17.590941,57.901643,1271.294377,55.795044,13.584607,1011.84692,3.123062,0.067421,,1.708496,
std,10122.249256,1.177213,3.448752,8.800218,6.922285,80.933497,91.573709,23.600367,35.150857,1164.854945,57.436983,11.399097,10.404047,13.688896,0.910056,,1.204071,
min,1.0,2013.0,1.0,1.0,0.0,2.0,2.0,1.0,2.0,100.0,0.2142,-16.8,985.9,-35.3,0.0,,0.0,
25%,8766.75,2014.0,4.0,8.0,5.75,23.0,40.0,3.0,31.0,500.0,7.0,3.1,1003.3,-8.1,0.0,,0.9,
50%,17532.5,2015.0,7.0,16.0,11.5,59.0,89.0,8.0,51.0,900.0,41.0,14.5,1011.4,3.8,0.0,,1.4,
75%,26298.25,2016.0,10.0,23.0,17.25,115.0,149.0,22.0,78.0,1600.0,81.0,23.3,1020.1,15.6,0.0,,2.2,


## Null Check

In [6]:

# Count the missing values in each column and print the tally under a bold label.
missing_per_column = guanyuan_df.isna().sum()
print(f"\nDataframe: {BOLD}guanyuan_df{END}")
print(missing_per_column)


Dataframe: [1mguanyuan_df[0m
No            0
year          0
month         0
day           0
hour          0
PM2.5       616
PM10        429
SO2         474
NO2         659
CO         1753
O3         1173
TEMP         20
PRES         20
DEWP         20
RAIN         20
wd           81
WSPM         14
station       0
dtype: int64


## Duplicates Check

In [7]:
# Count rows flagged by duplicated() (rows repeating an earlier row) and
# report the total; the printed message reads "Number of duplicated rows".
duplicate_row_count = guanyuan_df.duplicated().sum()
print(f"\nDataframe: {BOLD}guanyuan_df{END}")
print("Jumlah data yang duplikat:", duplicate_row_count)


Dataframe: [1mguanyuan_df[0m
Jumlah data yang duplikat: 0


## Describe Data

In [8]:
# Summary statistics for every column; include='all' reports count/unique/
# top/freq for the non-numeric columns alongside the numeric aggregates.
# Rendered with display() rather than print(): the plain-text repr splits a
# frame this wide into several backslash-continued chunks, whereas the rich
# repr shows it as a single table, the same way the frame is shown on load.
print(f"\nDataframe: {BOLD}guanyuan_df{END}")
display(guanyuan_df.describe(include='all'))


Dataframe: [1mguanyuan_df[0m
                  No          year         month           day          hour  \
count   35064.000000  35064.000000  35064.000000  35064.000000  35064.000000   
unique           NaN           NaN           NaN           NaN           NaN   
top              NaN           NaN           NaN           NaN           NaN   
freq             NaN           NaN           NaN           NaN           NaN   
mean    17532.500000   2014.662560      6.522930     15.729637     11.500000   
std     10122.249256      1.177213      3.448752      8.800218      6.922285   
min         1.000000   2013.000000      1.000000      1.000000      0.000000   
25%      8766.750000   2014.000000      4.000000      8.000000      5.750000   
50%     17532.500000   2015.000000      7.000000     16.000000     11.500000   
75%     26298.250000   2016.000000     10.000000     23.000000     17.250000   
max     35064.000000   2017.000000     12.000000     31.000000     23.000000   

       