---

In [1]:
# Import dependencies
import pandas as pd
import numpy as np 
from math import exp
from pathlib import Path
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Start a SparkSession
import findspark
findspark.init()

In [2]:
#import dataset that will be used 
url = "https://project-4-group-6-air-quality.s3.us-east-2.amazonaws.com/data_drop2.csv"
air_data_df= pd.read_csv(url)
air_data_df.head()

Unnamed: 0,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM
0,2013,3,1,0,9.0,9.0,3.0,17.0,300.0,89.0,-0.5,1024.5,-21.4,0.0,NNW,5.7
1,2013,3,1,1,4.0,4.0,3.0,16.0,300.0,88.0,-0.7,1025.1,-22.1,0.0,NW,3.9
2,2013,3,1,5,4.0,4.0,9.0,25.0,300.0,78.0,-2.4,1027.5,-21.3,0.0,NW,2.4
3,2013,3,1,6,5.0,5.0,10.0,29.0,400.0,67.0,-2.5,1028.2,-20.4,0.0,NW,2.2
4,2013,3,1,7,3.0,6.0,12.0,40.0,400.0,52.0,-1.4,1029.5,-20.4,0.0,NNW,3.0


In [3]:
#Drop unecessary columns for model 
air_data_df.drop(["wd","PM2.5","PM10","SO2","NO2","CO","TEMP","PRES","DEWP","RAIN","WSPM"],axis=1,inplace=True) 

#Show the DataFrame
air_data_df.head()

Unnamed: 0,year,month,day,hour,O3
0,2013,3,1,0,89.0
1,2013,3,1,1,88.0
2,2013,3,1,5,78.0
3,2013,3,1,6,67.0
4,2013,3,1,7,52.0


In [4]:
#Set X and eliminate target variable
X = air_data_df.copy()
X.drop("O3",axis = 1, inplace=True)

#Show the first 5 rows of the DataFrame 
X.head(5)

Unnamed: 0,year,month,day,hour
0,2013,3,1,0
1,2013,3,1,1
2,2013,3,1,5
3,2013,3,1,6
4,2013,3,1,7


In [5]:
#Set y (target variable)
y = air_data_df["O3"].array.reshape(-1,1)

#Show the first 5 rows of the data
y[:5]

<NumpyExtensionArray>
[
[89.0],
[88.0],
[78.0],
[67.0],
[52.0]
]
Shape: (5, 1), dtype: float64

In [6]:
#split into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

#Create a StandardScalar instance
scaler = StandardScaler()

# Fitting standard scaler 
X_scaler = scaler.fit(X_train)

In [7]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)

X_test_scaled = X_scaler.transform(X_test)

In [8]:
#create Regressor object
model = DecisionTreeRegressor(random_state = 0)

#fit regressor with X and y data 
model.fit(X_train_scaled,y_train)

In [9]:
#Predict y values using the scaled testing data
y_pred = model.predict(X_test_scaled)

# Calculating the R^2 value https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
r2 = model.score(X_test_scaled, y_test)

print(f"The R-squared value of the model is: {r2}")

The R-squared value of the model is: 0.8711671175362243


# Additional Infomation
   
    Data from S3 bucket is temporary, for data please refer to data_drop2.csv

    

# Other models attempts during the optimization process: 


In [10]:
#Previously attempted data inputs and their associated R-squared (R2) values. 

#air_data_df.drop(["wd"],axis=1,inplace=True) 
    #(R2 = 0.83) --> ran model with all columns containing numerical values 

#air_data_df.drop(["wd","year","month","day","hour",,axis=1,inplace=True)
    #(R2 = 0.67) --> evaluated chemical compounds and weather variables as predictors of O3

#air_data_df.drop(["wd","year","month","day","hour","PM2.5","PM10","SO2","NO2","CO"],axis=1,inplace=True) 
    #(R2 =0.63) --> evaluated weather variables as predictor of O3

#air_data_df.drop(["wd","TEMP","PRES","DEWP","RAIN","WSPM","year","month","day","hour"],axis=1,inplace=True)
    #(R2 = 0.03) --> evaluated other chemical compounds as predictor of O3

#air_data_df.drop(["wd","PM2.5","PM10","SO2","NO2","CO","PRES","DEWP","RAIN","WSPM"],axis=1,inplace=True) 
    #(R2 = 0.84) --> evaluated date and temperature as predictor of ozone


In [11]:
##Attempt to see if calculating relative humidity optimizes model --> R2 = 0.82
  #formula obtained from https://bmcnoldy.earth.miami.edu/Humidity.html retrieved 12.11.2023

#def relative_humidity(temp, dewp):
  #rh =100*(exp((17.625*dewp)/(243.04+dewp))/exp((17.625*temp)/(243.04+temp)))
  #return rh


#air_data_df.apply(lambda row: relative_humidity(row['TEMP'], row['DEWP']), axis="columns")
#relative_humidity(drop_df['TEMP'], drop_df['DEWP'])

#air_data_df['RelHum'] = air_data_df.apply(lambda row: relative_humidity(row['TEMP'], row['DEWP']), axis="columns")