In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [43]:
def wrangle(filepath):
    """Load a sensor-readings CSV and return it as an all-numeric DataFrame.

    Steps:
      1. Read the CSV at ``filepath``.
      2. Encode ``device_name`` as a number (alpha=0, beta=1, charlie=2).
      3. Drop the ``ID`` column.
      4. Standardize the six sensor columns with ``StandardScaler``; every
         other column is passed through unscaled.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file. It must contain ``ID``, ``device_name`` and
        the six sensor columns listed in ``numerical_columns``. Any further
        column (for example a ``CO2`` target) is kept as-is.

    Returns
    -------
    pandas.DataFrame
        The scaled sensor columns first, followed by the remaining columns in
        the order they appear in the file.

    Notes
    -----
    A new scaler is fitted on every call, so two files passed through this
    function are each standardized with their own mean and variance.
    """
    data = pd.read_csv(filepath)

    # Device names outside this mapping become NaN.
    data['device_name'] = data['device_name'].map({'alpha': 0, 'beta': 1, 'charlie': 2})
    data = data.drop(columns = 'ID')

    numerical_columns = ['Temperature', 'Humidity', 'MQ7_analog', 'MQ9_analog', 'MG811_analog', 'MQ135_analog']
    # Whatever is not scaled is passed through; the set differs between files
    # (a file may or may not carry the CO2 column), so derive it from the data
    # instead of hard-coding it.
    passthrough_columns = [col for col in data.columns if col not in numerical_columns]

    transformer = ColumnTransformer([('scaler', StandardScaler(), numerical_columns)], remainder = 'passthrough')
    transformed = transformer.fit_transform(data)

    # ColumnTransformer emits the transformed columns first and appends the
    # passthrough columns to their right, in input order.
    return pd.DataFrame(transformed, columns = numerical_columns + passthrough_columns)

In [17]:
# Run the training file through the preprocessing function and preview the first rows
data_transformed = wrangle(filepath = 'air-quality-prediction/Train.csv')
data_transformed.head(n = 5)

Unnamed: 0,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,device_name,CO2
0,-0.350446,0.673496,-1.613758,-0.311878,-2.205518,-1.6206,0.0,585.75
1,1.099981,-0.426862,-0.385112,-0.77981,0.136414,-0.296394,0.0,613.0
2,0.98841,-1.321636,-1.30867,-1.504623,0.231985,-0.824174,0.0,616.5
3,0.941302,-1.431327,-1.278254,-1.483736,0.286596,-0.807943,0.0,642.5
4,0.995848,-1.043614,-0.987453,-1.780831,1.387477,-2.491464,0.0,622.0


In [11]:
# Load the untouched training file for inspection
data = pd.read_csv(filepath_or_buffer = 'air-quality-prediction/Train.csv')
data.head(n = 5)

Unnamed: 0,ID,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,device_name,CO2
0,ID_000001,28.975,74.475,2480.0,3476.5,1572.0,1997.0,alpha,585.75
1,ID_000002,31.9,66.5,3813.0,2726.0,4145.0,3180.0,alpha,613.0
2,ID_000003,31.675,60.015,2811.0,1563.5,4250.0,2708.5,alpha,616.5
3,ID_000004,31.58,59.22,2844.0,1597.0,4310.0,2723.0,alpha,642.5
4,ID_000005,31.69,62.03,3159.5,1120.5,5519.5,1219.0,alpha,622.0


In [4]:
# Count how many rows each device contributed.
data['device_name'].value_counts()

device_name
beta       2485
charlie    2431
alpha      2391
Name: count, dtype: int64

In [7]:
# Missing values per column
data.isna().sum()

ID              0
Temperature     0
Humidity        0
MQ7_analog      0
MQ9_analog      0
MG811_analog    0
MQ135_analog    0
device_name     0
CO2             0
dtype: int64

In [19]:
# Separate the predictors from the CO2 target, then hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split

y = data_transformed['CO2']
X = data_transformed.drop(columns = 'CO2')

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.2,
    random_state = 42,
    shuffle = True,
)

In [None]:
# Train a CatBoost regressor on the training split
from catboost import CatBoostRegressor

# NOTE(review): the recorded training log estimates over an hour of fitting with these settings.
model = CatBoostRegressor(
    loss_function = 'RMSE',
    iterations = 2000,
    depth = 16,
    verbose = 10,  # report progress every 10th iteration
)

model.fit(X_train, y_train)

Learning rate set to 0.030803
0:	learn: 16.3258864	total: 4.24s	remaining: 2h 21m 19s
10:	learn: 13.9332035	total: 33.9s	remaining: 1h 42m 14s
20:	learn: 12.1741498	total: 52.4s	remaining: 1h 22m 21s
30:	learn: 10.8936910	total: 1m 5s	remaining: 1h 8m 50s
40:	learn: 9.8871078	total: 1m 20s	remaining: 1h 3m 53s
50:	learn: 9.1588895	total: 1m 36s	remaining: 1h 1m 21s
60:	learn: 8.5201382	total: 1m 54s	remaining: 1h 47s
70:	learn: 8.0487936	total: 2m 15s	remaining: 1h 1m 14s
80:	learn: 7.6611591	total: 2m 36s	remaining: 1h 1m 56s
90:	learn: 7.3382005	total: 3m 2s	remaining: 1h 3m 42s
100:	learn: 7.0751899	total: 3m 21s	remaining: 1h 3m 15s
110:	learn: 6.8475950	total: 3m 38s	remaining: 1h 1m 58s
120:	learn: 6.6432491	total: 3m 59s	remaining: 1h 1m 53s
130:	learn: 6.4654571	total: 4m 20s	remaining: 1h 1m 51s
140:	learn: 6.3089986	total: 4m 38s	remaining: 1h 1m 13s
150:	learn: 6.1672172	total: 5m 4s	remaining: 1h 2m 12s
160:	learn: 6.0463761	total: 5m 22s	remaining: 1h 1m 24s
170:	learn: 5.

In [57]:
# Score the fitted model on the held-out split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = model.predict(X_test)

# One value per line, in the order MAE, MSE, R-squared
print(
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
    sep = '\n',
)

2.92515162733783
23.65320827627451
0.9097044009055397


In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Build a 200-tree forest with a fixed seed and fit it on the training split
rf_regressor = RandomForestRegressor(n_estimators = 200, random_state = 42).fit(X_train, y_train)

# Score the forest on the held-out split
y_pred = rf_regressor.predict(X_test)
rf_mse, rf_mae, rf_r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the three metrics, one per line
print(
    f'Mean Squared Error: {rf_mse}',
    f'Mean Absolute Error: {rf_mae}',
    f'R-squared: {rf_r2}',
    sep = '\n',
)

Mean Squared Error: 25.396059274072922
Mean Absolute Error: 2.8630367077063346
R-squared: 0.9030511057947688


In [34]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit a single seeded decision tree on the training split
dt_regressor = DecisionTreeRegressor(random_state = 42).fit(X_train, y_train)

# Score the tree on the held-out split
y_pred = dt_regressor.predict(X_test)
dt_mse, dt_mae, dt_r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the three metrics, one per line
print(
    f'Mean Squared Error: {dt_mse}',
    f'Mean Absolute Error: {dt_mae}',
    f'R-squared: {dt_r2}',
    sep = '\n',
)

Mean Squared Error: 51.40647701834916
Mean Absolute Error: 3.6448938772126187
R-squared: 0.8037569117267105


In [40]:
# Display the current contents of `y_pred`. Several cells assign this name,
# so the values shown depend on which of them ran most recently.
y_pred

array([590.40341013, 585.37335082, 613.72314345, ..., 603.79928339,
       597.63150343, 597.93874963])

In [44]:
# Run the test file through the same preprocessing function and preview the first rows
testdata = wrangle(filepath = 'air-quality-prediction/test.csv')
testdata.head(n = 5)

Unnamed: 0,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,device_name
0,1.378004,-1.312404,-0.635062,-1.586199,-1.209551,-1.965257,0.0
1,0.986613,-1.499966,-1.228633,-1.41208,0.479136,0.183901,0.0
2,1.031486,-1.449949,-1.241154,-1.43954,0.41861,-0.798412,0.0
3,1.083837,-0.35931,-0.310916,-0.657252,0.213286,-0.417083,0.0
4,-0.04297,0.511117,-1.560199,-0.181077,-2.290645,-1.435076,0.0


In [47]:
# Load the untouched test file; its ID column is needed for the submission
test_data_original = pd.read_csv(filepath_or_buffer = 'air-quality-prediction/test.csv')
test_data_original.head(n = 5)

Unnamed: 0,ID,Temperature,Humidity,MQ7_analog,MQ9_analog,MG811_analog,MQ135_analog,device_name
0,ID_007308,32.45,59.99,3504.0,1380.0,2642.5,1637.0,alpha
1,ID_007309,31.665,58.64,2864.0,1659.0,4456.0,3564.5,alpha
2,ID_007310,31.755,59.0,2850.5,1615.0,4391.0,2683.5,alpha
3,ID_007311,31.86,66.85,3853.5,2868.5,4170.5,3025.5,alpha
4,ID_007312,29.6,73.115,2506.5,3631.5,1481.5,2112.5,alpha


In [58]:
# Predict on the wrangled test file with `model`, the name the "Fit a model"
# cell binds to a CatBoostRegressor.
# NOTE(review): `testdata` comes from its own wrangle() call, which fits a
# scaler on the test file rather than reusing the training statistics - confirm
# this is intended.
test_predictions = model.predict(testdata)

In [59]:
# Pair every test ID with its predicted CO2 value and write the table to CSV
predctions = test_data_original[['ID']].assign(CO2 = test_predictions)
predctions.to_csv(path_or_buf = 'Predicted-Air-Quality-Dataset4.csv', index = False)