In [59]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
#Remember to add in the dates


In [33]:
train_set = pd.read_csv('Train.csv')
test_set = pd.read_csv('Test.csv')
samp_sub = pd.read_csv('SampleSubmission.csv')

In [34]:
train_set.head()

Unnamed: 0,Place_ID X Date,Date,Place_ID,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,010Q650 X 2020-01-02,2020-01-02,010Q650,38.0,23.0,53.0,769.5,92,11.0,60.200001,...,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,010Q650 X 2020-01-03,2020-01-03,010Q650,39.0,25.0,63.0,1319.85,91,14.6,48.799999,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
2,010Q650 X 2020-01-04,2020-01-04,010Q650,24.0,8.0,56.0,1181.96,96,16.4,33.400002,...,49.839714,-78.342701,34.296977,,,,,,,
3,010Q650 X 2020-01-05,2020-01-05,010Q650,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,...,29.181258,-73.896588,30.545446,,,,,,,
4,010Q650 X 2020-01-06,2020-01-06,010Q650,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,...,0.797294,-68.61248,26.899694,,,,,,,


## Data Cleaning


In [35]:
#Attempting to find the correlation between attributes
corr_mat = train_set.corr()
corr_mat["target"].sort_values(ascending=False)

  corr_mat = train_set.corr()


target                                             1.000000
target_min                                         0.818803
target_max                                         0.630801
L3_CO_CO_column_number_density                     0.375737
L3_HCHO_tropospheric_HCHO_column_number_density    0.347125
                                                     ...   
L3_HCHO_solar_azimuth_angle                       -0.181548
L3_O3_solar_azimuth_angle                         -0.185363
L3_CLOUD_solar_azimuth_angle                      -0.187643
L3_AER_AI_solar_azimuth_angle                     -0.193434
L3_AER_AI_sensor_altitude                         -0.308308
Name: target, Length: 79, dtype: float64

In [36]:
imputer = SimpleImputer(strategy="median")
train_set_copy = train_set.copy()
#dropping all text columns
train_set_num = train_set.select_dtypes(include=np.number).columns
target_columns = train_set_copy.filter(regex='target').columns

for column in train_set_copy:
    if column not in train_set_num:
        train_set_copy.drop(column, axis=1, inplace=True)

train_set_copy.head()

# target_columns

Unnamed: 0,target,target_min,target_max,target_variance,target_count,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,38.0,23.0,53.0,769.5,92,11.0,60.200001,0.00804,18.51684,1.996377,...,38.593017,-61.752587,22.363665,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118
1,39.0,25.0,63.0,1319.85,91,14.6,48.799999,0.00839,22.546533,3.33043,...,59.624912,-67.693509,28.614804,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652
2,24.0,8.0,56.0,1181.96,96,16.4,33.400002,0.0075,27.03103,5.065727,...,49.839714,-78.342701,34.296977,,,,,,,
3,49.0,10.0,55.0,1113.67,96,6.911948,21.300001,0.00391,23.971857,3.004001,...,29.181258,-73.896588,30.545446,,,,,,,
4,21.0,9.0,52.0,1164.82,95,13.900001,44.700001,0.00535,16.816309,2.621787,...,0.797294,-68.61248,26.899694,,,,,,,


In [37]:
#Remove all columns with target in the name
for column in target_columns:
    train_set_copy.drop(column, axis=1, inplace=True)



In [38]:
#fitting the imputer instance to the training data
imputer.fit(train_set_copy)
transformed = imputer.transform(train_set_copy)
train_set_copy = pd.DataFrame(transformed, columns=train_set_copy.columns)


In [39]:
def format_columns(dataset,dataset_copy):
    for column in dataset.columns:
        if column in train_set_copy.columns:
            dataset[column] = dataset_copy[column]

In [40]:
def format_dates(df):
    dates = pd.to_datetime(df["Date"])

    df["year"] = dates.dt.year.astype(np.int8)
    df["month"] = dates.dt.month.astype(np.int8)
    df["day"] = dates.dt.day.astype(np.int8)
    

In [41]:
format_columns(train_set,train_set_copy)
format_dates(train_set)

In [42]:
#Drop string columns
labels = train_set.target
train_set.drop(["Place_ID X Date","Date","Place_ID","target","target_min","target_max","target_variance","target_count"], axis=1, inplace=True)
train_set.head()

Unnamed: 0,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,...,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle,year,month,day
0,11.0,60.200001,0.00804,18.51684,1.996377,-1.227395,7.4e-05,0.000156,-1.23133,0.006507,...,1793.793579,3227.855469,0.010579,74.481049,37.501499,-62.142639,22.545118,-28,1,2
1,14.6,48.799999,0.00839,22.546533,3.33043,-1.188108,7.6e-05,0.000197,-1.082553,0.01836,...,1789.960449,3384.226562,0.015104,75.630043,55.657486,-53.868134,19.293652,-28,1,3
2,16.4,33.400002,0.0075,27.03103,5.065727,3.500559,6.7e-05,0.00017,-1.001242,0.015904,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,4
3,6.911948,21.300001,0.00391,23.971857,3.004001,1.099468,8.3e-05,0.000175,-0.777019,0.055765,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,5
4,13.900001,44.700001,0.00535,16.816309,2.621787,2.670559,7e-05,0.000142,0.366323,0.02853,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,6


In [43]:
#Making sure the training set has no missing values
# train_set.isnull().sum()
train_set.isnull().sum()

precipitable_water_entire_atmosphere    0
relative_humidity_2m_above_ground       0
specific_humidity_2m_above_ground       0
temperature_2m_above_ground             0
u_component_of_wind_10m_above_ground    0
                                       ..
L3_CH4_solar_azimuth_angle              0
L3_CH4_solar_zenith_angle               0
year                                    0
month                                   0
day                                     0
Length: 77, dtype: int64

In [44]:
# places = train_set.groupby('Place_ID')
x_train, x_test, y_train, y_test = train_test_split(train_set, labels, test_size=0.5, random_state=42)

## Selecting and Training models

In [45]:
line_reg = LinearRegression()
line_reg.fit(x_train, y_train)

In [46]:
predictions = line_reg.predict(x_test)
line_reg_rmse = np.sqrt(mse(y_test, predictions))
print(f"RMSE: {line_reg_rmse}")

RMSE: 37.271893009235384


In [47]:
dt_reg = DecisionTreeRegressor()
dt_reg.fit(x_train,y_train)

In [48]:
predictions = dt_reg.predict(x_test)
dt_reg_rmse = np.sqrt(mse(y_test,predictions))
print(f"RMSE: {dt_reg_rmse}")

RMSE: 45.2155112792009


In [49]:
random_reg = RandomForestRegressor()
random_reg.fit(x_train,y_train)

In [50]:
preds = random_reg.predict(x_test)
random_reg_rmse = np.sqrt(mse(y_test,preds))
print(f"RMSE: {random_reg_rmse}")

RMSE: 31.209564511394618


In [60]:
extra_trees = ExtraTreesRegressor()
extra_trees.fit(x_train,y_train)

In [61]:
preds = extra_trees.predict(x_test)
extra_trees_rmse = np.sqrt(mse(y_test,preds))
print(f"RMSE: {extra_trees_rmse}")

RMSE: 30.030066510711194


## Prepare the test set

In [62]:
test_set.head()

Unnamed: 0,Place_ID X Date,Date,Place_ID,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,...,L3_SO2_sensor_zenith_angle,L3_SO2_solar_azimuth_angle,L3_SO2_solar_zenith_angle,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle
0,0OS9LVX X 2020-01-02,2020-01-02,0OS9LVX,11.6,30.200001,0.00409,14.656824,3.956377,0.712605,5.3e-05,...,1.445658,-95.984984,22.942019,,,,,,,
1,0OS9LVX X 2020-01-03,2020-01-03,0OS9LVX,18.300001,42.900002,0.00595,15.026544,4.23043,0.661892,5e-05,...,34.641758,-95.014908,18.539116,,,,,,,
2,0OS9LVX X 2020-01-04,2020-01-04,0OS9LVX,17.6,41.299999,0.0059,15.511041,5.245728,1.640559,5e-05,...,55.872276,-94.015418,14.14082,,,,,,,
3,0OS9LVX X 2020-01-05,2020-01-05,0OS9LVX,15.011948,53.100002,0.00709,14.441858,5.454001,-0.190532,5.5e-05,...,59.174188,-97.247602,32.730553,,,,,,,
4,0OS9LVX X 2020-01-06,2020-01-06,0OS9LVX,9.7,71.599998,0.00808,11.896295,3.511787,-0.279441,5.5e-05,...,40.925873,-96.057265,28.320527,1831.261597,3229.118652,0.031068,-100.278343,41.84708,-95.910744,28.498789


In [63]:
temp_test = test_set.copy()
temp_dates = pd.DataFrame(temp_test["Date"])
temp_test.drop(["Place_ID X Date","Date","Place_ID"], axis=1, inplace=True)
temp_transformed = imputer.transform(temp_test)
temp_test_copy = pd.DataFrame(temp_transformed, columns=temp_test.columns)
format_columns(temp_test,temp_test_copy)

dates = pd.to_datetime(test_set["Date"])

temp_test["year"] = dates.dt.year.astype(np.int8)
temp_test["month"] = dates.dt.month.astype(np.int8)
temp_test["day"] = dates.dt.day.astype(np.int8)
    








temp_test.head()

Unnamed: 0,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,...,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle,year,month,day
0,11.6,30.200001,0.00409,14.656824,3.956377,0.712605,5.3e-05,0.000108,0.466171,0.010752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,2
1,18.300001,42.900002,0.00595,15.026544,4.23043,0.661892,5e-05,0.000109,-0.213659,0.028307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,3
2,17.6,41.299999,0.0059,15.511041,5.245728,1.640559,5e-05,0.000134,-0.25425,0.010374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,4
3,15.011948,53.100002,0.00709,14.441858,5.454001,-0.190532,5.5e-05,0.000155,-0.26849,0.088795,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,5
4,9.7,71.599998,0.00808,11.896295,3.511787,-0.279441,5.5e-05,0.000131,0.46072,0.041197,...,1831.261597,3229.118652,0.031068,-100.278343,41.84708,-95.910744,28.498789,-28,1,6


In [64]:
temp_test.head()

Unnamed: 0,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,...,L3_CH4_CH4_column_volume_mixing_ratio_dry_air,L3_CH4_aerosol_height,L3_CH4_aerosol_optical_depth,L3_CH4_sensor_azimuth_angle,L3_CH4_sensor_zenith_angle,L3_CH4_solar_azimuth_angle,L3_CH4_solar_zenith_angle,year,month,day
0,11.6,30.200001,0.00409,14.656824,3.956377,0.712605,5.3e-05,0.000108,0.466171,0.010752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,2
1,18.300001,42.900002,0.00595,15.026544,4.23043,0.661892,5e-05,0.000109,-0.213659,0.028307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,3
2,17.6,41.299999,0.0059,15.511041,5.245728,1.640559,5e-05,0.000134,-0.25425,0.010374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,4
3,15.011948,53.100002,0.00709,14.441858,5.454001,-0.190532,5.5e-05,0.000155,-0.26849,0.088795,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28,1,5
4,9.7,71.599998,0.00808,11.896295,3.511787,-0.279441,5.5e-05,0.000131,0.46072,0.041197,...,1831.261597,3229.118652,0.031068,-100.278343,41.84708,-95.910744,28.498789,-28,1,6


## Predictions for test set

In [65]:
test_predictions = extra_trees.predict(temp_test)

In [66]:
final = pd.DataFrame()
final["Place_ID X Date"] = test_set["Place_ID X Date"]
final["target"] = test_predictions

final.to_csv("urban_pollution_extra_trees.csv", index=False)