In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

Create Wrangle function

In [2]:
def wrangle(filepath, missing_threshold=0.5):
    """Read a raw CSV file and return it with unusable columns removed.

    The following columns are removed, in this order:

    1. the identifier columns ``Place_ID X Date`` and ``Place_ID``;
    2. every column whose fraction of missing values is greater than or
       equal to ``missing_threshold``;
    3. the ``target_*`` summary columns, which leak the target;
    4. a fixed list of features that were found to be highly collinear.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file. It must contain the ``Place_ID X Date``
        and ``Place_ID`` columns.
    missing_threshold : float, default 0.5
        Columns with a missing-value fraction at or above this value are
        dropped.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, with the rows untouched.

    Raises
    ------
    KeyError
        If either identifier column is absent from the file.
    """
    df = pd.read_csv(filepath)

    # The identifiers carry no predictive signal for the model.
    df = df.drop(columns=['Place_ID X Date', 'Place_ID'])

    # Dropping columns with at least `missing_threshold` missing values.
    # NOTE(review): the fraction is computed on the file being loaded, so two
    # files can end up with different columns - confirm train/test agree.
    missing_fraction = df.isna().mean()
    sparse_columns = missing_fraction[missing_fraction >= missing_threshold].index
    df = df.drop(columns=sparse_columns)

    # Dropping columns that leak info about the target value
    leaky_columns = ["target_min", "target_max", "target_variance", "target_count"]
    df = df.drop(columns=leaky_columns, errors='ignore')

    # Dropping columns that have high collinearity
    collinear_columns = [
        'L3_HCHO_sensor_zenith_angle',
        'L3_O3_sensor_azimuth_angle',
        'L3_AER_AI_sensor_zenith_angle',
        'L3_CLOUD_solar_azimuth_angle',
        'L3_SO2_solar_azimuth_angle',
        'L3_O3_cloud_fraction',
        'L3_CO_sensor_zenith_angle',
        'L3_NO2_sensor_azimuth_angle',
        'L3_SO2_sensor_zenith_angle',
        'L3_CLOUD_cloud_top_height',
        'L3_CO_sensor_azimuth_angle',
        'L3_O3_solar_zenith_angle',
        'L3_CLOUD_sensor_azimuth_angle',
        'L3_AER_AI_sensor_azimuth_angle',
        'L3_HCHO_cloud_fraction',
        'L3_SO2_solar_zenith_angle',
        'L3_NO2_tropospheric_NO2_column_number_density',
        'L3_CLOUD_sensor_zenith_angle',
        'L3_CLOUD_solar_zenith_angle',
        'L3_HCHO_sensor_azimuth_angle',
        'L3_CLOUD_cloud_base_height',
        'L3_CO_solar_zenith_angle',
        'L3_O3_solar_azimuth_angle',
        'L3_CLOUD_cloud_base_pressure',
        'L3_HCHO_solar_zenith_angle',
    ]
    # errors='ignore' because a listed column may already have been removed
    # by the missing-value filter above (or be absent from this file).
    df = df.drop(columns=collinear_columns, errors='ignore')

    return df

In [3]:
# Load the training CSV and strip the unusable columns in one step
train_data = wrangle("/content/Train.csv")

Checking for columns with ≥50% missing values (these are already dropped inside `wrangle`)

In [4]:
# Fraction of missing values per column in the wrangled training data
missing_val_per = train_data.isna().sum() / len(train_data)

# Columns at or above the 50% threshold; `wrangle` applies the same filter,
# so an empty result here confirms nothing slipped through.
columns_to_drop = missing_val_per[missing_val_per >= 0.5]
columns_to_drop

Unnamed: 0,0


In [5]:
# Creating a custom transformer for the date column
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace the ``Date`` column with ``month`` and ``dayofweek`` features.

    The transformer is stateless: ``fit`` learns nothing, so ``transform``
    can be called on any frame that has a ``Date`` column.

    ``year`` is not produced (the original notebook notes that the data
    covers a single year), and neither is ``days_since_start``; both used to
    be computed and then dropped again before returning.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` so the transformer fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Date`` swapped for calendar features.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a ``Date`` column that ``pd.to_datetime`` can parse.

        Returns
        -------
        pandas.DataFrame
            ``X`` without ``Date`` and with ``month`` and ``dayofweek``
            (as given by the ``.dt`` accessor) added. ``X`` is not modified.

        Raises
        ------
        KeyError
            If ``X`` has no ``Date`` column.
        """
        dates = pd.to_datetime(X['Date'])

        # drop() and assign() both return new frames, so the caller's
        # DataFrame is left untouched.
        return X.drop(columns=['Date']).assign(
            month=dates.dt.month,
            dayofweek=dates.dt.dayofweek,
        )


In [6]:
# Separate the label from the predictors
y_train = train_data['target']
X_train = train_data.drop(columns='target')

In [7]:
# Initializing the transformer
date_feature_extractor = DateFeatureExtractor()
X_train_transformed = date_feature_extractor.transform(X_train)
X_train_transformed.head()


Unnamed: 0,precipitable_water_entire_atmosphere,relative_humidity_2m_above_ground,specific_humidity_2m_above_ground,temperature_2m_above_ground,u_component_of_wind_10m_above_ground,v_component_of_wind_10m_above_ground,L3_NO2_NO2_column_number_density,L3_NO2_NO2_slant_column_number_density,L3_NO2_absorbing_aerosol_index,L3_NO2_cloud_fraction,...,L3_AER_AI_solar_azimuth_angle,L3_AER_AI_solar_zenith_angle,L3_SO2_SO2_column_number_density,L3_SO2_SO2_column_number_density_amf,L3_SO2_SO2_slant_column_number_density,L3_SO2_absorbing_aerosol_index,L3_SO2_cloud_fraction,L3_SO2_sensor_azimuth_angle,month,dayofweek
0,11.0,60.200001,0.00804,18.51684,1.996377,-1.227395,7.4e-05,0.000156,-1.23133,0.006507,...,-61.736719,22.358167,-0.000127,0.312521,-4e-05,-1.861476,0.0,76.536426,1,3
1,14.6,48.799999,0.00839,22.546533,3.33043,-1.188108,7.6e-05,0.000197,-1.082553,0.01836,...,-67.693509,28.614804,0.00015,0.433957,5e-05,-1.452612,0.059433,-14.708036,1,4
2,16.4,33.400002,0.0075,27.03103,5.065727,3.500559,6.7e-05,0.00017,-1.001242,0.015904,...,-78.342701,34.296977,0.00015,0.356925,5.3e-05,-1.57295,0.082063,-105.201338,1,5
3,6.911948,21.300001,0.00391,23.971857,3.004001,1.099468,8.3e-05,0.000175,-0.777019,0.055765,...,-73.896572,30.545393,0.000227,0.584522,0.00011,-1.239317,0.121261,-104.334066,1,6
4,13.900001,44.700001,0.00535,16.816309,2.621787,2.670559,7e-05,0.000142,0.366323,0.02853,...,-68.61248,26.899694,0.00039,0.408047,0.000159,0.202489,0.037919,58.850179,1,0


Copying the `Place_ID X Date` column for the submission file

In [9]:
# Keep the row identifiers from the untouched test file; `wrangle` drops them
test_raw = pd.read_csv("/content/Test.csv")
place_date = test_raw.loc[:, "Place_ID X Date"].copy()
place_date

Unnamed: 0,Place_ID X Date
0,0OS9LVX X 2020-01-02
1,0OS9LVX X 2020-01-03
2,0OS9LVX X 2020-01-04
3,0OS9LVX X 2020-01-05
4,0OS9LVX X 2020-01-06
...,...
16131,ZZDJZMS X 2020-03-31
16132,ZZDJZMS X 2020-04-01
16133,ZZDJZMS X 2020-04-02
16134,ZZDJZMS X 2020-04-03


In [10]:
# Clean the test file the same way as the training file
test_data = wrangle("/content/Test.csv")

# Derive the date features and report how many columns remain
X_test_transformed = date_feature_extractor.transform(test_data)
X_test_transformed.shape[1]

44

Check for collinearity

In [11]:
# Absolute pairwise correlations between the numeric features
corr_matrix = X_train_transformed.select_dtypes(include='number').corr().abs()

# Mask that is True strictly above the leading diagonal, so each pair of
# features appears once and self-correlations are excluded
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)

# Flatten to one row per feature pair, strongest correlation first
high_corr_pairs = (
    upper_tri.stack()
    .sort_values(ascending=False)
    .to_frame(name="correlation")
)

print("Top 30 most correlated pairs:")
print(high_corr_pairs.head(30))

Top 30 most correlated pairs:
                                                                                                    correlation
L3_HCHO_HCHO_slant_column_number_density           L3_HCHO_tropospheric_HCHO_column_number_density     0.894417
precipitable_water_entire_atmosphere               specific_humidity_2m_above_ground                   0.882341
L3_CLOUD_cloud_fraction                            L3_SO2_cloud_fraction                               0.877950
L3_NO2_sensor_zenith_angle                         L3_O3_sensor_zenith_angle                           0.858282
L3_NO2_cloud_fraction                              L3_CLOUD_cloud_fraction                             0.839613
L3_SO2_SO2_column_number_density                   L3_SO2_SO2_slant_column_number_density              0.831505
L3_NO2_solar_zenith_angle                          L3_AER_AI_solar_zenith_angle                        0.826709
L3_O3_O3_effective_temperature                     L3_CO_sensor_altitude  

**Explanation of the Code Cell above**



1.   `np.ones(corr_matrix.shape)` creates an array of 1s with the same shape as the correlation matrix.

2.   `np.triu` keeps only the upper triangle of that array (everything below the leading diagonal becomes 0), so that each pair of features is counted only once.

3.   The `k=1` argument to `np.triu` also sets the leading diagonal to 0, because the leading diagonal is basically the correlation of each feature with itself and we don't need that.

4.  upper_tri.stack() is to flatten the upper triangle into a Series

5.  `.to_frame(name="correlation")` converts the Series returned by `upper_tri.stack()` back into a DataFrame.


In [12]:
# Filter for highly correlated pairs
high_corr_pairs = high_corr_pairs[high_corr_pairs["correlation"] > 0.9]
# Combine target temporarily for correlation
X_temp = X_train_transformed.copy()
X_temp["target"] = y_train

# Compute correlation of features with target
feature_target_corr = X_temp.select_dtypes(include='number').corr()["target"].abs().drop("target")

features_to_drop = set()

for feature1, feature2 in high_corr_pairs.index:
    if feature1 in features_to_drop or feature2 in features_to_drop:
        continue
    if feature_target_corr[feature1] >= feature_target_corr[feature2]:
        features_to_drop.add(feature2)
    else:
        features_to_drop.add(feature1)

print("Features to drop due to high correlation:", features_to_drop)


Features to drop due to high correlation: set()


In [13]:
# Print each column's non-null count and dtype for the transformed training frame
X_train_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30557 entries, 0 to 30556
Data columns (total 44 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   precipitable_water_entire_atmosphere                 30557 non-null  float64
 1   relative_humidity_2m_above_ground                    30557 non-null  float64
 2   specific_humidity_2m_above_ground                    30557 non-null  float64
 3   temperature_2m_above_ground                          30557 non-null  float64
 4   u_component_of_wind_10m_above_ground                 30557 non-null  float64
 5   v_component_of_wind_10m_above_ground                 30557 non-null  float64
 6   L3_NO2_NO2_column_number_density                     28368 non-null  float64
 7   L3_NO2_NO2_slant_column_number_density               28368 non-null  float64
 8   L3_NO2_absorbing_aerosol_index                       28368 non-nul

In [14]:
# Preprocessing chain: derive the date features, fill the remaining gaps with
# the column mean, then standardise every feature.
preprocessor = Pipeline(steps=[
    ("date_features", DateFeatureExtractor()),
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])


In [15]:
# Full model: the preprocessing chain followed by ordinary least squares
model_steps = [
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
]
model_pipeline = Pipeline(steps=model_steps)

In [16]:
# Fit on the training data, then predict the wrangled test set
# (Pipeline.fit returns the pipeline itself, so the calls can be chained)
predictions = model_pipeline.fit(X_train, y_train).predict(test_data)


In [17]:
# Display the predicted target values
predictions

array([64.77100759, 45.8425694 , 42.34147917, ..., 31.10717808,
       59.32297952, 69.51578169])

In [20]:
# Fill the sample submission with the identifiers and predictions, then save it
sub = pd.read_csv("/content/SampleSubmission (1).csv").assign(
    **{"target": predictions, "Place_ID X Date": place_date}
)
sub.to_csv("Urban_air_poll_submission.csv", index=False)