In [35]:
# data reading:
import pandas as pd
import numpy as np

# Data Visualization:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import mean_squared_error as metric

from lightgbm import LGBMRegressor

In [36]:
class PATH:
    """Locations of the input CSV files, all rooted at ``main``."""
    # Directory that holds every data file used by this notebook.
    main = 'Data/'
    # Pre-processed training and test tables.
    train = f'{main}processed_train.csv'
    test = f'{main}processed_test.csv'
    # Sample submission file shipped with the data.
    ss = f'{main}SampleSubmission.csv'

In [37]:
# Read both tables, order them by location then time, and rebuild the index
# so each frame ends up with a clean 0..n-1 row index.
train_df = (
    pd.read_csv(PATH.train)
    .sort_values(by=['country','city','site_id','date','hour'])
    .reset_index(drop=True)
)
test_df = (
    pd.read_csv(PATH.test)
    .sort_values(by=['country','city','site_id','date','hour'])
    .reset_index(drop=True)
)

# Report the (rows, columns) of each frame, one per line.
shape_report = [
    f'train shape :{train_df.shape}',
    f'test shape :{test_df.shape}',
]
print('\n'.join(shape_report))

train shape :(7909, 35)
test shape :(2783, 34)


* ### Let's drop the `id` and `site_id` columns since they are not predictive (select only the numerical columns):

In [38]:
# Feature list = names of every numeric-dtype column present in the test frame.
selected_columns = list(test_df.select_dtypes(include='number').columns)
selected_columns

['site_latitude',
 'site_longitude',
 'city',
 'country',
 'date',
 'hour',
 'month',
 'carbonmonoxide_co_column_number_density',
 'carbonmonoxide_h2o_column_number_density',
 'carbonmonoxide_cloud_height',
 'formaldehyde_tropospheric_hcho_column_number_density',
 'formaldehyde_tropospheric_hcho_column_number_density_amf',
 'formaldehyde_hcho_slant_column_number_density',
 'ozone_o3_column_number_density',
 'ozone_o3_effective_temperature',
 'cloud_cloud_optical_depth',
 'cloud_surface_albedo',
 '2_groub_mean',
 '3_groub_mean',
 '4_groub_mean',
 '5_groub_mean',
 '6_groub_mean',
 '7_groub_mean',
 '8_groub_mean',
 '10_groub_mean',
 '11_groub_mean',
 '12_groub_mean',
 'date_month',
 'date_day',
 'date_quarter',
 'date_week',
 'date_year']

In [39]:
# Leave-one-group-out cross-validation grouped by `city`: each fold holds out
# all rows of one city value and trains on the rows of every other city.
cv = LeaveOneGroupOut()

X = train_df[selected_columns]
y = train_df.pm2_5

rmse_average = []
test_preds = []
# Out-of-fold predictions start as NaN instead of a copy of the target, so a
# row that was never part of a validation fold cannot pass for a prediction.
oof = pd.Series(np.nan, index=y.index, name=y.name, dtype=float)

model = LGBMRegressor(verbose=-1)
for fold, (train_index, val_index) in enumerate(cv.split(X, y, groups=X.city)):
    print(50*'-')
    print(f'Fold {fold+1}')
    # cv.split yields positional indices, so X and y are both sliced with
    # .iloc; plain y[...] would do a label lookup and only agrees with
    # position when the index is exactly 0..n-1.
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # fit() is called on the same estimator object in every fold.
    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    oof.iloc[val_index] = val_preds

    # RMSE computed as sqrt(MSE): the `squared=False` flag is deprecated in
    # scikit-learn 1.4 and removed in 1.6, while this form works everywhere.
    rmse = np.sqrt(metric(y_val, val_preds))

    print(f'train points :{len(train_index)}',f'test points :{len(val_index)}')
    print("Validation RMSE:", rmse)

    rmse_average.append(rmse)

# Unweighted mean of the per-fold RMSE values (folds differ a lot in size).
print(f'\n average error :{np.mean(rmse_average)}')

--------------------------------------------------
Fold 1
train points :7793 test points :116
Validation RMSE: 12.107084694069195
--------------------------------------------------
Fold 2
train points :6418 test points :1491
Validation RMSE: 10.388712707356985
--------------------------------------------------
Fold 3
train points :7151 test points :758
Validation RMSE: 14.429345069867999
--------------------------------------------------
Fold 4
train points :2365 test points :5544
Validation RMSE: 12.56511255984818

 average error :12.37256375778559


# make the submission :

In [40]:
# Refit the model on the whole training set (every row of X and y).
model.fit(X=X, y=y)

In [41]:
# Predict on the test rows using the same feature columns used for training.
preds = model.predict(test_df.loc[:, selected_columns])
preds

array([11.76405001, 12.83618691,  9.80107672, ..., 49.7226381 ,
       32.97708575, 24.97874375])

In [42]:
# Two-column submission frame: each test id next to its predicted pm2_5.
sub = pd.DataFrame({'id': test_df['id'], 'pm2_5': preds})
sub.head()

Unnamed: 0,id,pm2_5
0,id_tl446ky5so,11.76405
1,id_fv0k1xke9b,12.836187
2,id_17qgwq8hor,9.801077
3,id_h169vb8h8d,11.883006
4,id_o62drksqmp,11.442348


In [43]:
# Save the submission to disk without writing the row index as a column.
sub.to_csv(path_or_buf='submission.csv', index=False)