In [20]:
# data reading:
import pandas as pd
import numpy as np

# Data Visualization:
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
class PATH:
    """Relative paths of the input files used by this notebook."""
    main = 'Data/'
    train = f'{main}Train.csv'
    test = f'{main}test.csv'
    ss = f'{main}SampleSubmission.csv'

In [22]:
# Load both files and put their rows in the same order.
sort_keys = ['country','city','site_id','date','hour']

train_df = pd.read_csv(PATH.train)
train_df = train_df.sort_values(by=sort_keys).reset_index(drop=True)

test_df = pd.read_csv(PATH.test)
test_df = test_df.sort_values(by=sort_keys).reset_index(drop=True)

# Independent copy of the ids and target values, row-aligned with train_df.
target = train_df.loc[:, ['id','pm2_5']].copy()

print(f'train shape :{train_df.shape}\ntest shape :{test_df.shape}')

train shape :(8071, 80)
test shape :(2783, 79)


* ### Based on the insights from the EDA notebook, we preprocess the data as follows:
* drop the features which have more than 60% missing values
* replace each group of related features with the row-wise mean of the group
* drop the target outliers
* apply label encoding to the categorical features

# Data Cleaning :

In [23]:
# Drop the rows whose target value lies above the 98th percentile of pm2_5.
thresh = target.pm2_5.quantile(0.98)
print(thresh)

# Select the rows with a boolean mask rather than np.where positions:
# DataFrame.drop interprets its argument as index *labels*, which only match
# row positions while the index is a fresh 0..n-1 range. The mask is converted
# to a plain array so the same row selection is applied to both frames
# regardless of their index labels (target is a row-for-row copy of train_df).
is_outlier = (target.pm2_5 > thresh).to_numpy()
print(f'there is about {is_outlier.sum()} outliers')

train_df = train_df[~is_outlier].reset_index(drop=True)
target = target[~is_outlier].reset_index(drop=True)


74.32143999999985
there is about 162 outliers


In [24]:
# Groups of column positions (0-based indices into a frame's column order; they
# are resolved to names with `columns[group]` inside Means). The columns of each
# group are replaced by one row-wise mean feature.
# NOTE(review): the positions presumably come from the EDA notebook and depend
# on the column order of the raw CSV files — confirm if the inputs change.
groups =[[8,10,16],
         [11,41],
         [56,68],
         [12,22,34,45,48,57,65,75],
         [13,23,35,44,49,58,64,76],
         [53,54],
         [14,24,36,43,50,59,66,77],
         [15,25,37,42,51,60,67,78],
         [26,27],
         [31,46],
         [21,33,47],
         [69,71,70,72]
         ]
# define a function that takes a list of column-position groups and replaces the columns of each group with a single row-wise mean feature
def Means(data,groups):
    """Collapse each group of columns into one row-wise mean column.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is copied, never modified.
    groups : list of list of int
        Column positions (into ``data.columns``) that belong together.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which the columns of the i-th group (1-based) are
        removed and a column ``'<i>_groub_mean'`` holding their row-wise mean
        is appended.
    """
    frame = data.copy()
    # Resolve every position to a column name before touching the frame:
    # removing columns would shift the positions of the later groups.
    names_per_group = [frame.columns[positions].tolist() for positions in groups]
    for number, names in enumerate(names_per_group, start=1):
        frame[f'{number}_groub_mean'] = frame[names].mean(axis=1)
        frame = frame.drop(columns=names)
    return frame

# The groups are column *positions*, and train_df has one more column than
# test_df (it still carries the target). Check that every position names the
# same column in both frames; otherwise the test features would silently be
# averaged from different columns than the train features.
assert all(
    train_df.columns[group].equals(test_df.columns[group]) for group in groups
), 'group positions refer to different columns in train_df and test_df'

train_df=Means(train_df,groups)
test_df=Means(test_df,groups)

In [25]:
# drop the features that consist mostly of missing values (more than 60% missing in train):
def MissingPerc(data):
    """Summarise the missing values of every column that has at least one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with ``'Total'`` (number of missing values)
        and ``'%'`` (share of missing values in percent, rounded to one
        decimal). Columns without missing values are left out.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()

    # absolute counts, largest first
    by_count = null_counts.sort_values(ascending=False)

    # the same counts as a percentage of the number of rows, largest first
    share = null_counts / null_mask.count() * 100
    by_share = share.round(1).sort_values(ascending=False)

    summary = pd.concat([by_count, by_share], axis=1, keys=['Total', '%'])
    return summary.loc[summary['Total'] > 0]

def DropMissing(data,perc):
    """Return ``data`` without the columns whose missing share exceeds ``perc``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter; it is not modified.
    perc : float
        Threshold in percent, compared against the ``'%'`` column reported by
        ``MissingPerc``.
    """
    report = MissingPerc(data)
    too_sparse = report.loc[report['%'] > perc].index
    return data.drop(columns=too_sparse)

# Columns with more than 60% missing values. The list is derived from the
# train set only and then removed from both frames, so they keep the same
# feature set.
missing_cols = [
    column for column, percent in MissingPerc(train_df)['%'].items() if percent > 60
]

train_df = train_df.drop(columns=missing_cols)
test_df = test_df.drop(columns=missing_cols)

# Data Preprocessing :

## helper functions :

In [26]:
from sklearn.base import BaseEstimator,TransformerMixin
class TIMES(BaseEstimator,TransformerMixin):
    """Transformer that expands date columns into calendar features.

    Parameters
    ----------
    features : list of str
        Columns to parse as dates. An empty list means every column of the
        frame passed to ``transform``.
    times : list of str
        Features to derive for each date column. Recognised values:
        ``'month'``, ``'day'``, ``'quarter'``, ``'week'``, ``'year'``,
        ``'day_month'``, ``'day_week'``, ``'week_month'``, ``'week_year'``,
        ``'month_year'``; ``'default'`` selects the first five and ``'all'``
        selects every one of them.
    """
    def __init__(self,features =[],times =[]):
        self.features =features
        self.times =times

    @staticmethod
    def _week_of_year(dates):
        """Return the week number of a datetime Series.

        ``Series.dt.week`` is tried first so existing results stay the same on
        older pandas; it no longer exists in pandas 2.x, where
        ``Series.dt.isocalendar().week`` is used instead. Only the missing
        attribute is caught, so any other error still surfaces.
        """
        try:
            return dates.dt.week
        except AttributeError:
            return dates.dt.isocalendar().week

    def AddTime(self,data,lst,times=[]):
        """Return a copy of ``data`` with calendar features added.

        Parameters
        ----------
        data : pandas.DataFrame
            Input frame; it is copied, never modified.
        lst : list of str
            Date columns to expand; when empty, all columns are used.
        times : list of str
            Requested features (see the class docstring).

        Returns
        -------
        pandas.DataFrame
            The copy, in which every column of ``lst`` is converted to
            datetime and followed by ``'<column>_<feature>'`` columns.
        """
        df =data.copy()
        if not lst:
            lst=list(df.columns)

        select_all = 'all' in times
        select_default = 'default' in times

        def wanted(name, in_default):
            # A feature is produced when requested by name, by 'all', or by
            # 'default' when it belongs to the default set.
            return (name in times) or select_all or (in_default and select_default)

        for feature in lst:
            # Convert the date feature to datetime objects
            df[feature] = pd.to_datetime(df[feature])
            dates = df[feature]

            if wanted('month', True):
                df[f'{feature}_month'] = dates.dt.month
            if wanted('day', True):
                df[f'{feature}_day'] = dates.dt.day
            if wanted('quarter', True):
                df[f'{feature}_quarter'] = dates.dt.quarter

            # The week number feeds four features; compute it once, and only
            # when needed, through the version-tolerant helper so that the
            # combined features work on pandas 2.x as well.
            week = None
            if (wanted('week', True) or wanted('day_week', False)
                    or wanted('week_month', False) or wanted('week_year', False)):
                week = self._week_of_year(dates)

            if wanted('week', True):
                df[f'{feature}_week'] = week
            if wanted('year', True):
                df[f'{feature}_year'] = dates.dt.year
            if wanted('day_month', False):
                df[f'{feature}_day_month'] = dates.dt.day.astype(str)+'_'+ dates.dt.month.astype(str)
            if wanted('day_week', False):
                df[f'{feature}_day_week'] = dates.dt.day.astype(str)+'_'+ week.astype(str)
            if wanted('week_month', False):
                df[f'{feature}_week_month'] = week.astype(str)+'_'+ dates.dt.month.astype(str)
            if wanted('week_year', False):
                df[f'{feature}_week_year'] = week.astype(str)+'_'+ dates.dt.year.astype(str)
            if wanted('month_year', False):
                df[f'{feature}_month_year'] = dates.dt.month.astype(str)+'_'+ dates.dt.year.astype(str)
        return df

    def fit(self,X,y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self,X,y=None):
        """Apply ``AddTime`` to ``X`` with the configured features and times."""
        X =self.AddTime(X,lst =self.features,times =self.times)
        return X

## Let's add the time-related features

In [27]:
# Stack train (without the target) on top of test so that both receive the
# same date features and share one label encoding.
df = pd.concat([train_df.drop(columns='pm2_5'), test_df])
time = TIMES(features=['date'], times=['default'])
df = time.fit_transform(df)

# Replace the values of the categorical/time columns by integer codes
# (pd.factorize numbers the distinct values in order of first appearance).
labels_features = ['city', 'hour', 'month', 'country', 'date',
                   'date_day', 'date_month', 'date_week', 'date_quarter']
df = df.assign(**{name: pd.factorize(df[name])[0] for name in labels_features})

In [28]:
# Split the combined frame back into train and test by matching the ids of
# the frames it was built from.
train_df = df.loc[df['id'].isin(train_df['id'])].reset_index(drop=True)
test_df = df.loc[df['id'].isin(test_df['id'])].reset_index(drop=True)

In [29]:
# Re-attach the target. The assignment aligns on the row index, not on 'id'.
# NOTE(review): this relies on target and train_df still sharing the same
# 0..n-1 row order — confirm if any step in between starts reordering rows.
train_df['pm2_5'] = target.loc[target['id'].isin(train_df['id']), 'pm2_5']

* ### Finally, let's save our data for modelling:

In [30]:
# Write the processed frames next to the raw data, without the row index.
train_df.to_csv(f'{PATH.main}processed_train.csv', index=False)
test_df.to_csv(f'{PATH.main}processed_test.csv', index=False)