In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import numpy as np 
import pandas as pd
from scipy.stats import mode
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from sklearn.ensemble import BaggingClassifier

In [3]:
# All competition files live in the same Kaggle input folder; keep the folder
# in one place so the notebook can be pointed at another location by editing
# a single line.
DATA_DIR = "/kaggle/input/umojahack-2022-faulty-sensors"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
# Submission template; its Offset_fault column is overwritten with predictions later.
sub = pd.read_csv(f"{DATA_DIR}/SampleSubmission.csv")

In [4]:
# Preview the first five rows of the training data.
train.head(5)

In [5]:
# Number of missing values per column in the training data.
train.isna().sum()

In [6]:
# Number of missing values per column in the test data.
test.isna().sum()

In [7]:
# Remove the two weather columns from both frames.
# NOTE(review): presumably dropped because of the missing values inspected above - confirm.
to_drop = ['Temperature', 'Relative_Humidity']
train, test = (frame.drop(columns=to_drop) for frame in (train, test))

In [8]:
def extract_date(data,date_col):
    """Add calendar/time features derived from ``data[date_col]`` to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend. It is mutated in place; ``date_col`` itself is left
        untouched (the aggregation cell further down still reads it).
    date_col : str
        Name of the column holding the timestamps; parsed with ``pd.to_datetime``.

    Returns
    -------
    None
        The new columns are ``year``, ``month``, ``day``, ``dayofweek``, ``hour``,
        ``weekofyear``, ``quarter``, ``is_weekend``, ``minute`` and ``second``.
    """
    date = pd.to_datetime(data[date_col])

    data['year'] = date.dt.year
    data['month'] = date.dt.month
    data['day'] = date.dt.day
    data['dayofweek'] = date.dt.dayofweek
    data['hour'] = date.dt.hour

    # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` is the supported replacement. It returns a nullable
    # UInt32 column, so cast back to the plain dtype the old accessor produced
    # (int64, or float64 with NaN when some timestamps are missing).
    iso_week = date.dt.isocalendar().week
    data['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    data['quarter'] = date.dt.quarter
    # dayofweek is 0 = Monday ... 6 = Sunday, so 5 and 6 are Saturday and Sunday.
    data['is_weekend'] = np.where(data['dayofweek'].isin([5,6]),1,0)
    data['minute'] = date.dt.minute
    data['second'] = date.dt.second
    

# creating date columns

In [9]:
# Add the calendar features to both frames (extract_date mutates its argument).
for frame in (train, test):
    extract_date(frame, 'Datetime')

# dropping the ID column

In [10]:
# Keep every column except the first one (positional drop) in both frames.
# NOTE: not idempotent - re-running this cell removes one more column each time.
train, test = train.iloc[:, 1:], test.iloc[:, 1:]

In [11]:
# Keep the label column aside before it is removed from the feature frame.
target = train['Offset_fault']

In [12]:
### merging of train and test

In [13]:
# Stack train (without the label) on top of test so the feature engineering
# below is applied to both at once; the index is renumbered 0..n-1.
train_features = train.drop(columns='Offset_fault')
tot_data = pd.concat([train_features, test], axis=0, ignore_index=True)

# Feature Engineering

### 1. Applying frequency encoding to the date columns created

In [14]:
# Replace each value by its relative frequency within the column.
# Columns are selected by position (everything from the 4th column on);
# presumably these are the date features added by extract_date - verify the column order.
freq_enc = tot_data.columns[3:]
for col in freq_enc:
    relative_freq = tot_data[col].value_counts(normalize=True)
    tot_data[col] = tot_data[col].map(relative_freq)

In [15]:
# Inspect the first five rows after frequency encoding.
tot_data.head(5)

### 2. creating time based aggregates for both sensor values

In [16]:
date_col = 'Datetime'
tot_data[date_col] = pd.to_datetime(tot_data[date_col])

# String keys identifying the minute and the hour each reading belongs to.
tot_data['date_index1'] = tot_data[date_col].dt.strftime("%m/%d/%Y %H:%M")
tot_data['date_index2'] = tot_data[date_col].dt.strftime("%m/%d/%Y %H")

# Per-minute ("m") and per-hour ("h") mean of each sensor, broadcast back to
# every row of the group. Selecting the sensor column before aggregating
# avoids averaging every column of the frame four times, and avoids the
# TypeError pandas >= 2.0 raises when a whole-frame groupby().mean() meets
# the string/datetime columns.
for group_key, granularity in (('date_index1', 'm'), ('date_index2', 'h')):
    grouped = tot_data.groupby(group_key)
    tot_data[f's1_{granularity}_avg'] = grouped['Sensor1_PM2.5'].transform('mean')
    tot_data[f's2_{granularity}_avg'] = grouped['Sensor2_PM2.5'].transform('mean')

### Dropping the date columns

In [17]:
# The raw timestamp and the two grouping keys are no longer needed as features.
to_drop = ['Datetime', 'date_index1', 'date_index2']
tot_data = tot_data.drop(columns=to_drop)

In [18]:
# Inspect the first five rows of the final feature table.
tot_data.head(5)

### splitting the merged data after feature engineering

In [19]:
# The first len(train) rows of tot_data came from train, the rest from test.
split = len(train)
train, test = tot_data.iloc[:split], tot_data.iloc[split:]

### creating train and validation

In [20]:
# Features / labels, then a 90/10 hold-out split with a fixed seed.
X, y = train, target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

### Model Building

In [21]:
# Bagging ensemble whose base learner is a 200-tree LightGBM classifier.
# (Despite the "_lr" suffix this is not a logistic regression.)
base_lgbm = LGBMClassifier(n_estimators=200, random_state=42)
model_lr = BaggingClassifier(base_lgbm, random_state=42)
model_lr.fit(X_train, y_train)

# Accuracy on the hold-out split; shown as the cell output.
val_pred = model_lr.predict(X_test)
accuracy_score(y_test, val_pred)

## LB Score : 0.9245053391959799

In [22]:
# Item assignment always writes a DataFrame column; attribute-style assignment
# (sub.Offset_fault = ...) would only set a Python attribute, and therefore
# write no predictions to the CSV, if the column were missing from the template.
sub['Offset_fault'] = model_lr.predict(test)
sub.to_csv('fe2bag_lgb_.csv',index = False)

### Cross-validation: 25 folds

In [23]:
# 25-fold cross-validation of the bagged LightGBM model.
# For every fold the model is refit, its predictions on the held-out rows are
# stored as out-of-fold predictions, and its predictions on the test set are kept
# so the folds can be combined by majority vote afterwards.
kf = KFold(n_splits=25,shuffle=True, random_state=42)
y_copy = y.copy()        # overwritten fold by fold with out-of-fold predictions
total_test_pred = []     # one array of test-set predictions per fold

# Fold-specific names are used so the hold-out split (X_train, X_test, y_train,
# y_test, val_pred) created earlier is not silently overwritten by the loop.
for iteration, (train_index, val_index) in enumerate(kf.split(X), start=1):
    fold_X_train, fold_X_val = X.iloc[train_index], X.iloc[val_index]
    fold_y_train = y.iloc[train_index]

    ## classifier used
    model = BaggingClassifier(LGBMClassifier(n_estimators=200,random_state = 42),random_state =  42)
    model.fit(fold_X_train, fold_y_train)

    # Predict on DataFrames (not .values): the model was fitted with column
    # names, and passing bare arrays makes scikit-learn warn on every call.
    y_copy.iloc[val_index] = model.predict(fold_X_val)
    total_test_pred.append(model.predict(test))

    print('iter :::::::::  ', iteration)

# Majority vote across folds for every test row. The layout of the returned
# mode array depends on the SciPy version (see the `keepdims` parameter).
total_test_pred = mode(total_test_pred,axis = 0)
print('accuracy total', accuracy_score(y,y_copy))

### Saving the test predictions

In [24]:
# total_test_pred is the result of scipy.stats.mode(..., axis=0); element [0] is
# the array of modal (majority-vote) predictions. Older SciPy returns it with
# shape (1, n_test), while SciPy >= 1.11 (keepdims=False by default) returns
# shape (n_test,). Indexing [0][0] on the latter would yield a single scalar and
# fill the whole submission with one value, so flatten instead.
sub['Offset_fault'] = np.ravel(total_test_pred[0])
# The original file name was missing the dot before the "csv" extension.
sub.to_csv('fe_bag_lgb_cv25.csv',index = False)