#### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib import rcParams

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from xgboost import plot_importance, plot_tree

# Apply seaborn's 'whitegrid' style to the figures created in this notebook
sb.set_style('whitegrid')
# IPython magic: show matplotlib figures inline, below the cell that draws them
%matplotlib inline

#### Loading Dataset

In [None]:
# Load the raw hackathon export and report its (rows, columns) shape
data = pd.read_csv('Sangam_2019_Hackathon_Data.csv')
print(data.shape)

# Display tweaks.
# Use the fully qualified option keys: pandas matches option names as patterns,
# and the bare 'max_columns' / 'max_rows' also match the 'styler.render.*'
# options on newer pandas, which makes set_option raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 1000)

In [None]:
# Keep an untouched deep copy of the raw frame; `data` is modified in place by the cells below
copy_data = data.copy(deep=True)

In [None]:
# Peek at the first five rows (head() defaults to n=5)
data.head()

# A. Data Preprocessing

#### 1. Exploring the Columns of Dataset

In [None]:
# A. Descriptive statistics of the dataset as a first sanity check of value ranges
data.describe()

#### 2. Checking for Missing Values

In [None]:
# Count and percentage of missing values in each column
null_counts = data.isnull().sum()
missing_data = pd.DataFrame({
    'total_missing': null_counts,
    'perc_missing': (null_counts / data.shape[0]) * 100,
})
missing_data

In [None]:
# Drop the unneeded 'S.no' column
data.drop(columns=['S.no'], inplace=True)

# Discard every row that contains at least one missing value
data.dropna(inplace=True)

# Remove rows whose device_id holds one of the two irrelevant values
bad_device_rows = data.index[data.device_id.isin(['{"success":1', 'S81'])]
data.drop(index=bad_device_rows, inplace=True)

# Part 1(A)

### A(i) Rectifying The Timestamp Information

In [None]:
%%time
# Parse both time columns; `timestamp` entries that cannot be parsed become NaT
data['svrtime'] = pd.to_datetime(data['svrtime'], infer_datetime_format=True)
data['timestamp'] = pd.to_datetime(data['timestamp'], errors='coerce', infer_datetime_format=True)

# Where the timestamp could not be parsed, fall back to the server time of the same row
ts_missing = data['timestamp'].isnull()
data.loc[ts_missing, 'timestamp'] = data.loc[ts_missing, 'svrtime']

# Splice the two columns as text: the first 10 characters (the date part) come
# from svrtime, everything after character 10 (the time-of-day part) from timestamp
svr_text = data['svrtime'].astype('str')
ts_text = data['timestamp'].astype('str')
spliced_text = svr_text.str.slice(0, 10) + ts_text.str.slice(10)

# Back to datetime for both columns
data['svrtime'] = pd.to_datetime(svr_text, infer_datetime_format=True)
data['timestamp'] = pd.to_datetime(spliced_text, infer_datetime_format=True)


### A(ii) Rectify/Impute the Location Data (GPS Coordinates)

In [None]:
# Coordinates of exactly 0 or 1 are treated as invalid and turned into NaN
data['latitude'] = data['latitude'].replace(to_replace=[0, 1], value=np.nan)
data['longitude'] = data['longitude'].replace(to_replace=[0, 1], value=np.nan)


def _fill_with_mode(frame, mask, column):
    """Fill NaNs of `column` on the rows selected by `mask` with those rows' most frequent value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that is modified in place.
    mask : boolean Series aligned with `frame`
        Selects the rows (here: one device) to work on.
    column : str
        Column to impute.

    The selection is left untouched when it has no valid value at all: `mode()`
    is empty in that case and indexing it with `[0]` would raise.
    """
    subset = frame.loc[mask, column]
    modes = subset.mode()
    if not modes.empty:
        frame.loc[mask, column] = subset.fillna(modes.iloc[0])


gps_columns = ['latitude', 'longitude']

# S9: missing coordinates are replaced with the given fixed coordinates
is_s9 = data.device_id == 'S9'
data.loc[is_s9, 'latitude'] = data.loc[is_s9, 'latitude'].fillna(value=13.08827)
data.loc[is_s9, 'longitude'] = data.loc[is_s9, 'longitude'].fillna(value=80.181568)

# S4 and M7: missing coordinates are replaced with the device's mode
for device in ['S4', 'M7']:
    for column in gps_columns:
        _fill_with_mode(data, data.device_id == device, column)

# Every device: forward-fill the remaining gaps from the previous reading of the
# same device, then use the device's mode for whatever is still missing
# (i.e. gaps that precede the device's first valid reading).
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
for s in data.device_id.unique():
    is_device = data.device_id == s
    for column in gps_columns:
        data.loc[is_device, column] = data.loc[is_device, column].ffill()
        _fill_with_mode(data, is_device, column)


###  B.   Finding Outliers and imputing them

#### 1. Finding Outliers and Imputing  them. ["Humidity", 'Temperature', 'heat_index']

In [None]:
# Visual check of the three climate readings before outlier handling,
# one panel per column, plotted against the frame's index
plt.figure(figsize=(12, 10))

plt.subplot(3, 2, 1)
plt.plot(data.humidity, color='red')
plt.title("Humidity")

plt.subplot(3, 2, 2)
plt.plot(data.temperature, color='blue')
plt.title("Temperature")  # title typo ("Temprature") fixed

plt.subplot(3, 2, 3)
plt.plot(data.heat_index)
plt.title("Heat Index", color='black')

# Keep titles and tick labels of neighbouring panels from overlapping
plt.tight_layout()


In [None]:
# Handle out-of-range readings: each one is overwritten with the median of its
# whole column (computed before the overwrite, over all devices)
humidity_median = data['humidity'].median()
data.loc[data['humidity'] > 100, 'humidity'] = humidity_median

temperature_median = data['temperature'].median()
data.loc[data['temperature'] > 100, 'temperature'] = temperature_median

heat_index_median = data['heat_index'].median()
data.loc[data['heat_index'] < 0, 'heat_index'] = heat_index_median

In [None]:
# Readings of exactly 0 or 1 are treated as invalid and turned into NaN
hth_cols = ['humidity', 'temperature', 'heat_index']
for col in hth_cols:
    data[col] = data[col].replace(to_replace=[0, 1], value=np.nan)


# Impute the gaps in ['humidity', 'temperature', 'heat_index'] device by device
# (every device except M5, which is handled separately below)
st_device = ['S5', 'S3', 'S4', 'S1','S10', 'S8', 'S9', 'S6', 'S7', 'M7', "M1", 'M2', 'M3', 'M4', 'M6']
for s in st_device:
    is_device = data.device_id == s

    # Per-device means, taken BEFORE filling and selected by column NAME.
    # The previous `s_data.mean().tolist()[2:5]` depended on the column order of
    # the frame and on DataFrame.mean() silently skipping non-numeric columns
    # (with pandas >= 2.0 numeric_only defaults to False, so it no longer does).
    mean_hth = data.loc[is_device, hth_cols].mean()

    for col in hth_cols:
        # Carry the previous valid reading of the same device forward ...
        filled = data.loc[is_device, col].ffill()
        # ... and use the device mean for gaps before the first valid reading
        if pd.notna(mean_hth[col]):
            filled = filled.fillna(mean_hth[col])
        data.loc[is_device, col] = filled


# Column shift is only present in device M5: for its rows with temperature > 60
# the value found in the temperature column is moved into humidity, and
# temperature is replaced by M5's mean temperature.
# NOTE(review): that mean is computed over all M5 rows, including the > 60 ones.
s_data = data[data.device_id == 'M5']
t_index = s_data[s_data.temperature > 60].index
temp = data.loc[t_index, 'temperature']
humidity = data.loc[t_index, 'humidity']  # original humidity values; not used in this cell
data.loc[t_index, 'temperature'] = s_data.temperature.mean()
data.loc[t_index, 'humidity'] = temp

# Any row that still holds a NaN is dropped. This includes M5 gaps, which the
# loop above does not impute because M5 is not in st_device.
data.dropna(inplace=True)

#### 2. Handling Outliers of Longitude and Latitude for Stationary Devices

In [None]:
# Give every row of a stationary device one representative coordinate pair.
# The median is used for BOTH axes: latitude previously used the mean, which is
# pulled towards exactly the outliers this step is meant to neutralise and was
# inconsistent with the median already used for longitude.
s_device = ['S5', 'S3', 'S4', 'S1', 'M7','S10', 'S8', 'S9', 'S6', 'S7']
for s in s_device:
    is_device = data.device_id == s

    lat = data.loc[is_device, 'latitude'].median()
    log = data.loc[is_device, 'longitude'].median()

    data.loc[is_device, 'latitude'] = lat
    data.loc[is_device, 'longitude'] = log
    

#### 3. Handling Outliers of Longitude and Latitude for Moving Devices

In [None]:
# LATITUDE: one panel per moving device, plotted against the frame's index
m_device = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6']

plt.figure(figsize=(18, 16))
for panel, m in enumerate(m_device, start=1):
    plt.subplot(6, 2, panel)
    plt.plot(data.loc[data.device_id == m, 'latitude'], color='green')
    plt.title(m)

In [None]:
m_device = ['M1', 'M2', 'M3', 'M4', 'M5', "M6"]

# Accepted latitude window per device, positionally aligned with m_device
# (presumably read off the plots of the previous cell)
mx_lat = [29.5,29.5,29.5,29.5,14,14]
mi_lat = [28.2,28.2,28.2,28.2,12.5,12]

# zip() keeps device and bounds paired without a hand-maintained counter
for m, lat_min, lat_max in zip(m_device, mi_lat, mx_lat):
    is_device = data.device_id == m

    # Readings outside the device's window become NaN
    outside = (data['latitude'] > lat_max) | (data['latitude'] < lat_min)
    data.loc[is_device & outside, 'latitude'] = np.nan

    # Repair from the neighbouring readings of the same device: previous valid
    # value first, next valid value for gaps at the very start.
    # `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)`.
    # The former third step, `fillna(value=<...>.mode())`, passed a whole Series
    # (aligned on index labels 0..k) instead of a scalar and could never fill
    # anything that ffill + bfill had left, so it is dropped.
    data.loc[is_device, 'latitude'] = data.loc[is_device, 'latitude'].ffill().bfill()
    

In [None]:
# LONGITUDE: one panel per moving device, plotted against the frame's index
m_device = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6']

plt.figure(figsize=(18, 16))
for panel, m in enumerate(m_device, start=1):
    plt.subplot(6, 2, panel)
    plt.plot(data.loc[data.device_id == m, 'longitude'], color='blue')
    plt.title(m)

In [None]:
m_device = ['M1', 'M2', 'M3', 'M4', 'M5', "M6"]

# Accepted longitude window per device, positionally aligned with m_device
# (presumably read off the plots of the previous cell)
mx_log = [78,78,78,78,81,81]
mi_log = [76.5,76.5,76.5,76.5,79,79]

# zip() keeps device and bounds paired without a hand-maintained counter
for m, lon_min, lon_max in zip(m_device, mi_log, mx_log):
    is_device = data.device_id == m

    # Readings outside the device's window become NaN
    outside = (data['longitude'] > lon_max) | (data['longitude'] < lon_min)
    data.loc[is_device & outside, 'longitude'] = np.nan

    # Repair from the neighbouring readings of the same device: previous valid
    # value first, next valid value for gaps at the very start.
    # `.ffill()` / `.bfill()` replace the deprecated `fillna(method=...)`.
    # The former third step, `fillna(value=<...>.mode())`, passed a whole Series
    # (aligned on index labels 0..k) instead of a scalar and could never fill
    # anything that ffill + bfill had left, so it is dropped.
    data.loc[is_device, 'longitude'] = data.loc[is_device, 'longitude'].ffill().bfill()

#### 4. Handling Outliers in UV Column

In [None]:
# UV readings of all devices against the frame's index, to spot outliers
plt.plot(data['uv'])

In [None]:
# UV readings above 10 are treated as outliers and replaced with the mode of
# the affected device's remaining readings
uv_list = list(data[data.uv>10].device_id.value_counts().keys())
data.loc[data.uv > 10, "uv"] = np.nan

for u in uv_list:
    is_device = data.device_id == u
    uv_modes = data.loc[is_device, 'uv'].mode()
    # mode() is empty when every reading of the device was an outlier;
    # indexing it with [0] would raise, so such a device is left as is
    if uv_modes.empty:
        continue
    data.loc[is_device, 'uv'] = data.loc[is_device, 'uv'].fillna(value=uv_modes.iloc[0])


#### 5. Handling Outliers in pm01

In [None]:
# pm01 per device, one panel each, plotted against the frame's index
s_device = ['S5', 'S3', 'S4', 'S1', 'M7','S10', 'S8', 'S9', 'S6', 'S7']

plt.figure(figsize=(18, 16))
for panel, s in enumerate(s_device, start=1):
    plt.subplot(10, 2, panel)
    plt.plot(data.loc[data.device_id == s, 'pm01'], color='red')
    plt.title(s)

In [None]:
s_device = ['S5', 'S3', 'S4', 'S1', "S8","S6", 'S10', 'S9', "S7"]
# Upper pm01 limit per device, positionally aligned with s_device
p01_data = [250,250,250,250,150,150,400,200,100]

# zip() keeps device and limit paired without a hand-maintained counter
for s, pm01_cap in zip(s_device, p01_data):
    is_device = data.device_id == s

    # Readings above the device's limit become NaN ...
    data.loc[is_device & (data['pm01'] > pm01_cap), 'pm01'] = np.nan
    # ... are forward-filled from the previous valid reading of the same device
    # (`.ffill()` replaces the deprecated `fillna(method='ffill')`) ...
    filled = data.loc[is_device, 'pm01'].ffill()
    # ... and gaps before the first valid reading get the mean of the filled series
    data.loc[is_device, 'pm01'] = filled.fillna(value=filled.mean())

# Device M7: here readings BELOW 10 are the ones treated as outliers
is_m7 = data.device_id == "M7"
data.loc[is_m7 & (data['pm01'] < 10), 'pm01'] = np.nan
filled = data.loc[is_m7, 'pm01'].ffill()
data.loc[is_m7, 'pm01'] = filled.fillna(value=filled.mean())


#### 6. Handling Outliers in pm25

In [None]:
# pm25 per device, one panel each, plotted against the frame's index
s_device = ['S5', 'S3', 'S4', 'S1', 'M7','S10', 'S8', 'S9', 'S6', 'S7']

plt.figure(figsize=(18, 16))
for panel, s in enumerate(s_device, start=1):
    plt.subplot(10, 2, panel)
    plt.plot(data.loc[data.device_id == s, 'pm25'], color='red')
    plt.title(s)


In [None]:
s_device = ['S5', 'S3', 'S4', 'S1', "S8","S6", 'S10', 'S9', "S7"]
# Upper pm25 limit per device, positionally aligned with s_device
pm25_data = [400,350,450,450,250,150,500,400,95]

# zip() keeps device and limit paired without a hand-maintained counter
for s, pm25_cap in zip(s_device, pm25_data):
    is_device = data.device_id == s

    # Readings above the device's limit become NaN ...
    data.loc[is_device & (data['pm25'] > pm25_cap), 'pm25'] = np.nan
    # ... are forward-filled from the previous valid reading of the same device
    # (`.ffill()` replaces the deprecated `fillna(method='ffill')`) ...
    filled = data.loc[is_device, 'pm25'].ffill()
    # ... and gaps before the first valid reading get the mean of the filled series
    data.loc[is_device, 'pm25'] = filled.fillna(value=filled.mean())

# Device M7: here readings BELOW 20 are the ones treated as outliers
is_m7 = data.device_id == "M7"
data.loc[is_m7 & (data['pm25'] < 20), 'pm25'] = np.nan
filled = data.loc[is_m7, 'pm25'].ffill()
data.loc[is_m7, 'pm25'] = filled.fillna(value=filled.mean())


#### 7. Handling Outliers in pm10

In [None]:
# pm10 per device, one panel each, plotted against the frame's index
s_device = ['S5', 'S3', 'S4', 'S1', 'M7','S10', 'S8', 'S9', 'S6', 'S7']

plt.figure(figsize=(18, 16))
for panel, s in enumerate(s_device, start=1):
    plt.subplot(10, 2, panel)
    plt.plot(data.loc[data.device_id == s, 'pm10'], color='red')
    plt.title(s)

In [None]:
s_device = ['S5', 'S3', 'S4', 'S1', "S8","S6", 'S10', 'S9', "S7", "M7"]
# Upper pm10 limit per device, positionally aligned with s_device
# NOTE(review): M7 gets an UPPER limit here, whereas the pm01 / pm25 cells
# discard M7 readings below a threshold - confirm this is intended.
pm10_data = [450,450,450,480,350,250,450,450,190,45]

# zip() keeps device and limit paired without a hand-maintained counter
for s, pm10_cap in zip(s_device, pm10_data):
    is_device = data.device_id == s

    # Readings above the device's limit become NaN ...
    data.loc[is_device & (data['pm10'] > pm10_cap), 'pm10'] = np.nan
    # ... are forward-filled from the previous valid reading of the same device
    # (`.ffill()` replaces the deprecated `fillna(method='ffill')`) ...
    filled = data.loc[is_device, 'pm10'].ffill()
    # ... and gaps before the first valid reading get the mean of the filled series
    data.loc[is_device, 'pm10'] = filled.fillna(value=filled.mean())
    
