In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler
import glob
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks 
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import RepeatedStratifiedKFold
import datetime

In [3]:
filePathTrain = "/content/drive/Shareddrives/ALDA_Project/ALDA/train.csv"
filePathTest = "/content/drive/Shareddrives/ALDA_Project/ALDA/test.csv"
trainData = pd.read_csv(filePathTrain)
testData = pd.read_csv(filePathTest)

# Convert extra columns in test to suitable columns in train

In [4]:
testData['measure'] = np.where(testData['measure'] == 'gust02', 'ws02', testData['measure'])
testData['measure'] = np.where(testData['measure'] == 'wd02', 'ws02', testData['measure'])
testData['measure'] = np.where(testData['measure'] == 'gust10', 'ws10', testData['measure'])
testData['measure'] = np.where(testData['measure'] == 'wd10', 'ws10', testData['measure'])

# One Hot encode train and test data for measure colum

In [5]:
dfTrain = pd.get_dummies(trainData.measure, prefix='measure')
dfTest = pd.get_dummies(testData.measure, prefix='measure')
encodedDataTrain = pd.concat([trainData[["Station", "Ob", "value", "target", "R_flag", "I_flag", "Z_flag", 'B_flag']], dfTrain], axis = 1)
encodedDataTest = pd.concat([testData[["Station", "Ob", "value", "R_flag", "I_flag", "Z_flag", 'B_flag']], dfTest], axis = 1)

In [6]:
print(encodedDataTrain.columns)
print(encodedDataTest.columns)

Index(['Station', 'Ob', 'value', 'target', 'R_flag', 'I_flag', 'Z_flag',
       'B_flag', 'measure_blackglobetemp', 'measure_impact',
       'measure_leafwetness', 'measure_par', 'measure_precip', 'measure_pres',
       'measure_rh_hmp', 'measure_rh_wxt', 'measure_sm', 'measure_sr',
       'measure_st', 'measure_temp10', 'measure_temp_wxt', 'measure_ws02',
       'measure_ws06', 'measure_ws10'],
      dtype='object')
Index(['Station', 'Ob', 'value', 'R_flag', 'I_flag', 'Z_flag', 'B_flag',
       'measure_blackglobetemp', 'measure_impact', 'measure_leafwetness',
       'measure_par', 'measure_precip', 'measure_pres', 'measure_rh_hmp',
       'measure_rh_wxt', 'measure_sm', 'measure_sr', 'measure_st',
       'measure_temp10', 'measure_temp_wxt', 'measure_ws02', 'measure_ws06',
       'measure_ws10'],
      dtype='object')


In [8]:
encodedDataTrainX = encodedDataTrain.drop(columns=["target"], axis=1)
encodedDataTrainX1 = encodedDataTrain.drop(columns=["Ob", "Station"], axis=1)
encodedDataTrainY = encodedDataTrain["target"]


encodedDataTestX1 = encodedDataTest.drop(columns=["Ob", "Station"], axis=1)

In [9]:
print(encodedDataTrainY)

0          False
1          False
2          False
3          False
4          False
           ...  
6593269    False
6593270    False
6593271    False
6593272    False
6593273    False
Name: target, Length: 6593274, dtype: bool


In [10]:
totalDf = pd.concat([encodedDataTrainX1, encodedDataTestX1], axis = 0)
totalDf.columns

Index(['value', 'target', 'R_flag', 'I_flag', 'Z_flag', 'B_flag',
       'measure_blackglobetemp', 'measure_impact', 'measure_leafwetness',
       'measure_par', 'measure_precip', 'measure_pres', 'measure_rh_hmp',
       'measure_rh_wxt', 'measure_sm', 'measure_sr', 'measure_st',
       'measure_temp10', 'measure_temp_wxt', 'measure_ws02', 'measure_ws06',
       'measure_ws10', 'val_squared'],
      dtype='object')

# Standardize data using MinMaxScalar based on measure values

In [11]:
normalizedTrainDf = pd.DataFrame()
normalizedTestDf = pd.DataFrame()

for columns in encodedDataTrainX1.columns[-16:]:
    existsTrainDf = encodedDataTrain.loc[encodedDataTrain[columns] == 1]
    existsTestDf = encodedDataTest.loc[encodedDataTest[columns] == 1]
    print(columns)
    # print(existsTrainDf.head())
    myScaler1 = MinMaxScaler(feature_range=(0, 1))#RobustScaler()
    # myScaler = MinMaxScaler(feature_range=(0, 1)) #StandardScaler() 
    myFit1 = myScaler1.fit(existsTrainDf[['value']])
    myTransformedTrain = myScaler1.transform(existsTrainDf[['value']])
    myTransformedTest = myScaler1.transform(existsTestDf[['value']])

    myTrainFitDf = pd.DataFrame(myTransformedTrain, columns=['value'], index = existsTrainDf.index)
  
    myTestFitDf = pd.DataFrame(myTransformedTest, columns=['value'], index = existsTestDf.index)


    existsTrainDf['value'] = myTrainFitDf['value']
    existsTestDf['value'] = myTestFitDf['value']


    normalizedTrainDf = pd.concat([normalizedTrainDf, existsTrainDf], axis=0)
    normalizedTestDf = pd.concat([normalizedTestDf, existsTestDf], axis=0)


print(normalizedTrainDf.shape)
print(normalizedTrainDf.head(5))
normalizedTrainDf = normalizedTrainDf.sort_index(ascending=True)
print(normalizedTrainDf.shape)
print(normalizedTrainDf.head(5))

normalizedTestDf = normalizedTestDf.sort_index(ascending=True)


measure_impact
(2, 25)
         value
1371424    0.0
2780679    1.0
(2, 25)
measure_leafwetness
(995, 25)
           value
412135  0.994596
412136  0.995947
412137  1.000000
412138  0.998649
412139  0.998649
(995, 25)
measure_par
(505886, 25)
        value
440  0.011877
441  0.012865
442  0.011877
443  0.011877
444  0.011877
(505886, 25)
measure_precip
(2859, 25)
            value
438      0.064516
439      0.059140
472831   0.059140
472832   0.064516
1221784  0.000000
(2859, 25)
measure_pres
(75281, 25)
            value
839302   0.946274
1061176  0.516464
1061177  0.516464
1061178  0.516464
1061179  0.516464
(75281, 25)
measure_rh_hmp
(1340, 25)
           value
471858  0.034233
471859  0.035292
471860  0.038108
471861  0.038098
471862  0.052172
(1340, 25)
measure_rh_wxt
(62329, 25)
     value
339  0.857
340  0.798
341  0.497
342  0.513
343  0.896
(62329, 25)
measure_sm
(3794242, 25)
          value
18277  0.439149
18278  0.439149
18279  0.439149
18280  0.439149
18281  0.439149
(3794

# Save normalized dataframe to csv

In [13]:
normalizedTrainDf.to_csv("/content/drive/Shareddrives/ALDA_Project/ALDA/encodedDataTrainFitX4.csv", index=False)
normalizedTestDf.to_csv("/content/drive/Shareddrives/ALDA_Project/ALDA/encodedDataTestFitX4.csv", index=False)