In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler
import glob
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks 
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import RepeatedStratifiedKFold
import datetime

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The CSV paths used by the later cells all live under this mount point,
# so this cell has to run before any data is read or written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Locations of the train / test CSV files on the mounted shared drive.
filePathTrain = "/content/drive/Shareddrives/ALDA_Project/ALDA/encodedDataTrainFitX4.csv"
filePathTest = "/content/drive/Shareddrives/ALDA_Project/ALDA/encodedDataTestFitX4.csv"

# Read both splits into DataFrames (train first, then test).
trainData, testData = map(pd.read_csv, (filePathTrain, filePathTest))

In [4]:
# Sanity check: show the first rows of the training frame as plain text.
print(trainData.head())

  Station              Ob     value  target  R_flag  I_flag  Z_flag  B_flag  \
0    AURO   1/2/2021 0:30  0.899196   False       2      -1       0       1   
1    AURO   1/2/2021 4:30  0.902907   False       2      -1       0       1   
2    AURO   1/2/2021 5:30  0.901051   False       2      -1       0       1   
3    AURO   1/2/2021 7:30  0.894867   False       2      -1       0       1   
4    AURO  2/16/2021 2:30  0.889920   False       2      -1       0       1   

   measure_blackglobetemp  measure_impact  ...  measure_rh_wxt  measure_sm  \
0                       0               0  ...               0           0   
1                       0               0  ...               0           0   
2                       0               0  ...               0           0   
3                       0               0  ...               0           0   
4                       0               0  ...               0           0   

   measure_sr  measure_st  measure_temp10  measure_temp_

In [5]:
# List the training columns; `Station` is the categorical column that the
# one-hot encoding step expands into station_* indicators.
trainData.columns

Index(['Station', 'Ob', 'value', 'target', 'R_flag', 'I_flag', 'Z_flag',
       'B_flag', 'measure_blackglobetemp', 'measure_impact',
       'measure_leafwetness', 'measure_par', 'measure_precip', 'measure_pres',
       'measure_rh_hmp', 'measure_rh_wxt', 'measure_sm', 'measure_sr',
       'measure_st', 'measure_temp10', 'measure_temp_wxt', 'measure_ws02',
       'measure_ws06', 'measure_ws10', 'val_squared'],
      dtype='object')

In [6]:
# One-hot encode the categorical `Station` column for both splits.
#
# The training split defines the indicator schema. Encoding each split
# independently yields different station_* column sets (and order) whenever
# a station occurs in only one of them, which breaks any model fitted on the
# training columns. The test indicators are therefore re-indexed onto the
# training indicator columns: stations absent from the test split become
# all-zero columns, and stations seen only in the test split are dropped
# because the training data carries no feature for them.
dfTrain = pd.get_dummies(trainData["Station"], prefix="station")
dfTest = pd.get_dummies(testData["Station"], prefix="station").reindex(
    columns=dfTrain.columns, fill_value=0
)

# Append the indicator columns and remove the raw categorical column.
# (`columns=` already selects the axis, so no separate `axis` argument.)
encodedDataTrain = pd.concat([trainData, dfTrain], axis=1).drop(columns=["Station"])
encodedDataTest = pd.concat([testData, dfTest], axis=1).drop(columns=["Station"])

In [7]:
# Sanity check: show the first rows of the encoded test frame; `Station`
# should be gone and station_* indicator columns should be present.
print(encodedDataTest.head())

                Ob     value  R_flag  I_flag  Z_flag  B_flag  \
0  3/25/2021 10:37  0.931973       2      -1       2       0   
1  7/11/2021 13:22  0.975263       2      -1       0       1   
2  7/11/2021 13:23  0.975263       2      -1       0       1   
3  7/11/2021 13:24  0.975263       2      -1       0       1   
4  7/11/2021 13:25  0.975881       2      -1       0       1   

   measure_blackglobetemp  measure_impact  measure_leafwetness  measure_par  \
0                       0               0                    0            0   
1                       0               0                    0            0   
2                       0               0                    0            0   
3                       0               0                    0            0   
4                       0               0                    0            0   

   ...  station_SILR  station_SPIN  station_SPRU  station_TAYL  station_UNCA  \
0  ...             0             0             0            

In [8]:
# Persist the encoded frames as CSV; index=False leaves the row index out.
# NOTE(review): these are the same paths that were read into trainData /
# testData earlier in the notebook, so this cell overwrites its own inputs.
# After it runs, the files no longer contain a `Station` column, and a
# Restart & Run All will fail in the one-hot encoding cell. Consider writing
# to separate output files - confirm which path downstream notebooks read.
encodedDataTrain.to_csv("/content/drive/Shareddrives/ALDA_Project/ALDA/encodedDataTrainFitX4.csv", index=False)
encodedDataTest.to_csv("/content/drive/Shareddrives/ALDA_Project/ALDA/encodedDataTestFitX4.csv", index=False)

In [9]:
# List every column of the encoded test frame to verify the station_* indicators.
print(encodedDataTest.columns)

Index(['Ob', 'value', 'R_flag', 'I_flag', 'Z_flag', 'B_flag',
       'measure_blackglobetemp', 'measure_impact', 'measure_leafwetness',
       'measure_par', 'measure_precip', 'measure_pres', 'measure_rh_hmp',
       'measure_rh_wxt', 'measure_sm', 'measure_sr', 'measure_st',
       'measure_temp10', 'measure_temp_wxt', 'measure_ws02', 'measure_ws06',
       'measure_ws10', 'val_squared', 'station_AURO', 'station_BAHA',
       'station_BALD', 'station_BEAR', 'station_BUCK', 'station_BURN',
       'station_CAST', 'station_CHAP', 'station_CLA2', 'station_CLAY',
       'station_CLIN', 'station_DURH', 'station_FLET', 'station_FRYI',
       'station_GOLD', 'station_HAML', 'station_JACK', 'station_JEFF',
       'station_KINS', 'station_LAKE', 'station_LAUR', 'station_LEWS',
       'station_LILE', 'station_MITC', 'station_NCAT', 'station_NEWL',
       'station_OXFO', 'station_PLYM', 'station_REED', 'station_REID',
       'station_ROCK', 'station_SALI', 'station_SASS', 'station_SILR',
       '