In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import glob
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import TomekLinks 
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.model_selection import RepeatedStratifiedKFold
import datetime

# Read Data

In [3]:
# Input CSV locations for the training and test sets.
filePathTrain = "C:/Users/athar/Downloads/encodedDataTrainFitX4.csv"
filePathTest = "C:/Users/athar/Downloads/encodedDataTestFitX4.csv"

# Load both files into DataFrames: training set first, then test set.
trainData, testData = (
    pd.read_csv(csvPath) for csvPath in (filePathTrain, filePathTest)
)

# Add time-of-day column (minutes since midnight)

In [4]:
# Derive the time-of-day feature "totalMinutes" (hour * 60 + minute) from "Ob".
# Hour and minute are read directly from the parsed timestamps through the
# vectorised .dt accessor. The earlier round trip
# datetime.time -> str -> split(':') -> int raised ValueError on any value with
# fractional seconds (e.g. "12:34:56.5" has a non-integer third field), created
# one Python object per row, and needed four scratch columns ("AOb", "h", "m",
# "s") that were dropped again straight away.
# NOTE(review): infer_datetime_format is kept so parsing matches the cell that
# converts "Ob" to datetime; pandas >= 2.0 reports the argument as deprecated.
trainObParsed = pd.to_datetime(trainData["Ob"], infer_datetime_format=True)
# astype(int) keeps an integer column and still fails loudly on unparseable or
# missing timestamps instead of silently storing NaN.
trainData["totalMinutes"] = (
    trainObParsed.dt.hour * 60 + trainObParsed.dt.minute
).astype(int)

In [5]:
# Derive the time-of-day feature "totalMinutes" (hour * 60 + minute) from "Ob"
# for the test set, using the same formula as for the training set.
# Hour and minute are read directly from the parsed timestamps through the
# vectorised .dt accessor. The earlier round trip
# datetime.time -> str -> split(':') -> int raised ValueError on any value with
# fractional seconds (e.g. "12:34:56.5" has a non-integer third field), created
# one Python object per row, and needed four scratch columns ("AOb", "h", "m",
# "s") that were dropped again straight away.
# NOTE(review): pandas >= 2.0 reports infer_datetime_format as deprecated; it is
# kept here so parsing behaves the same as in the rest of the notebook.
testObParsed = pd.to_datetime(testData["Ob"], infer_datetime_format=True)
# astype(int) keeps an integer column and still fails loudly on unparseable or
# missing timestamps instead of silently storing NaN.
testData["totalMinutes"] = (
    testObParsed.dt.hour * 60 + testObParsed.dt.minute
).astype(int)

# Convert Ob column to datetime format

In [6]:
# Replace the raw "Ob" values of the training frame with parsed pandas
# timestamps so the seasonal filters can compare them with datetime bounds.
parsedOb = pd.to_datetime(trainData["Ob"], infer_datetime_format=True)
trainData["Ob"] = parsedOb

# Split data into dataframes based on seasons

In [7]:
# Spring: training rows whose "Ob" lies between 1 March 2021 00:00:00 and
# 31 May 2021 23:59:59, both bounds inclusive.
springStart = datetime.datetime(year=2021, month=3, day=1)
springEnd = datetime.datetime(year=2021, month=5, day=31, hour=23, minute=59, second=59)
inSpring = trainData["Ob"].between(springStart, springEnd)
springDf = trainData[inSpring]

In [8]:
# First half of summer: training rows whose "Ob" lies between
# 1 June 2021 00:00:00 and 15 July 2021 23:59:59, both bounds inclusive.
summer1Start = datetime.datetime(year=2021, month=6, day=1)
summer1End = datetime.datetime(year=2021, month=7, day=15, hour=23, minute=59, second=59)
inSummer1 = trainData["Ob"].between(summer1Start, summer1End)
summer1Df = trainData[inSummer1]
# Show (rows, columns) of the slice as the cell output.
summer1Df.shape

(894420, 67)

In [9]:
# Second half of summer: training rows whose "Ob" lies between
# 16 July 2021 00:00:00 and 31 August 2021 23:59:59, both bounds inclusive.
summer2Start = datetime.datetime(year=2021, month=7, day=16)
summer2End = datetime.datetime(year=2021, month=8, day=31, hour=23, minute=59, second=59)
inSummer2 = trainData["Ob"].between(summer2Start, summer2End)
summer2Df = trainData[inSummer2]
# Show (rows, columns) of the slice as the cell output.
summer2Df.shape

(1025717, 67)

In [10]:
# First half of fall: training rows whose "Ob" lies between
# 1 September 2021 00:00:00 and 15 October 2021 23:59:59, both bounds inclusive.
fall1Start = datetime.datetime(year=2021, month=9, day=1)
fall1End = datetime.datetime(year=2021, month=10, day=15, hour=23, minute=59, second=59)
inFall1 = trainData["Ob"].between(fall1Start, fall1End)
fall1Df = trainData[inFall1]
# Show (rows, columns) of the slice as the cell output.
fall1Df.shape

(611694, 67)

In [11]:
# Second half of fall: training rows whose "Ob" lies between
# 16 October 2021 00:00:00 and 30 November 2021 23:59:59, both bounds inclusive.
fall2Start = datetime.datetime(year=2021, month=10, day=16)
fall2End = datetime.datetime(year=2021, month=11, day=30, hour=23, minute=59, second=59)
inFall2 = trainData["Ob"].between(fall2Start, fall2End)
fall2Df = trainData[inFall2]
# Show (rows, columns) of the slice as the cell output.
fall2Df.shape

(678783, 67)

In [12]:
# Winter, December part: training rows whose "Ob" lies between
# 1 December 2021 00:00:00 and 31 December 2021 23:59:59, both bounds inclusive.
winterStart = datetime.datetime(year=2021, month=12, day=1)
winterEnd = datetime.datetime(year=2021, month=12, day=31, hour=23, minute=59, second=59)
inDecember = trainData["Ob"].between(winterStart, winterEnd)
winterDf = trainData[inDecember]

In [13]:
# Winter, January-February part: training rows whose "Ob" lies between
# 1 January 2021 00:00:00 and 28 February 2021 23:59:59, both bounds inclusive.
winter2Start = datetime.datetime(2021, 1, 1)
winter2End = datetime.datetime(2021, 2, 28, 23, 59, 59)
winter2Df = trainData[trainData["Ob"].between(winter2Start, winter2End)]

# Build the full winter frame as December rows followed by January-February
# rows, with a fresh 0..n-1 index. The December slice is rebuilt from
# winterStart/winterEnd rather than taken from the current winterDf: the old
# `winterDf = pd.concat([winterDf, winter2Df])` appended the January-February
# rows again every time this cell was re-run, duplicating them in winter.csv.
decemberDf = trainData[trainData["Ob"].between(winterStart, winterEnd)]
winterDf = pd.concat([decemberDf, winter2Df], ignore_index=True)

# Save data frames as csv files

In [14]:
# Write every seasonal training slice and the test set to its own CSV file,
# without the index column. Files are written in the order listed.
# NOTE(review): trainData["Ob"] was converted to datetime before slicing while
# testData["Ob"] was not, so the seasonal files and the test file may serialise
# "Ob" in different text formats - confirm downstream readers expect that.
csvTargets = [
    (springDf, "C:/Users/athar/Downloads/spring.csv"),
    (winterDf, "C:/Users/athar/Downloads/winter.csv"),
    (summer1Df, "C:/Users/athar/Downloads/summer1.csv"),
    (summer2Df, "C:/Users/athar/Downloads/summer2.csv"),
    (fall1Df, "C:/Users/athar/Downloads/fall1.csv"),
    (fall2Df, "C:/Users/athar/Downloads/fall2.csv"),
    (testData, "C:/Users/athar/Downloads/testNormEncoded.csv"),
]
for frame, destination in csvTargets:
    frame.to_csv(destination, index=False)