# EE 559 Project
Ronald Huang & Henry Chen

### Import Libraries

In [22]:
# Import packages
import math
import numpy as np
import pandas as pd
import datetime as datetime
#import seaborn as sns
from collections import Counter, deque


# For Computing Priors
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report,confusion_matrix, accuracy_score


# For Visualisation
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image


# For Model Selection
import warnings
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, KernelPCA
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
from sklearn.metrics import log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression


# For creating Tensorflow models
# import keras
# from keras.models import Sequential
# from keras.wrappers.scikit_learn import KerasClassifier
# from keras.layers import Dense, Input, Dropout, SimpleRNN, GRU, LSTM, Conv1D
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers
# from tensorflow.keras.models import Sequential
# from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt


# For plotting ROC and Precision Recall curves
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay


# For OS agnostic path handling
from os import path


### Data Import

In [23]:
# train = np.loadtxt('data/FLIR_groups1and2_train.csv', delimiter = ',', skiprows = 2)
# train = pd.read_csv('data/FLIR_groups1and2_train.csv')
# test = pd.read_csv('data/FLIR_groups1and2_test.csv')


data_path = 'data/FLIR_groups1and2_train.csv'

read_train = pd.read_csv(data_path, skiprows = 2)
train = read_train.iloc[:, 2:]



round_1 = train.iloc[:, :26]
round_2 = train.iloc[:, 28:55]
round_3 = train.iloc[:, 56:83]
round_4 = train.iloc[:, 84:111]


In [24]:
def find_empty(round):
    empty_rows = []
    
    for i, row in round.iterrows():
        if row.isnull().all():
            empty_rows.append(i)
            
    return empty_rows


In [11]:
round_1_empty_rows = find_empty(round_1)
round_2_empty_rows = find_empty(round_2)
round_3_empty_rows = find_empty(round_3)
round_4_empty_rows = find_empty(round_4)

print("Empty Rows in Row 1: ", round_1_empty_rows)
print("Empty Rows in Row 2: ", round_2_empty_rows)
print("Empty Rows in Row 3: ", round_3_empty_rows)
print("Empty Rows in Row 4: ", round_4_empty_rows)




Empty Rows in Row 1:  [17, 78, 132, 171, 236, 292, 413, 483, 593, 616, 662]
Empty Rows in Row 2:  [22, 106, 171, 292, 413, 479, 609, 626, 686]
Empty Rows in Row 3:  [22, 294, 386, 489]
Empty Rows in Row 4:  [20, 76, 107, 113, 128, 187, 221, 236, 252, 272, 294, 325, 359, 414, 469, 484, 485, 560, 573, 639, 695]


In [13]:
round_1_means = (round_1.mean()).values  #dtype is float64
round_2_means = (round_2.mean()).values
round_3_means = (round_3.mean()).values
round_4_means = (round_4.mean()).values

print(round_1_means)
print(round_2_means)
print(round_3_means)
print(round_4_means)


[ 0.94230329 35.48566524 35.49520744 34.75422031 34.86494993 35.5460515
 35.47298999 35.42725322 35.57602289 35.52307582 35.49018598 35.33283262
 35.55513591 35.1376824  35.06958512 35.67031474 35.64042918 34.37835479
 34.37904149 34.37323319 34.28218884 34.37639485 35.30287554 34.93545064
 35.77281831 35.23891273]
[ 0.94940086 35.56536377 35.56318117 34.87737518 34.97776034 35.61436519
 35.55590585 35.49155492 35.64512126 35.59132668 35.56472183 35.41392297
 35.62121255 35.21319544 35.15457917 35.73736091 35.70811698 34.54611983
 34.5321826  34.53081312 34.4496291  34.55269615 35.39077033 35.05797432
 35.94385164 35.69333809 35.72673324]
[ 0.94648725 35.58427762 35.59314448 34.9382153  35.04140227 35.63968839
 35.57073654 35.51511331 35.66936261 35.61243626 35.59015581 35.41324363
 35.64195467 35.23888102 35.15695467 35.76186969 35.73325779 34.56661473
 34.56022663 34.54790368 34.47818697 34.56185552 35.41555241 35.08413598
 35.95050992 35.69991501 35.73481586]
[ 0.94583697 35.6101451

In [19]:
def insert_mean(round_num, empty_rows, mean):
    rounds_final = round_num
    
    for i in empty_rows:
        rounds_final.loc[i] = mean

    return rounds_final
        
        

In [20]:
final_round_1 = insert_mean(round_1, round_1_empty_rows, round_1_means)
final_round_2 = insert_mean(round_2, round_2_empty_rows, round_2_means)
final_round_3 = insert_mean(round_3, round_3_empty_rows, round_3_means)
final_round_4 = insert_mean(round_4, round_4_empty_rows, round_4_means)

In [21]:
test_empty = find_empty(final_round_1)
print(test_empty)

[]


### Data Transformation and Exploration

#### Intial Data Exploration

##### Relation Between Attributes

##### Data PreProcessing 
* Removing Outliers
* Filling in missing data/balancing out the data
* Feature Importance
* Feature Extraction/Dimensionality Reduction
* Data Visualisation

In [None]:
# fill in missing data with means 
#calculate means and std of every feature for every round 
# normalize data

#pandas describe
#pandas fillna


# 1. compute stats for each round (mean and std)
# 2. if stats for each round are similar, fill in missing data with means
# 3. concatenate in round order
# 4. group by subject ID
# 4.5. feature engineering
# 5. shuffle wrt subject ID
# 6. split k-fold into train and val (90% 10%)



### Model Training and Validation

#### Trivial Solution

##### Mean Output Value

#### Baseline Models

##### Linear Regression (No Regularisation)

In [None]:
#HC

##### 1NN

In [None]:
#RH

#### In-Class Regression Models

##### Ridge Regression

In [None]:
#RH

##### K-Nearest Neighbours Regression

In [None]:
#HC

##### MSE Regression

In [None]:
#RH

##### Support Vector Regression (SVR)

In [None]:
#HC

#### Out-of-Class Models

##### Random Forest

In [None]:
#RH

##### Gradient Boosting

In [None]:
#HC

##### Long Short Term Memory Neural Network (LSTM)

In [None]:
#HC

### Results

### Final Predictions on Test Set