# EE 559 Project
Ronald Huang & Henry Chen

### Import Libraries

In [68]:
# Import packages
import math
import numpy as np
import pandas as pd
import datetime as datetime
#import seaborn as sns
from collections import Counter, deque


# For Computing Priors
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, classification_report,confusion_matrix, accuracy_score


# For Visualisation
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image


# For Model Selection
import warnings
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA, KernelPCA
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning
from sklearn.metrics import log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression


# For creating Tensorflow models
# import keras
# from keras.models import Sequential
# from keras.wrappers.scikit_learn import KerasClassifier
# from keras.layers import Dense, Input, Dropout, SimpleRNN, GRU, LSTM, Conv1D
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers
# from tensorflow.keras.models import Sequential
# from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt


# For plotting ROC and Precision Recall curves
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import PrecisionRecallDisplay


# For OS agnostic path handling
from os import path


### Data Import

In [69]:
# train = np.loadtxt('data/FLIR_groups1and2_train.csv', delimiter = ',', skiprows = 2)
# train = pd.read_csv('data/FLIR_groups1and2_train.csv')
# test = pd.read_csv('data/FLIR_groups1and2_test.csv')

#Read in data

data_path = 'data/FLIR_groups1and2_train.csv'

read_data = pd.read_csv(data_path, skiprows = 2)
training_data_set = read_data.iloc[:, 2:]

#Seperate by Rounds

round_1 = training_data_set.iloc[:, :27]
round_2 = training_data_set.iloc[:, 28:55]
round_3 = training_data_set.iloc[:, 56:83]
round_4 = training_data_set.iloc[:, 84:111]


In [70]:
training_data_set

Unnamed: 0,T_offset_1,Max1R13_1,Max1L13_1,aveAllR13_1,aveAllL13_1,T_RC_1,T_RC_Dry_1,T_RC_Wet_1,T_RC_Max_1,T_LC_1,...,T_OR_Max_4,Unnamed: 113,Gender,Age,Ethnicity,T_atm,Humidity,Distance,Unnamed: 120,aveOralM
0,0.58,34.98,35.36,34.44,34.85,34.91,34.91,34.60,34.98,35.31,...,36.39,,Male,41-50,White,24.0,28.0,0.80,,36.59
1,0.83,34.71,34.51,34.46,34.24,34.68,34.68,34.44,34.71,34.65,...,35.84,,Female,31-40,Black or African-American,24.0,26.0,0.80,,37.19
2,0.85,35.70,35.44,35.00,34.78,35.67,35.67,35.46,35.70,35.41,...,36.40,,Female,21-30,White,24.0,26.0,0.80,,37.34
3,0.90,35.17,35.50,34.25,35.00,35.14,35.14,35.08,35.17,35.50,...,35.08,,Female,21-30,Black or African-American,24.0,27.0,0.80,,37.09
4,1.08,35.33,35.55,34.31,35.14,35.50,35.30,35.50,35.52,35.53,...,36.64,,Male,18-20,White,24.0,27.0,0.80,,37.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
705,0.89,35.68,35.62,35.06,35.10,35.62,35.62,35.24,35.68,35.60,...,36.72,,Female,18-20,White,24.4,13.5,0.60,,36.89
706,0.96,35.58,35.58,35.27,35.28,35.56,35.56,35.47,35.58,35.58,...,36.74,,Female,21-25,Asian,24.4,14.7,0.63,,37.14
707,0.91,36.82,36.47,36.21,36.15,36.81,36.80,36.75,36.82,36.45,...,37.32,,Male,21-25,Multiracial,22.0,30.0,0.60,,37.79
708,1.10,36.98,36.96,36.50,36.29,36.94,36.94,36.85,36.98,36.98,...,37.35,,Male,18-20,White,22.0,30.0,0.60,,38.14


In [71]:
extra_features = training_data_set[['Gender', 'Age', 'Ethnicity', 'T_atm', 'Humidity', 'Distance', 'aveOralM']]

In [72]:
#Function to find empty rows 
def find_empty(round):
    empty_rows = []
    
    for i, row in round.iterrows():
        if row.isnull().all():
            empty_rows.append(i)
            
    return empty_rows


In [73]:
round_1_empty_rows = find_empty(round_1)
round_2_empty_rows = find_empty(round_2)
round_3_empty_rows = find_empty(round_3)
round_4_empty_rows = find_empty(round_4)

print("Empty Rows in Row 1: ", round_1_empty_rows)
print("Empty Rows in Row 2: ", round_2_empty_rows)
print("Empty Rows in Row 3: ", round_3_empty_rows)
print("Empty Rows in Row 4: ", round_4_empty_rows)


Empty Rows in Row 1:  [17, 78, 132, 171, 236, 292, 413, 483, 593, 616, 662]
Empty Rows in Row 2:  [22, 106, 171, 292, 413, 479, 609, 626, 686]
Empty Rows in Row 3:  [22, 294, 386, 489]
Empty Rows in Row 4:  [20, 76, 107, 113, 128, 187, 221, 236, 252, 272, 294, 325, 359, 414, 469, 484, 485, 560, 573, 639, 695]


In [74]:
#Compute the means of every feature

round_1_means = (round_1.mean()).values  #dtype is float64
round_2_means = (round_2.mean()).values
round_3_means = (round_3.mean()).values
round_4_means = (round_4.mean()).values

# print(round_1_means)
# print(round_2_means)
# print(round_3_means)
# print(round_4_means)


In [75]:
#Function to insert means into missing rows 

def insert_mean(round_num, empty_rows, mean):
    rounds_final = round_num
    
    for i in empty_rows:
        rounds_final.loc[i] = mean

    return rounds_final  
        

In [76]:
final_round_1 = insert_mean(round_1, round_1_empty_rows, round_1_means)
final_round_2 = insert_mean(round_2, round_2_empty_rows, round_2_means)
final_round_3 = insert_mean(round_3, round_3_empty_rows, round_3_means)
final_round_4 = insert_mean(round_4, round_4_empty_rows, round_4_means)

In [77]:
test_empty = find_empty(final_round_4)
print(test_empty)

[]


In [88]:
# final_round_1

# print(final_round_1.shape)
# print(final_round_2.shape)
# print(final_round_3.shape)
# print(final_round_4.shape)

(710, 27)
(710, 27)
(710, 27)
(710, 27)


In [86]:
print(final_round_1.iloc[0, :].size)

27


In [117]:
# Get final dataframe by calculating mean across 4 tables for each cell
# concatenated_df = pd.concat([final_round_1, final_round_2, final_round_3, final_round_4])
# final_df_values = concatenated_df.groupby(concatenated_df.index).mean()
# final_df_values = (final_round_1 + final_round_2 + final_round_3 + final_round_4) / 4

# final_df_values

#compute final dataset with means from the 4 rounds (NO extra features yet)

final_dataset_means = pd.DataFrame()

for i in range(final_round_1.iloc[0, :].size):
    avg_values = (final_round_1.iloc[:, i] + final_round_2.iloc[:, i] + final_round_3.iloc[:, i] + final_round_4.iloc[:, i]) / 4
    final_dataset_means = pd.concat([final_dataset, avg_values], axis = 1)

In [119]:
# final_dataset_means
# print(final_dataset.shape)


final_dataset = pd.concat([final_dataset_means, extra_features], axis=1)
# final_dataset
print(final_dataset)


# final_data_test = find_empty(final_dataset)
# print(final_data_test)

          0        0        0        0        0        0        0        0  \
0    0.7025  35.0300  35.3775  34.4000  34.9175  34.9850  34.9850  34.7625   
1    0.7800  34.5500  34.5200  33.9300  34.2250  34.7100  34.6325  34.6400   
2    0.8625  35.6525  35.5175  34.2775  34.8000  35.6850  35.6675  35.6150   
3    0.9300  35.2225  35.6125  34.3850  35.2475  35.2075  35.2000  35.1175   
4    0.8950  35.5450  35.6650  34.9100  35.3675  35.6025  35.4750  35.5700   
..      ...      ...      ...      ...      ...      ...      ...      ...   
705  0.9325  35.4800  35.5300  34.9000  34.9900  35.5650  35.5650  35.1350   
706  0.8550  35.6550  35.5325  35.1925  35.2075  35.6125  35.6000  35.4850   
707  0.9700  36.7325  36.4600  36.2225  36.1150  36.7175  36.7150  36.6400   
708  1.0725  36.9450  37.0675  36.3825  36.4825  36.9250  36.9200  36.8200   
709  1.0750  36.9825  35.6625  35.8875  35.3825  37.1000  36.9700  37.0350   

           0        0  ...        0        0        0  Gender  

In [50]:
final_dataset.dtypes

T_offset_1     float64
Max1R13_1      float64
Max1L13_1      float64
aveAllR13_1    float64
aveAllL13_1    float64
                ...   
Ethnicity       object
T_atm          float64
Humidity       float64
Distance       float64
aveOralM       float64
Length: 115, dtype: object

### Data Transformation and Exploration

#### Intial Data Exploration

##### Relation Between Attributes

##### Data PreProcessing 
* Removing Outliers
* Filling in missing data/balancing out the data
* Feature Importance
* Feature Extraction/Dimensionality Reduction
* Data Visualisation

In [12]:
# fill in missing data with means 
#calculate means and std of every feature for every round 
# normalize data

#pandas describe
#pandas fillna


# 1. compute stats for each round (mean and std)
# 2. if stats for each round are similar, fill in missing data with means
# 3. concatenate in round order
# 4. group by subject ID
# 4.5. feature engineering
# 5. shuffle wrt subject ID
# 6. split k-fold into train and val (90% 10%)



### Model Training and Validation

#### Trivial Solution

##### Mean Output Value

#### Baseline Models

##### Linear Regression (No Regularisation)

In [13]:
#HC

##### 1NN

In [14]:
#RH

#### In-Class Regression Models

##### Ridge Regression

In [15]:
#RH

##### K-Nearest Neighbours Regression

In [16]:
#HC

##### MSE Regression

In [17]:
#RH

##### Support Vector Regression (SVR)

In [18]:
#HC

#### Out-of-Class Models

##### Random Forest

In [19]:
#RH

##### Gradient Boosting

In [20]:
#HC

##### Long Short Term Memory Neural Network (LSTM)

In [21]:
#HC

### Results

### Final Predictions on Test Set