# UKB feature set with missing threshold at 40%

In [1]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

## 10 year time window

In [18]:
# Build the 10-year-window modelling data: load the HES flags and the UKB working
# dataset, merge them on eid, drop non-numeric and sparse columns, split into
# train/test, impute missing values from the training set, then oversample with SMOTE.
data_path = "/rds/general/project/hda_students_data/live/Group9/General/david/Data"
work_dir = '/rds/general/project/hda_students_data/live/Group9/General/david/'
hes_path = os.path.join(work_dir, "Data/hes_10yr_A00Z99_bin.csv")
ukb_path = os.path.join(work_dir, "Data/working_dataset_notext.csv")

# Read only the header row first so the columns to load can be chosen up front.
df10Colnames = pd.read_csv(hes_path, nrows=0).columns
# Load column 0, column 3 and everything from column 5 onwards; columns 1, 2 and 4 are skipped.
# NOTE(review): presumably column 0 is eid (the merge key) and column 3 is casecont
# (the label) - confirm against the CSV header.
loadcols = df10Colnames[0:1].append(df10Colnames[5::]).append(df10Colnames[3:4])
booleans = {name: 'float32' for name in df10Colnames[5::]}
df10 = pd.read_csv(hes_path, usecols=loadcols, header=0, dtype=booleans)

ukbcols = pd.read_csv(ukb_path, nrows=0).columns
# Skip the first column and the last two (original note: index and melanoma incidence date).
loadcolsukb = ukbcols[1:-2]
dfuk = pd.read_csv(ukb_path, usecols=loadcolsukb, header=0)

# Merge HES onto the working dataset; a left join keeps every UKB row.
df_full = pd.merge(dfuk, df10, on='eid', how='left')
rows_preserved = len(df_full) == len(dfuk)
print(rows_preserved)  # make sure no extra rows added
# A left merge only grows when the right-hand table repeats a key, so stop here
# rather than silently carrying duplicated participants into the split.
assert rows_preserved, "left merge added rows: duplicate eid values in the HES table"

# eid is only needed as the merge key; it must not reach the feature matrix.
df_full = df_full.drop(columns=['eid'])

# Drop object-typed columns, then keep a column only if at least 40% of its rows
# are non-missing (i.e. drop columns with more than 60% missing values).
# NOTE(review): an earlier comment here said ">= 80% missing" and the notebook title
# says "missing threshold at 40%"; if the intent is "drop columns with more than 40%
# missing" the factor below should be .60 - confirm.
df_full = df_full.select_dtypes(exclude=['object'])
limitPer = int(np.ceil(len(df_full) * .40))  # dropna documents thresh as an int
df_full = df_full.dropna(thresh=limitPer, axis=1)

# train_test_split: separate the label before splitting so it can never leak into X.
X = df_full.drop(columns="casecont")
Y = df_full["casecont"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=289)
# NOTE(review): rows whose label is missing are counted as controls (0) - confirm intended.
y_train = y_train.fillna(value=0)
y_test = y_test.fillna(value=0)

print("Initial test_train_split 0.25 and info about train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))

# Mean imputation: the fill values are learned from the training set only and then
# applied to the test set, so no test-set information leaks into them.
# NOTE(review): an earlier comment called this "mode imputation", but strategy='mean'
# is what runs. If the HES columns are 0/1 flags (as the `booleans` name suggests) the
# mean is fractional; use strategy='most_frequent' if mode was intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

# SMOTE instead of duplicating: synthesise minority samples in the training set only.
sm = SMOTE(sampling_strategy=0.85, random_state=777)
x_train, y_train = sm.fit_resample(x_train, y_train)

print("Performing SMOTE on train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))
print("x_train dims: ", x_train.shape)
print("x_test dims : ", x_test.shape, "\n")

  interactivity=interactivity, compiler=compiler, result=result)


True
Initial test_train_split 0.25 and info about test_set:
Number and prop(%) of cases   :  3838 , % = 0.323
Number and prop(%) of controls:  8030 , % = 0.677
Performing SMOTE on test_set:
Number and prop(%) of cases   :  6825 , % = 0.459
Number and prop(%) of controls:  8030 , % = 0.541
x_train dims:  (14855, 1575)
x_test dims :  (3957, 1575) 



In [19]:
# Wrap the split data in DataFrames for export.
# x_train / x_test are bare arrays after imputation and SMOTE, so without explicit
# column labels the saved CSV headers would just be 0..N-1. The labels are every
# df_full column except the target; pandas raises if the column count does not match.
feature_names = df_full.columns.drop("casecont")
x_train_df = pd.DataFrame(x_train, columns=feature_names)
y_train_df = pd.DataFrame(y_train)
x_test_df = pd.DataFrame(x_test, columns=feature_names)
# NOTE(review): y_test keeps the row labels it had in df_full while x_test_df gets a
# fresh 0..n-1 index, so the two test frames line up by position, not by index label.
y_test_df = pd.DataFrame(y_test)

In [20]:
# Make the UKb40 output folder the working directory; relative paths used after
# this cell resolve against it.
os.chdir(
    "/rds/general/project/hda_students_data/live/Group9/General/Data/Final_final_final_datasets/UKb40"
)

In [21]:
# Write the four 10-year frames to the current working directory.
# to_csv is called with defaults, so each frame's index becomes the first CSV column.
exports = (
    (x_train_df, "X_train_10yr_UKb40_A00toZ99_bin.csv"),
    (y_train_df, "Y_train_10yr_UKb40_A00toZ99_bin.csv"),
    (x_test_df, "X_test_10yr_UKb40_A00toZ99_bin.csv"),
    (y_test_df, "Y_test_10yr_UKb40_A00toZ99_bin.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename)

## 5 year time window

In [24]:
# Build the 5-year-window modelling data: load the HES flags and the UKB working
# dataset, merge them on eid, drop non-numeric and sparse columns, split into
# train/test, impute missing values from the training set, then oversample with SMOTE.
# The df10* names are carried over from the 10-year cell; here they hold 5-year data.
data_path = "/rds/general/project/hda_students_data/live/Group9/General/david/Data"
work_dir = '/rds/general/project/hda_students_data/live/Group9/General/david/'
hes_path = os.path.join(work_dir, "../Data/1_3_5_10_hes/hes_5yr_A00Z99_bin.csv")
ukb_path = os.path.join(work_dir, "Data/working_dataset_notext.csv")

# Read only the header row first so the columns to load can be chosen up front.
df10Colnames = pd.read_csv(hes_path, nrows=0).columns
# Load column 0, column 3 and everything from column 5 onwards; columns 1, 2 and 4 are skipped.
# NOTE(review): presumably column 0 is eid (the merge key) and column 3 is casecont
# (the label) - confirm against the CSV header.
loadcols = df10Colnames[0:1].append(df10Colnames[5::]).append(df10Colnames[3:4])
booleans = {name: 'float32' for name in df10Colnames[5::]}
df10 = pd.read_csv(hes_path, usecols=loadcols, header=0, dtype=booleans)

ukbcols = pd.read_csv(ukb_path, nrows=0).columns
# Skip the first column and the last two (original note: index and melanoma incidence date).
loadcolsukb = ukbcols[1:-2]
dfuk = pd.read_csv(ukb_path, usecols=loadcolsukb, header=0)

# Merge HES onto the working dataset; a left join keeps every UKB row.
df_full = pd.merge(dfuk, df10, on='eid', how='left')
rows_preserved = len(df_full) == len(dfuk)
print(rows_preserved)  # make sure no extra rows added
# A left merge only grows when the right-hand table repeats a key, so stop here
# rather than silently carrying duplicated participants into the split.
assert rows_preserved, "left merge added rows: duplicate eid values in the HES table"

# eid is only needed as the merge key; it must not reach the feature matrix.
df_full = df_full.drop(columns=['eid'])

# Drop object-typed columns, then keep a column only if at least 40% of its rows
# are non-missing (i.e. drop columns with more than 60% missing values).
# NOTE(review): an earlier comment here said ">= 80% missing" and the notebook title
# says "missing threshold at 40%"; if the intent is "drop columns with more than 40%
# missing" the factor below should be .60 - confirm.
df_full = df_full.select_dtypes(exclude=['object'])
limitPer = int(np.ceil(len(df_full) * .40))  # dropna documents thresh as an int
df_full = df_full.dropna(thresh=limitPer, axis=1)

# train_test_split: separate the label before splitting so it can never leak into X.
X = df_full.drop(columns="casecont")
Y = df_full["casecont"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=289)
# NOTE(review): rows whose label is missing are counted as controls (0) - confirm intended.
y_train = y_train.fillna(value=0)
y_test = y_test.fillna(value=0)

print("Initial test_train_split 0.25 and info about train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))

# Mean imputation: the fill values are learned from the training set only and then
# applied to the test set, so no test-set information leaks into them.
# NOTE(review): an earlier comment called this "mode imputation", but strategy='mean'
# is what runs. If the HES columns are 0/1 flags (as the `booleans` name suggests) the
# mean is fractional; use strategy='most_frequent' if mode was intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

# SMOTE instead of duplicating: synthesise minority samples in the training set only.
sm = SMOTE(sampling_strategy=0.85, random_state=777)
x_train, y_train = sm.fit_resample(x_train, y_train)

print("Performing SMOTE on train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))
print("x_train dims: ", x_train.shape)
print("x_test dims : ", x_test.shape, "\n")

  interactivity=interactivity, compiler=compiler, result=result)


True
Initial test_train_split 0.25 and info about test_set:
Number and prop(%) of cases   :  3838 , % = 0.323
Number and prop(%) of controls:  8030 , % = 0.677
Performing SMOTE on test_set:
Number and prop(%) of cases   :  6825 , % = 0.459
Number and prop(%) of controls:  8030 , % = 0.541
x_train dims:  (14855, 1461)
x_test dims :  (3957, 1461) 



In [25]:
# Wrap the split data in DataFrames for export.
# x_train / x_test are bare arrays after imputation and SMOTE, so without explicit
# column labels the saved CSV headers would just be 0..N-1. The labels are every
# df_full column except the target; pandas raises if the column count does not match.
feature_names = df_full.columns.drop("casecont")
x_train_df = pd.DataFrame(x_train, columns=feature_names)
y_train_df = pd.DataFrame(y_train)
x_test_df = pd.DataFrame(x_test, columns=feature_names)
# NOTE(review): y_test keeps the row labels it had in df_full while x_test_df gets a
# fresh 0..n-1 index, so the two test frames line up by position, not by index label.
y_test_df = pd.DataFrame(y_test)

In [26]:
# Make the UKb40 output folder the working directory; relative paths used after
# this cell resolve against it.
os.chdir(
    "/rds/general/project/hda_students_data/live/Group9/General/Data/Final_final_final_datasets/UKb40"
)

In [27]:
# Write the four 5-year frames to the current working directory.
# to_csv is called with defaults, so each frame's index becomes the first CSV column.
exports = (
    (x_train_df, "X_train_5yr_UKb40_A00toZ99_bin.csv"),
    (y_train_df, "Y_train_5yr_UKb40_A00toZ99_bin.csv"),
    (x_test_df, "X_test_5yr_UKb40_A00toZ99_bin.csv"),
    (y_test_df, "Y_test_5yr_UKb40_A00toZ99_bin.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename)

## 3 year time window

In [28]:
# Build the 3-year-window modelling data: load the HES flags and the UKB working
# dataset, merge them on eid, drop non-numeric and sparse columns, split into
# train/test, impute missing values from the training set, then oversample with SMOTE.
# The df10* names are carried over from the 10-year cell; here they hold 3-year data.
data_path = "/rds/general/project/hda_students_data/live/Group9/General/david/Data"
work_dir = '/rds/general/project/hda_students_data/live/Group9/General/david/'
hes_path = os.path.join(work_dir, "../Data/1_3_5_10_hes/hes_3yr_A00Z99_bin.csv")
ukb_path = os.path.join(work_dir, "Data/working_dataset_notext.csv")

# Read only the header row first so the columns to load can be chosen up front.
df10Colnames = pd.read_csv(hes_path, nrows=0).columns
# Load column 0, column 3 and everything from column 5 onwards; columns 1, 2 and 4 are skipped.
# NOTE(review): presumably column 0 is eid (the merge key) and column 3 is casecont
# (the label) - confirm against the CSV header.
loadcols = df10Colnames[0:1].append(df10Colnames[5::]).append(df10Colnames[3:4])
booleans = {name: 'float32' for name in df10Colnames[5::]}
df10 = pd.read_csv(hes_path, usecols=loadcols, header=0, dtype=booleans)

ukbcols = pd.read_csv(ukb_path, nrows=0).columns
# Skip the first column and the last two (original note: index and melanoma incidence date).
loadcolsukb = ukbcols[1:-2]
dfuk = pd.read_csv(ukb_path, usecols=loadcolsukb, header=0)

# Merge HES onto the working dataset; a left join keeps every UKB row.
df_full = pd.merge(dfuk, df10, on='eid', how='left')
rows_preserved = len(df_full) == len(dfuk)
print(rows_preserved)  # make sure no extra rows added
# A left merge only grows when the right-hand table repeats a key, so stop here
# rather than silently carrying duplicated participants into the split.
assert rows_preserved, "left merge added rows: duplicate eid values in the HES table"

# eid is only needed as the merge key; it must not reach the feature matrix.
df_full = df_full.drop(columns=['eid'])

# Drop object-typed columns, then keep a column only if at least 40% of its rows
# are non-missing (i.e. drop columns with more than 60% missing values).
# NOTE(review): an earlier comment here said ">= 80% missing" and the notebook title
# says "missing threshold at 40%"; if the intent is "drop columns with more than 40%
# missing" the factor below should be .60 - confirm.
df_full = df_full.select_dtypes(exclude=['object'])
limitPer = int(np.ceil(len(df_full) * .40))  # dropna documents thresh as an int
df_full = df_full.dropna(thresh=limitPer, axis=1)

# train_test_split: separate the label before splitting so it can never leak into X.
X = df_full.drop(columns="casecont")
Y = df_full["casecont"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=289)
# NOTE(review): rows whose label is missing are counted as controls (0) - confirm intended.
y_train = y_train.fillna(value=0)
y_test = y_test.fillna(value=0)

print("Initial test_train_split 0.25 and info about train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))

# Mean imputation: the fill values are learned from the training set only and then
# applied to the test set, so no test-set information leaks into them.
# NOTE(review): an earlier comment called this "mode imputation", but strategy='mean'
# is what runs. If the HES columns are 0/1 flags (as the `booleans` name suggests) the
# mean is fractional; use strategy='most_frequent' if mode was intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

# SMOTE instead of duplicating: synthesise minority samples in the training set only.
sm = SMOTE(sampling_strategy=0.85, random_state=777)
x_train, y_train = sm.fit_resample(x_train, y_train)

print("Performing SMOTE on train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))
print("x_train dims: ", x_train.shape)
print("x_test dims : ", x_test.shape, "\n")

  interactivity=interactivity, compiler=compiler, result=result)


True
Initial test_train_split 0.25 and info about test_set:
Number and prop(%) of cases   :  3838 , % = 0.323
Number and prop(%) of controls:  8030 , % = 0.677
Performing SMOTE on test_set:
Number and prop(%) of cases   :  6825 , % = 0.459
Number and prop(%) of controls:  8030 , % = 0.541
x_train dims:  (14855, 1379)
x_test dims :  (3957, 1379) 



In [29]:
# Wrap the split data in DataFrames for export.
# x_train / x_test are bare arrays after imputation and SMOTE, so without explicit
# column labels the saved CSV headers would just be 0..N-1. The labels are every
# df_full column except the target; pandas raises if the column count does not match.
feature_names = df_full.columns.drop("casecont")
x_train_df = pd.DataFrame(x_train, columns=feature_names)
y_train_df = pd.DataFrame(y_train)
x_test_df = pd.DataFrame(x_test, columns=feature_names)
# NOTE(review): y_test keeps the row labels it had in df_full while x_test_df gets a
# fresh 0..n-1 index, so the two test frames line up by position, not by index label.
y_test_df = pd.DataFrame(y_test)

In [30]:
# Make the UKb40 output folder the working directory; relative paths used after
# this cell resolve against it.
os.chdir(
    "/rds/general/project/hda_students_data/live/Group9/General/Data/Final_final_final_datasets/UKb40"
)

In [31]:
# Write the four 3-year frames to the current working directory.
# to_csv is called with defaults, so each frame's index becomes the first CSV column.
exports = (
    (x_train_df, "X_train_3yr_UKb40_A00toZ99_bin.csv"),
    (y_train_df, "Y_train_3yr_UKb40_A00toZ99_bin.csv"),
    (x_test_df, "X_test_3yr_UKb40_A00toZ99_bin.csv"),
    (y_test_df, "Y_test_3yr_UKb40_A00toZ99_bin.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename)

## 1 year time window

In [32]:
# Build the 1-year-window modelling data: load the HES flags and the UKB working
# dataset, merge them on eid, drop non-numeric and sparse columns, split into
# train/test, impute missing values from the training set, then oversample with SMOTE.
# The df10* names are carried over from the 10-year cell; here they hold 1-year data.
data_path = "/rds/general/project/hda_students_data/live/Group9/General/david/Data"
work_dir = '/rds/general/project/hda_students_data/live/Group9/General/david/'
hes_path = os.path.join(work_dir, "../Data/1_3_5_10_hes/hes_1yr_A00Z99_bin.csv")
ukb_path = os.path.join(work_dir, "Data/working_dataset_notext.csv")

# Read only the header row first so the columns to load can be chosen up front.
df10Colnames = pd.read_csv(hes_path, nrows=0).columns
# Load column 0, column 3 and everything from column 5 onwards; columns 1, 2 and 4 are skipped.
# NOTE(review): presumably column 0 is eid (the merge key) and column 3 is casecont
# (the label) - confirm against the CSV header.
loadcols = df10Colnames[0:1].append(df10Colnames[5::]).append(df10Colnames[3:4])
booleans = {name: 'float32' for name in df10Colnames[5::]}
df10 = pd.read_csv(hes_path, usecols=loadcols, header=0, dtype=booleans)

ukbcols = pd.read_csv(ukb_path, nrows=0).columns
# Skip the first column and the last two (original note: index and melanoma incidence date).
loadcolsukb = ukbcols[1:-2]
dfuk = pd.read_csv(ukb_path, usecols=loadcolsukb, header=0)

# Merge HES onto the working dataset; a left join keeps every UKB row.
df_full = pd.merge(dfuk, df10, on='eid', how='left')
rows_preserved = len(df_full) == len(dfuk)
print(rows_preserved)  # make sure no extra rows added
# A left merge only grows when the right-hand table repeats a key, so stop here
# rather than silently carrying duplicated participants into the split.
assert rows_preserved, "left merge added rows: duplicate eid values in the HES table"

# eid is only needed as the merge key; it must not reach the feature matrix.
df_full = df_full.drop(columns=['eid'])

# Drop object-typed columns, then keep a column only if at least 40% of its rows
# are non-missing (i.e. drop columns with more than 60% missing values).
# NOTE(review): an earlier comment here said ">= 80% missing" and the notebook title
# says "missing threshold at 40%"; if the intent is "drop columns with more than 40%
# missing" the factor below should be .60 - confirm.
df_full = df_full.select_dtypes(exclude=['object'])
limitPer = int(np.ceil(len(df_full) * .40))  # dropna documents thresh as an int
df_full = df_full.dropna(thresh=limitPer, axis=1)

# train_test_split: separate the label before splitting so it can never leak into X.
X = df_full.drop(columns="casecont")
Y = df_full["casecont"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=289)
# NOTE(review): rows whose label is missing are counted as controls (0) - confirm intended.
y_train = y_train.fillna(value=0)
y_test = y_test.fillna(value=0)

print("Initial test_train_split 0.25 and info about train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))

# Mean imputation: the fill values are learned from the training set only and then
# applied to the test set, so no test-set information leaks into them.
# NOTE(review): an earlier comment called this "mode imputation", but strategy='mean'
# is what runs. If the HES columns are 0/1 flags (as the `booleans` name suggests) the
# mean is fractional; use strategy='most_frequent' if mode was intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

# SMOTE instead of duplicating: synthesise minority samples in the training set only.
sm = SMOTE(sampling_strategy=0.85, random_state=777)
x_train, y_train = sm.fit_resample(x_train, y_train)

print("Performing SMOTE on train_set:")
print("Number and prop(%) of cases   : ", (y_train == 1).sum(),
            ", % =", round((y_train == 1).sum()/len(y_train), 3))
print("Number and prop(%) of controls: ", (y_train == 0).sum(),
            ", % =", round((y_train == 0).sum()/len(y_train), 3))
print("x_train dims: ", x_train.shape)
print("x_test dims : ", x_test.shape, "\n")

  interactivity=interactivity, compiler=compiler, result=result)


True
Initial test_train_split 0.25 and info about test_set:
Number and prop(%) of cases   :  3838 , % = 0.323
Number and prop(%) of controls:  8030 , % = 0.677
Performing SMOTE on test_set:
Number and prop(%) of cases   :  6825 , % = 0.459
Number and prop(%) of controls:  8030 , % = 0.541
x_train dims:  (14855, 1143)
x_test dims :  (3957, 1143) 



In [33]:
# Wrap the split data in DataFrames for export.
# x_train / x_test are bare arrays after imputation and SMOTE, so without explicit
# column labels the saved CSV headers would just be 0..N-1. The labels are every
# df_full column except the target; pandas raises if the column count does not match.
feature_names = df_full.columns.drop("casecont")
x_train_df = pd.DataFrame(x_train, columns=feature_names)
y_train_df = pd.DataFrame(y_train)
x_test_df = pd.DataFrame(x_test, columns=feature_names)
# NOTE(review): y_test keeps the row labels it had in df_full while x_test_df gets a
# fresh 0..n-1 index, so the two test frames line up by position, not by index label.
y_test_df = pd.DataFrame(y_test)

In [34]:
# Make the UKb40 output folder the working directory; relative paths used after
# this cell resolve against it.
os.chdir(
    "/rds/general/project/hda_students_data/live/Group9/General/Data/Final_final_final_datasets/UKb40"
)

In [35]:
# Write the four 1-year frames to the current working directory.
# to_csv is called with defaults, so each frame's index becomes the first CSV column.
exports = (
    (x_train_df, "X_train_1yr_UKb40_A00toZ99_bin.csv"),
    (y_train_df, "Y_train_1yr_UKb40_A00toZ99_bin.csv"),
    (x_test_df, "X_test_1yr_UKb40_A00toZ99_bin.csv"),
    (y_test_df, "Y_test_1yr_UKb40_A00toZ99_bin.csv"),
)
for frame, filename in exports:
    frame.to_csv(filename)