In [None]:
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
import copy
import zipfile
import statistics

In [None]:
# Open the archive holding all trial forms and list the CSV members inside.
# The archive object stays open because later cells read members from it.
archive_path = '/content/drive/MyDrive/ML Project/ALS trail data_all/2016_01_04_ALL_FORMS_CSV.zip'
zf = zipfile.ZipFile(archive_path)
namelist = zf.namelist()
namelist

['AdverseEvents.csv',
 'alsfrs.csv',
 'AlsHistory.csv',
 'ConMeds.csv',
 'DeathData.csv',
 'demographics.csv',
 'FamilyHistory.csv',
 'Fvc.csv',
 'Labs.csv',
 'Riluzole.csv',
 'Svc.csv',
 'Treatment.csv',
 'VitalSigns.csv']

**Symptoms and outcome measures (FVC, SVC)**
# FVC (forced vital capacity)
+ Forced vital capacity is the volume of air that can forcibly be blown out after full inspiration, measured in liters. 

+ Subject Liters
+ Subject Normal: the expected value for a non-ALS patient (control) matched by gender, age and height
+ Forced Vital Capacity Units: in liters
+ Forced Vital Capacity Delta: time of test from the start of the trial


In [None]:
# Read the FVC form straight out of the archive. The member handle is closed
# as soon as pandas has consumed it instead of being left open.
with zf.open('Fvc.csv') as fvc_file:
  DF_fvc = pd.read_csv(fvc_file)
DF_fvc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48856 entries, 0 to 48855
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   subject_id                   48856 non-null  int64  
 1   Subject_Liters_Trial_1       45572 non-null  float64
 2   pct_of_Normal_Trial_1        18746 non-null  float64
 3   Subject_Liters_Trial_2       16552 non-null  float64
 4   pct_of_Normal_Trial_2        9644 non-null   float64
 5   Subject_Liters_Trial_3       16425 non-null  float64
 6   pct_of_Normal_Trial_3        9567 non-null   float64
 7   Forced_Vital_Capacity_Delta  46308 non-null  float64
 8   Subject_Normal               17304 non-null  float64
 9   Forced_Vital_Capacity_Units  20082 non-null  object 
dtypes: float64(8), int64(1), object(1)
memory usage: 3.7+ MB


In [None]:
# Preview the first 20 rows of the raw FVC table.
DF_fvc.head(20)

Unnamed: 0,subject_id,Subject_Liters_Trial_1,pct_of_Normal_Trial_1,Subject_Liters_Trial_2,pct_of_Normal_Trial_2,Subject_Liters_Trial_3,pct_of_Normal_Trial_3,Forced_Vital_Capacity_Delta,Subject_Normal,Forced_Vital_Capacity_Units
0,89,2.89,,,,,,0.0,3.83,
1,329,2.49,97.0,2.57,,2.6,,0.0,,
2,329,2.71,101.0,2.62,,2.72,,16.0,,
3,329,2.75,103.0,2.62,,2.53,,42.0,,
4,329,2.79,104.0,2.62,,2.61,,72.0,,
5,329,2.24,103.0,2.44,,2.57,,135.0,,
6,329,2.52,95.0,2.34,,2.54,,189.0,,
7,329,2.4,89.0,2.2,,2.37,,247.0,,
8,329,2.45,92.0,2.39,,2.42,,274.0,,
9,406,3.69,,,,,,0.0,4.43,


In [None]:
# Build the FVC working table: rows with delta <= 92, reduced to subject id,
# trial-1 volume and delta, with incomplete rows removed.
# NOTE(review): 92 looks like a ~3-month observation window in days - confirm.
DF_sub = (
    DF_fvc.loc[DF_fvc['Forced_Vital_Capacity_Delta'] <= 92]
    .drop(columns=[
        'Forced_Vital_Capacity_Units',
        'Subject_Normal',
        'pct_of_Normal_Trial_1',
        'Subject_Liters_Trial_2',
        'pct_of_Normal_Trial_2',
        'Subject_Liters_Trial_3',
        'pct_of_Normal_Trial_3',
    ])
    .dropna(subset=['Subject_Liters_Trial_1', 'Forced_Vital_Capacity_Delta'])
    # Several rows for the same subject at the same delta: keep the last one.
    .drop_duplicates(subset=['subject_id', 'Forced_Vital_Capacity_Delta'], keep='last')
)
DF_sub

Unnamed: 0,subject_id,Subject_Liters_Trial_1,Forced_Vital_Capacity_Delta
0,89,2.89,0.0
1,329,2.49,0.0
2,329,2.71,16.0
3,329,2.75,42.0
4,329,2.79,72.0
...,...,...,...
48845,999880,1.78,70.0
48850,999990,2.43,0.0
48851,999990,2.70,12.0
48852,999990,2.47,40.0


In [None]:
def fvc_timeseries_preprocess(DF_sub):
  """Collapse each subject's FVC measurements into one row of summary features.

  Parameters
  ----------
  DF_sub : pandas.DataFrame
      Must contain the columns ``subject_id``, ``Subject_Liters_Trial_1``
      (the measured value) and ``Forced_Vital_Capacity_Delta`` (the time
      axis). Rows are consumed in the order given; nothing is sorted here,
      so the "First"/"Last" features refer to row order within a subject.

  Returns
  -------
  pandas.DataFrame
      One row per subject, in order of first appearance, holding
      ``subject_id`` plus 22 ``fvc_liters_*`` columns: statistics of the raw
      values (``Value*``, ``Delta*``) and the same statistics of the
      point-to-point slopes (``Slope*``). An empty input yields an empty
      frame.

  Raises
  ------
  ValueError
      If a subject has several rows but every consecutive pair shares the
      same delta, so that no slope can be computed.
  """
  data_set_name = 'fvc'
  content = 'liters'
  value_col = 'Subject_Liters_Trial_1'
  delta_col = 'Forced_Vital_Capacity_Delta'
  prefix = '{0}_{1}_'.format(data_set_name, content)

  # Rows are collected in a list and turned into a frame once at the end;
  # DataFrame.append no longer exists in pandas >= 2.0 and was quadratic.
  records = []
  # sort=False keeps subjects in order of first appearance.
  for subject_id, subject_rows in DF_sub.groupby('subject_id', sort=False):
    values = subject_rows[value_col].to_numpy(dtype=float)
    deltas = subject_rows[delta_col].to_numpy(dtype=float)

    if len(values) > 1:
      # Finite-difference slope between consecutive rows, located at the
      # midpoint of the two deltas. Pairs with identical deltas are skipped
      # to avoid dividing by zero.
      distinct = deltas[1:] != deltas[:-1]
      slope_values = np.diff(values)[distinct] / np.diff(deltas)[distinct]
      slope_deltas = ((deltas[1:] + deltas[:-1]) / 2)[distinct]
      if slope_values.size == 0:
        raise ValueError(
            'subject {0}: all measurements share the same {1}; '
            'cannot compute slopes'.format(subject_id, delta_col))
    else:
      # NOTE(review): with a single measurement the "slope" features are
      # filled with the raw value and delta themselves (existing behaviour,
      # kept so the produced feature table is unchanged) - confirm this is
      # what downstream code expects.
      slope_values, slope_deltas = values, deltas

    # Overall trend of the raw values between first and last row.
    if deltas[-1] == deltas[0]:
      value_trend = 0.0
    else:
      value_trend = (values[-1] - values[0]) / (deltas[-1] - deltas[0])

    # Overall trend of the slopes between first and last slope point.
    if len(slope_deltas) <= 1:
      slope_trend = 0.0
    else:
      slope_trend = ((slope_values[-1] - slope_values[0])
                     / (slope_deltas[-1] - slope_deltas[0]))

    records.append({
        'subject_id': subject_id,
        prefix + 'ValueMax': values.max(),
        prefix + 'ValueMin': values.min(),
        prefix + 'ValueLast': values[-1],
        prefix + 'ValueMean': np.mean(values),
        prefix + 'ValueNumber': len(values),
        # Lower-case "v" kept on purpose: it is the established column name.
        prefix + 'valueSum': values.sum(),
        prefix + 'DeltaFirst': deltas[0],
        prefix + 'DeltaLast': deltas[-1],
        prefix + 'ValueMeanSquare': np.square(values).mean(),
        prefix + 'ValueStd': values.std(),
        prefix + 'DeltaSlope': value_trend,
        prefix + 'SlopeMax': slope_values.max(),
        prefix + 'SlopeMin': slope_values.min(),
        prefix + 'SlopeLast': slope_values[-1],
        prefix + 'SlopeMean': np.mean(slope_values),
        prefix + 'SlopeNumber': len(slope_values),
        prefix + 'SlopeSum': slope_values.sum(),
        prefix + 'SlopeDeltaFirst': slope_deltas[0],
        prefix + 'SlopeDeltaLast': slope_deltas[-1],
        prefix + 'SlopeMeanSquare': np.square(slope_values).mean(),
        prefix + 'SlopeStd': slope_values.std(),
        prefix + 'SlopeDeltaSlope': slope_trend,
    })

  return pd.DataFrame(records)

In [None]:
# Build the per-subject FVC feature table from the filtered measurements
# and display it.
DF_total = fvc_timeseries_preprocess(DF_sub)
DF_total

Unnamed: 0,subject_id,fvc_liters_ValueMax,fvc_liters_ValueMin,fvc_liters_ValueLast,fvc_liters_ValueMean,fvc_liters_ValueNumber,fvc_liters_valueSum,fvc_liters_DeltaFirst,fvc_liters_DeltaLast,fvc_liters_ValueMeanSquare,fvc_liters_ValueStd,fvc_liters_DeltaSlope,fvc_liters_SlopeMax,fvc_liters_SlopeMin,fvc_liters_SlopeLast,fvc_liters_SlopeMean,fvc_liters_SlopeNumber,fvc_liters_SlopeSum,fvc_liters_SlopeDeltaFirst,fvc_liters_SlopeDeltaLast,fvc_liters_SlopeMeanSquare,fvc_liters_SlopeStd,fvc_liters_SlopeDeltaSlope
0,89,2.89,2.89,2.89,2.8900,1,2.89,0.0,0.0,8.352100,0.000000,0.000000,2.890000,2.890000,2.890000,2.890000,1,2.890000,0.0,0.0,8.352100,0.000000,0.000000
1,329,2.79,2.49,2.79,2.6850,4,10.74,0.0,72.0,7.222700,0.116082,0.004167,0.013750,0.001333,0.001333,0.005541,3,0.016622,8.0,57.0,0.000064,0.005806,-0.000253
2,406,3.69,3.69,3.69,3.6900,1,3.69,0.0,0.0,13.616100,0.000000,0.000000,3.690000,3.690000,3.690000,3.690000,1,3.690000,0.0,0.0,13.616100,0.000000,0.000000
3,411,2.44,2.44,2.44,2.4400,1,2.44,0.0,0.0,5.953600,0.000000,0.000000,2.440000,2.440000,2.440000,2.440000,1,2.440000,0.0,0.0,5.953600,0.000000,0.000000
4,533,1.92,1.76,1.76,1.8400,2,3.68,0.0,12.0,3.392000,0.080000,-0.013333,-0.013333,-0.013333,-0.013333,-0.013333,1,-0.013333,6.0,6.0,0.000178,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7352,999482,4.40,3.92,3.92,4.2450,4,16.98,0.0,80.0,18.057850,0.194487,-0.004375,0.008571,-0.015484,-0.015484,-0.002209,3,-0.006627,7.0,64.5,0.000104,0.009978,-0.000418
7353,999823,4.72,4.45,4.59,4.6060,5,23.03,0.0,91.0,21.223180,0.089129,-0.000659,0.007778,-0.010385,-0.001154,0.000476,4,0.001906,4.5,78.0,0.000050,0.007086,-0.000122
7354,999863,1.27,1.27,1.27,1.2700,1,1.27,0.0,0.0,1.612900,0.000000,0.000000,1.270000,1.270000,1.270000,1.270000,1,1.270000,0.0,0.0,1.612900,0.000000,0.000000
7355,999880,1.97,1.78,1.78,1.8650,4,7.46,0.0,70.0,3.482950,0.068739,-0.002714,0.000857,-0.009286,-0.004286,-0.004238,3,-0.012714,7.0,59.5,0.000035,0.004141,0.000095


In [None]:
# Save the FVC feature table. index=False is not passed, so the row index is
# written as well and shows up as an unnamed first column in the CSV.
DF_total.to_csv('/content/drive/MyDrive/ML Project/Cleaned data/fvc.csv')

# SVC (Slow Vital Capacity)
+ Slow vital capacity is the maximum volume of air that can be exhaled slowly after a slow, maximal inhalation.
+ SVC is typically greater than FVC

In [None]:
# Read the SVC form straight out of the archive. The member handle is closed
# as soon as pandas has consumed it instead of being left open.
with zf.open('Svc.csv') as svc_file:
  DF_svc = pd.read_csv(svc_file)
DF_svc

Unnamed: 0,subject_id,Subject_Liters_Trial_1,pct_of_Normal_Trial_1,Subject_Liters__Trial_2_,pct_of_Normal_Trial_2,Subject_Liters__Trial_3_,pct_of_Normal_Trial_3,Subject_Normal,Slow_vital_Capacity_Delta,Slow_Vital_Capacity_Units
0,89,3.29,,,,,,3.97,,
1,348,3.00,83.0,,,,,,0.0,
2,348,3.00,81.0,,,,,,4.0,
3,348,3.00,81.0,,,,,,61.0,
4,348,2.00,78.0,,,,,,120.0,
...,...,...,...,...,...,...,...,...,...,...
9520,998661,1.88,,,,,,3.40,,
9521,998661,1.62,,,,,,3.53,,
9522,999075,3.43,,,,,,4.93,,
9523,999670,2.85,,,,,,4.17,,


In [None]:
# Build the SVC working table: rows with delta <= 92, reduced to subject id,
# trial-1 volume and delta, with incomplete rows removed.
# NOTE(review): 92 looks like a ~3-month observation window in days - confirm.
DF_sub = (
    DF_svc.loc[DF_svc['Slow_vital_Capacity_Delta'] <= 92]
    .drop(columns=[
        'Slow_Vital_Capacity_Units',
        'Subject_Normal',
        'pct_of_Normal_Trial_1',
        'Subject_Liters__Trial_2_',
        'pct_of_Normal_Trial_2',
        'Subject_Liters__Trial_3_',
        'pct_of_Normal_Trial_3',
    ])
    .dropna(subset=['Subject_Liters_Trial_1', 'Slow_vital_Capacity_Delta'])
    # Several rows for the same subject at the same delta: keep the last one.
    .drop_duplicates(subset=['subject_id', 'Slow_vital_Capacity_Delta'], keep='last')
)
DF_sub

Unnamed: 0,subject_id,Subject_Liters_Trial_1,Slow_vital_Capacity_Delta
1,348,3.00,0.0
2,348,3.00,4.0
3,348,3.00,61.0
18,1064,3.00,0.0
19,1064,3.00,11.0
...,...,...,...
9504,996808,3.00,88.0
9514,997090,2.43,0.0
9515,997090,2.53,14.0
9516,997090,2.43,42.0


In [None]:
def svc_timeseries_preprocess(DF_sub):
  """Collapse each subject's SVC measurements into one row of summary features.

  Parameters
  ----------
  DF_sub : pandas.DataFrame
      Must contain the columns ``subject_id``, ``Subject_Liters_Trial_1``
      (the measured value) and ``Slow_vital_Capacity_Delta`` (the time
      axis). Rows are consumed in the order given; nothing is sorted here,
      so the "First"/"Last" features refer to row order within a subject.

  Returns
  -------
  pandas.DataFrame
      One row per subject, in order of first appearance, holding
      ``subject_id`` plus 22 ``svc_liters_*`` columns: statistics of the raw
      values (``Value*``, ``Delta*``) and the same statistics of the
      point-to-point slopes (``Slope*``). An empty input yields an empty
      frame.

  Raises
  ------
  ValueError
      If a subject has several rows but every consecutive pair shares the
      same delta, so that no slope can be computed.
  """
  data_set_name = 'svc'
  content = 'liters'
  value_col = 'Subject_Liters_Trial_1'
  delta_col = 'Slow_vital_Capacity_Delta'
  prefix = '{0}_{1}_'.format(data_set_name, content)

  # Rows are collected in a list and turned into a frame once at the end;
  # DataFrame.append no longer exists in pandas >= 2.0 and was quadratic.
  records = []
  # sort=False keeps subjects in order of first appearance.
  for subject_id, subject_rows in DF_sub.groupby('subject_id', sort=False):
    values = subject_rows[value_col].to_numpy(dtype=float)
    deltas = subject_rows[delta_col].to_numpy(dtype=float)

    if len(values) > 1:
      # Finite-difference slope between consecutive rows, located at the
      # midpoint of the two deltas. Pairs with identical deltas are skipped
      # to avoid dividing by zero.
      distinct = deltas[1:] != deltas[:-1]
      slope_values = np.diff(values)[distinct] / np.diff(deltas)[distinct]
      slope_deltas = ((deltas[1:] + deltas[:-1]) / 2)[distinct]
      if slope_values.size == 0:
        raise ValueError(
            'subject {0}: all measurements share the same {1}; '
            'cannot compute slopes'.format(subject_id, delta_col))
    else:
      # NOTE(review): with a single measurement the "slope" features are
      # filled with the raw value and delta themselves (existing behaviour,
      # kept so those columns are unchanged) - confirm this is what
      # downstream code expects.
      slope_values, slope_deltas = values, deltas

    # Overall trend of the raw values between first and last row.
    if deltas[-1] == deltas[0]:
      value_trend = 0.0
    else:
      value_trend = (values[-1] - values[0]) / (deltas[-1] - deltas[0])

    # Overall trend of the slopes between first and last slope point.
    if len(slope_deltas) <= 1:
      slope_trend = 0.0
    else:
      slope_trend = ((slope_values[-1] - slope_values[0])
                     / (slope_deltas[-1] - slope_deltas[0]))

    records.append({
        'subject_id': subject_id,
        prefix + 'ValueMax': values.max(),
        prefix + 'ValueMin': values.min(),
        prefix + 'ValueLast': values[-1],
        prefix + 'ValueMean': np.mean(values),
        prefix + 'ValueNumber': len(values),
        # Lower-case "v" kept on purpose: it is the established column name.
        prefix + 'valueSum': values.sum(),
        prefix + 'DeltaFirst': deltas[0],
        prefix + 'DeltaLast': deltas[-1],
        prefix + 'ValueMeanSquare': np.square(values).mean(),
        # Own key for the standard deviation: it used to reuse the
        # "ValueMean" key and silently replaced the mean.
        prefix + 'ValueStd': values.std(),
        prefix + 'DeltaSlope': value_trend,
        prefix + 'SlopeMax': slope_values.max(),
        prefix + 'SlopeMin': slope_values.min(),
        prefix + 'SlopeLast': slope_values[-1],
        prefix + 'SlopeMean': np.mean(slope_values),
        prefix + 'SlopeNumber': len(slope_values),
        prefix + 'SlopeSum': slope_values.sum(),
        prefix + 'SlopeDeltaFirst': slope_deltas[0],
        prefix + 'SlopeDeltaLast': slope_deltas[-1],
        prefix + 'SlopeMeanSquare': np.square(slope_values).mean(),
        # Same fix as ValueStd: previously stored under "SlopeMean".
        prefix + 'SlopeStd': slope_values.std(),
        prefix + 'SlopeDeltaSlope': slope_trend,
    })

  return pd.DataFrame(records)

In [None]:
# Build the per-subject SVC feature table from the filtered measurements
# and display it. This rebinds DF_total, replacing the FVC table built earlier.
DF_total = svc_timeseries_preprocess(DF_sub)
DF_total

348
1064
4877
5918
7799
9641
10956
11941
14840
17035
18470
19574
20626
22303
23939
24570
25028
28045
28386
29543
29710
30116
31375
31657
34152
34312
34957
38700
38943
40173
40657
40874
41875
41878
43715
43725
45160
45164
45781
48616
49938
50249
50342
51753
52187
52510
53301
54014
55570
56618
57877
59405
60104
60843
60883
61366
61505
64812
65627
68949
70175
70833
76092
76548
77643
79303
79845
82514
82665
84635
87638
88857
93998
94103
96891
98762
100981
102107
103471
103569
103844
104179
107935
108342
108830
109272
109864
110295
110416
110956
112610
112700
113250
114523
115339
116067
117340
118770
119554
120896
121268
121416
121870
125143
128776
130492
130620
134476
137890
138063
138119
139808
140126
141533
144179
146598
149252
152453
153138
154140
154153
154414
155441
155671
156455
156721
156980
157887
159018
162752
165883
170640
172526
172538
173311
174017
174667
176608
177566
177649
178164
178807
179679
181358
183177
183623
185175
185588
186849
188226
190553
192709
193443
193817
19442

Unnamed: 0,subject_id,svc_liters_ValueMax,svc_liters_ValueMin,svc_liters_ValueLast,svc_liters_ValueMean,svc_liters_ValueNumber,svc_liters_valueSum,svc_liters_DeltaFirst,svc_liters_DeltaLast,svc_liters_ValueMeanSquare,svc_liters_DeltaSlope,svc_liters_SlopeMax,svc_liters_SlopeMin,svc_liters_SlopeLast,svc_liters_SlopeMean,svc_liters_SlopeNumber,svc_liters_SlopeSum,svc_liters_SlopeDeltaFirst,svc_liters_SlopeDeltaLast,svc_liters_SlopeMeanSquare,svc_liters_SlopeDeltaSlope
0,348,3.00,3.00,3.00,0.000000,3,9.00,0.0,61.0,9.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.000000,2.0,32.5,0.000000,0.000000
1,1064,3.00,2.00,2.00,0.471405,3,8.00,0.0,69.0,7.333333,-0.014493,0.000000,-0.017241,-0.017241,0.008621,2,-0.017241,5.5,40.0,0.000149,-0.000500
2,4877,3.00,3.00,3.00,0.000000,3,9.00,0.0,71.0,9.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.000000,4.0,39.5,0.000000,0.000000
3,5918,5.00,5.00,5.00,0.000000,3,15.00,0.0,86.0,25.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.000000,11.5,54.5,0.000000,0.000000
4,7799,3.51,3.43,3.43,0.033993,3,10.43,0.0,50.0,12.088367,-0.001600,-0.000909,-0.002143,-0.002143,0.000617,2,-0.003052,11.0,36.0,0.000003,-0.000049
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,995893,2.00,2.00,2.00,0.000000,3,6.00,0.0,85.0,4.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.000000,10.5,53.0,0.000000,0.000000
691,996611,4.81,4.64,4.64,0.074087,3,14.12,0.0,49.0,22.158200,-0.000612,0.009333,-0.005000,-0.005000,0.007167,2,0.004333,7.5,32.0,0.000056,-0.000585
692,996711,3.00,3.00,3.00,0.000000,3,9.00,0.0,71.0,9.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.000000,4.0,39.5,0.000000,0.000000
693,996808,3.00,3.00,3.00,0.000000,3,9.00,0.0,88.0,9.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0.000000,13.5,57.5,0.000000,0.000000


In [None]:
# Save the SVC feature table. index=False is not passed, so the row index is
# written as well and shows up as an unnamed first column in the CSV.
DF_total.to_csv('/content/drive/MyDrive/ML Project/Cleaned data/svc.csv')