In [2]:
import pandas as pd
import numpy as np

In [3]:
df_users = pd.read_csv('users.csv')
df_taps = pd.read_csv('taps.csv')

df_users.head()


Unnamed: 0,ID,BirthYear,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,UPDRS,Impact,Levodopa,DA,MAOB,Other
0,0EA27ICBLF,1952.0,Female,True,True,2000.0,Left,Don't know,Severe,True,True,False,False
1,0JIWJUQ4ZN,1949.0,Male,False,False,,,Don't know,,False,False,False,False
2,0NPYYFR8TU,,Male,False,False,,,Don't know,,False,False,False,False
3,0OGFC1ENAR,1949.0,Male,False,False,,,Don't know,,False,False,False,False
4,0QAZFRHQHW,1930.0,Female,True,False,2011.0,,Don't know,Mild,True,False,False,False


In [4]:
df_taps.head()

Unnamed: 0,ID,Date,TimeStamp,Hand,HoldTime,Direction,LatencyTime,FlightTime
0,0EA27ICBLF,160722.0,18:41:04.336,L,101.6,LL,234.4,156.3
1,0EA27ICBLF,160722.0,18:42:14.070,L,85.9,LL,437.5,359.4
2,0EA27ICBLF,160722.0,18:42:14.273,L,78.1,LL,210.9,125.0
3,0EA27ICBLF,160722.0,18:42:14.617,L,62.5,LL,359.4,281.3
4,0EA27ICBLF,160722.0,18:42:15.586,S,125.0,LS,187.5,93.8


In [5]:
# Merge the two datasets by ID column
df_merged = pd.merge(df_users, df_taps, on='ID', how='inner')
df_merged.head()

Unnamed: 0,ID,BirthYear,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,UPDRS,Impact,Levodopa,DA,MAOB,Other,Date,TimeStamp,Hand,HoldTime,Direction,LatencyTime,FlightTime
0,0EA27ICBLF,1952.0,Female,True,True,2000.0,Left,Don't know,Severe,True,True,False,False,160722.0,18:41:04.336,L,101.6,LL,234.4,156.3
1,0EA27ICBLF,1952.0,Female,True,True,2000.0,Left,Don't know,Severe,True,True,False,False,160722.0,18:42:14.070,L,85.9,LL,437.5,359.4
2,0EA27ICBLF,1952.0,Female,True,True,2000.0,Left,Don't know,Severe,True,True,False,False,160722.0,18:42:14.273,L,78.1,LL,210.9,125.0
3,0EA27ICBLF,1952.0,Female,True,True,2000.0,Left,Don't know,Severe,True,True,False,False,160722.0,18:42:14.617,L,62.5,LL,359.4,281.3
4,0EA27ICBLF,1952.0,Female,True,True,2000.0,Left,Don't know,Severe,True,True,False,False,160722.0,18:42:15.586,S,125.0,LS,187.5,93.8


In [6]:
# Basic EDA on the merged dataset
print("Dataset Shape:", df_merged.shape)
print("\nDataset Info:")
print(df_merged.info())

print("\nBasic Statistics:")
print(df_merged.describe())

print("\nMissing Values:")
print(df_merged.isnull().sum())

print("\nUnique values in categorical columns:")
categorical_cols = ['Gender', 'Parkinsons', 'Tremors', 'Sided', 'UPDRS', 'Impact', 'Hand', 'Direction']
for col in categorical_cols:
    if col in df_merged.columns:
        print(f"{col}: {df_merged[col].nunique()} unique values")
        print(df_merged[col].value_counts())
        print("-" * 50)

Dataset Shape: (39543408, 20)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39543408 entries, 0 to 39543407
Data columns (total 20 columns):
 #   Column         Dtype  
---  ------         -----  
 0   ID             object 
 1   BirthYear      float64
 2   Gender         object 
 3   Parkinsons     object 
 4   Tremors        object 
 5   DiagnosisYear  float64
 6   Sided          object 
 7   UPDRS          object 
 8   Impact         object 
 9   Levodopa       object 
 10  DA             object 
 11  MAOB           object 
 12  Other          object 
 13  Date           float64
 14  TimeStamp      object 
 15  Hand           object 
 16  HoldTime       float64
 17  Direction      object 
 18  LatencyTime    float64
 19  FlightTime     float64
dtypes: float64(6), object(14)
memory usage: 5.9+ GB
None

Basic Statistics:
          BirthYear  DiagnosisYear          Date      HoldTime   LatencyTime  \
count  3.309358e+07   2.387726e+07  3.954341e+07  3.954341e+07  3.9

In [7]:
# Create a subset of the merged dataset for faster processing
# Let's take a random sample of 10,000 rows (you can adjust this number as needed)
df_subset = df_merged.sample(n=10000, random_state=42)

print(f"Original dataset shape: {df_merged.shape}")
print(f"Subset dataset shape: {df_subset.shape}")
print("\nSubset dataset info:")
print(df_subset.info())

Original dataset shape: (39543408, 20)
Subset dataset shape: (10000, 20)

Subset dataset info:
<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 26227429 to 29853145
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             10000 non-null  object 
 1   BirthYear      8341 non-null   float64
 2   Gender         9989 non-null   object 
 3   Parkinsons     9989 non-null   object 
 4   Tremors        9989 non-null   object 
 5   DiagnosisYear  6007 non-null   float64
 6   Sided          5990 non-null   object 
 7   UPDRS          9989 non-null   object 
 8   Impact         6787 non-null   object 
 9   Levodopa       9989 non-null   object 
 10  DA             9989 non-null   object 
 11  MAOB           9989 non-null   object 
 12  Other          9989 non-null   object 
 13  Date           10000 non-null  float64
 14  TimeStamp      10000 non-null  object 
 15  Hand           10000 non-null  object 

In [8]:
df_subset.head()

Unnamed: 0,ID,BirthYear,Gender,Parkinsons,Tremors,DiagnosisYear,Sided,UPDRS,Impact,Levodopa,DA,MAOB,Other,Date,TimeStamp,Hand,HoldTime,Direction,LatencyTime,FlightTime
26227429,QAH9IVALVC,1942.0,Female,True,True,2015.0,Right,Don't know,Medium,True,False,False,False,170508.0,20:18:25.281,L,46.9,LL,156.3,78.1
39103199,ZVGQ2EFVXC,1959.0,Male,True,True,2011.0,Right,Don't know,Medium,True,True,False,False,170430.0,22:44:30.805,S,187.5,LS,359.4,281.3
30864847,SM5XQVQ2F9,1967.0,Female,False,False,,,Don't know,,False,False,False,False,171206.0,17:07:08.176,R,195.3,RR,164.1,46.9
4956478,927ZLCPEJM,1947.0,Male,True,True,2013.0,Right,Don't know,Medium,False,True,True,False,170602.0,10:40:00.238,R,160.2,LR,273.4,191.4
14009115,GRPUBQLE4F,1941.0,Female,True,True,2000.0,Left,Don't know,Mild,True,False,False,True,170109.0,11:44:08.047,R,93.8,RR,218.8,117.2


In [9]:
df_subset.drop(columns=['UPDRS'], inplace=True)

In [10]:
df_subset["Hand"].value_counts() #Must be label encoded [0, 1, 2]
df_subset["Impact"].value_counts() #Must be label encoded [0, 1, 2]
df_subset["Direction"].value_counts() #Must be label encoded [0 to 8]
df_subset["Gender"].value_counts() #Must be one hot encoded [0, 1, 2] as few are null values
df_subset["Tremors"].value_counts() #Must be one hot encoded [0, 1, 2] as few are null values
df_subset["Parkinsons"].value_counts() #the entire row where this column is null must be dropped

Parkinsons
True     6825
False    3164
Name: count, dtype: int64

In [11]:
df_subset.isnull().sum()

ID                  0
BirthYear        1659
Gender             11
Parkinsons         11
Tremors            11
DiagnosisYear    3993
Sided            4010
Impact           3213
Levodopa           11
DA                 11
MAOB               11
Other              11
Date                0
TimeStamp           0
Hand                0
HoldTime            0
Direction           0
LatencyTime         0
FlightTime          0
dtype: int64

In [12]:
# Drop rows where Parkinsons is null (as planned)
df_subset = df_subset.dropna(subset=['Parkinsons'])

# Handle other null values based on their nature:

# 1. BirthYear - fill with median or drop (many nulls)
df_subset['BirthYear'].fillna(df_subset['BirthYear'].median(), inplace=True)

# 2. Gender - fill with mode or create 'Unknown' category
df_subset['Gender'].fillna('Unknown', inplace=True)

# 3. Tremors - should be consistent with Parkinsons diagnosis
# Fill based on Parkinsons status
df_subset.loc[df_subset['Tremors'].isnull() & (df_subset['Parkinsons'] == True), 'Tremors'] = 'Unknown'
df_subset.loc[df_subset['Tremors'].isnull() & (df_subset['Parkinsons'] == False), 'Tremors'] = False

# 4. DiagnosisYear, Sided, Impact - only relevant for Parkinsons patients
# Fill with 'N/A' or appropriate values for non-Parkinsons patients
df_subset.loc[df_subset['Parkinsons'] == False, ['DiagnosisYear', 'Sided', 'Impact']] = np.nan
df_subset['Sided'].fillna('N/A', inplace=True)
df_subset['Impact'].fillna('N/A', inplace=True)
df_subset['DiagnosisYear'].fillna('N/A', inplace=True)

# 5. Medication columns (Levodopa, DA, MAOB, Other) - fill with False for missing
medication_cols = ['Levodopa', 'DA', 'MAOB ', 'Other']
for col in medication_cols:
    if col in df_subset.columns:
        df_subset[col].fillna(False, inplace=True)
    else:
        print(f"Column '{col}' not found in dataset")

print(f"Dataset shape after handling null values: {df_subset.shape}")
print("Remaining null values:")
print(df_subset.isnull().sum())

Dataset shape after handling null values: (9989, 19)
Remaining null values:
ID               0
BirthYear        0
Gender           0
Parkinsons       0
Tremors          0
DiagnosisYear    0
Sided            0
Impact           0
Levodopa         0
DA               0
MAOB             0
Other            0
Date             0
TimeStamp        0
Hand             0
HoldTime         0
Direction        0
LatencyTime      0
FlightTime       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_subset['BirthYear'].fillna(df_subset['BirthYear'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_subset['Gender'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

In [13]:
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9989 entries, 26227429 to 29853145
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             9989 non-null   object 
 1   BirthYear      9989 non-null   float64
 2   Gender         9989 non-null   object 
 3   Parkinsons     9989 non-null   object 
 4   Tremors        9989 non-null   object 
 5   DiagnosisYear  9989 non-null   object 
 6   Sided          9989 non-null   object 
 7   Impact         9989 non-null   object 
 8   Levodopa       9989 non-null   object 
 9   DA             9989 non-null   object 
 10  MAOB           9989 non-null   object 
 11  Other          9989 non-null   object 
 12  Date           9989 non-null   float64
 13  TimeStamp      9989 non-null   object 
 14  Hand           9989 non-null   object 
 15  HoldTime       9989 non-null   float64
 16  Direction      9989 non-null   object 
 17  LatencyTime    9989 non-null   float64
 18  Fl

In [18]:
df_subset["Hand"]#Must be label encoded [0, 1, 2]
df_subset["Impact"]#Must be label encoded [0, 1, 2]
df_subset["Direction"]#Must be label encoded [0 to 8]
df_subset["Gender"]#Must be one hot encoded [0, 1, 2] as few are null values
df_subset["Tremors"]#Must be one hot encoded [0, 1, 2] as few are null values
df_subset[["Other", "Levodopa", "DA", "MAOB "]]#Must be label encoded [0, 1]
#all other "object" type columns must be labbel encoded

Unnamed: 0,Other,Levodopa,DA,MAOB
26227429,False,True,False,False
39103199,False,True,True,False
30864847,False,False,False,False
4956478,False,False,True,True
14009115,True,True,False,False
...,...,...,...,...
22383218,False,False,True,False
10012224,True,False,False,False
28303252,True,True,False,False
14106815,True,True,False,False


In [21]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

df_merged.drop(columns=['UPDRS'], inplace=True)

# Drop rows where Parkinsons is null (as planned)
df_merged = df_merged.dropna(subset=['Parkinsons'])

# Handle other null values based on their nature:

# 1. BirthYear - fill with median or drop (many nulls)
df_merged['BirthYear'].fillna(df_merged['BirthYear'].median(), inplace=True)

# 2. Gender - fill with mode or create 'Unknown' category
df_merged['Gender'].fillna('Unknown', inplace=True)

# 3. Tremors - should be consistent with Parkinsons diagnosis
# Fill based on Parkinsons status
df_merged.loc[df_merged['Tremors'].isnull() & (df_merged['Parkinsons'] == True), 'Tremors'] = 'Unknown'
df_merged.loc[df_merged['Tremors'].isnull() & (df_merged['Parkinsons'] == False), 'Tremors'] = False

# 4. DiagnosisYear, Sided, Impact - only relevant for Parkinsons patients
# Fill with 'N/A' or appropriate values for non-Parkinsons patients
df_merged.loc[df_merged['Parkinsons'] == False, ['DiagnosisYear', 'Sided', 'Impact']] = np.nan
df_merged['Sided'].fillna('N/A', inplace=True)
df_merged['Impact'].fillna('N/A', inplace=True)
df_merged['DiagnosisYear'].fillna('N/A', inplace=True)

# 5. Medication columns (Levodopa, DA, MAOB, Other) - fill with False for missing
medication_cols = ['Levodopa', 'DA', 'MAOB ', 'Other']
for col in medication_cols:
    if col in df_merged.columns:
        df_merged[col].fillna(False, inplace=True)
    else:
        print(f"Column '{col}' not found in dataset")

print(f"Dataset shape after handling null values: {df_merged.shape}")
print("Remaining null values:")
print(df_merged.isnull().sum())

# Initialize label encoders
le_hand = LabelEncoder()
le_impact = LabelEncoder()
le_direction = LabelEncoder()
le_parkinsons = LabelEncoder()
le_sided = LabelEncoder()
le_diagnosis_year = LabelEncoder()
le_id = LabelEncoder()
le_timestamp = LabelEncoder()

# Label encode categorical columns
df_merged['Hand_encoded'] = le_hand.fit_transform(df_merged['Hand'])
df_merged['Impact_encoded'] = le_impact.fit_transform(df_merged['Impact'])
df_merged['Direction_encoded'] = le_direction.fit_transform(df_merged['Direction'])
df_merged['Parkinsons_encoded'] = le_parkinsons.fit_transform(df_merged['Parkinsons'])
df_merged['Sided_encoded'] = le_sided.fit_transform(df_merged['Sided'])

# Label encode medication columns (convert boolean to 0/1)
df_merged['Levodopa_encoded'] = df_merged['Levodopa'].astype(int)
df_merged['DA_encoded'] = df_merged['DA'].astype(int)
df_merged['MAOB_encoded'] = df_merged['MAOB '].astype(int)  # Note the space in column name
df_merged['Other_encoded'] = df_merged['Other'].astype(int)

# One-hot encode Gender and Tremors
gender_dummies = pd.get_dummies(df_merged['Gender'], prefix='Gender')
tremors_dummies = pd.get_dummies(df_merged['Tremors'], prefix='Tremors')

# Combine encoded features with original numeric columns
df_encoded = pd.concat([
    df_merged[['BirthYear', 'Date', 'HoldTime', 'LatencyTime', 'FlightTime']],
    df_merged[['Hand_encoded', 'Impact_encoded', 'Direction_encoded', 'Parkinsons_encoded',
               'Sided_encoded', 'Levodopa_encoded', 'DA_encoded', 'MAOB_encoded', 'Other_encoded']],
    gender_dummies,
    tremors_dummies
], axis=1)

print("Encoded dataset shape:", df_encoded.shape)
print("\nEncoded dataset columns:")
print(df_encoded.columns.tolist())
df_encoded.head()



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged['BirthYear'].fillna(df_merged['BirthYear'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_merged['Gender'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate ob

Dataset shape after handling null values: (39503664, 19)
Remaining null values:
ID               0
BirthYear        0
Gender           0
Parkinsons       0
Tremors          0
DiagnosisYear    0
Sided            0
Impact           0
Levodopa         0
DA               0
MAOB             0
Other            0
Date             0
TimeStamp        0
Hand             0
HoldTime         0
Direction        0
LatencyTime      0
FlightTime       0
dtype: int64
Encoded dataset shape: (39503664, 18)

Encoded dataset columns:
['BirthYear', 'Date', 'HoldTime', 'LatencyTime', 'FlightTime', 'Hand_encoded', 'Impact_encoded', 'Direction_encoded', 'Parkinsons_encoded', 'Sided_encoded', 'Levodopa_encoded', 'DA_encoded', 'MAOB_encoded', 'Other_encoded', 'Gender_Female', 'Gender_Male', 'Tremors_False', 'Tremors_True']


Unnamed: 0,BirthYear,Date,HoldTime,LatencyTime,FlightTime,Hand_encoded,Impact_encoded,Direction_encoded,Parkinsons_encoded,Sided_encoded,Levodopa_encoded,DA_encoded,MAOB_encoded,Other_encoded,Gender_Female,Gender_Male,Tremors_False,Tremors_True
0,1952.0,160722.0,101.6,234.4,156.3,0,3,0,1,0,1,1,0,0,True,False,False,True
1,1952.0,160722.0,85.9,437.5,359.4,0,3,0,1,0,1,1,0,0,True,False,False,True
2,1952.0,160722.0,78.1,210.9,125.0,0,3,0,1,0,1,1,0,0,True,False,False,True
3,1952.0,160722.0,62.5,359.4,281.3,0,3,0,1,0,1,1,0,0,True,False,False,True
4,1952.0,160722.0,125.0,187.5,93.8,2,3,2,1,0,1,1,0,0,True,False,False,True


In [22]:
df_encoded.to_csv('park_encoded.csv', index=False)