In [1]:
import pandas as pd # Pandas DataFrame
import numpy as np # Numpy array manipulation
from sklearn.model_selection import KFold # k-fold cross validation
from sklearn.metrics import accuracy_score, f1_score # Evaluation metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer # Missing value imputation
from sklearn.decomposition import PCA #Principle Component Analysis

In [2]:
# Load data from CSV
training_data = pd.read_csv('train_feat.csv')

In [3]:
# Peek at the data
training_data.head()

Unnamed: 0,ID,Type,F0,F1,F2,F3,F4,F5,F6,F7,...,F178,F179,F180,F181,F182,F183,F184,F185,F186,F187
0,B00000,N,-5.0,8.0,3.0,0.0,0.0,3.0,0.077445,0.04176,...,0.153967,14.645908,2.897634,0.0,0.0,65.789474,0.0,73.619632,85.227273,0.92
1,B00001,N,-8.0,8.0,0.0,0.0,0.0,0.0,0.015513,0.017613,...,0.424268,9.350336,0.004927,0.0,0.0,58.365759,0.0,61.791967,64.102564,1.168
2,B00002,N,-8.0,8.0,0.0,0.0,0.0,0.0,0.011311,0.017416,...,0.188311,14.725601,2.380945,0.0,0.0,61.983471,0.0,62.860136,64.655172,0.984
3,B00003,~,25.0,1.0,26.0,0.0,0.0,7.0,0.517033,0.627826,...,0.117397,3.52568,-0.580148,29.0,0.0,66.225166,9.0,117.41683,197.368421,1.256
4,B00004,~,17.0,1.0,20.0,1.0,1.0,2.0,0.577912,0.803246,...,0.056104,4.242599,-1.011361,7.0,0.0,92.02454,6.0,124.223602,202.702703,1.096


In [4]:
# Make a duplicate of the training_data for use in 1st layer
training_data_l1 = training_data.copy()

In [5]:
# Change to 2 classes for 2 layer classification

# Other & Noisy together
training_data_l1.loc[training_data_l1['Type'] == 'O', 'Type'] = 0
training_data_l1.loc[training_data_l1['Type'] == 'N', 'Type'] = 0

# AF and noisy together
training_data_l1.loc[training_data_l1['Type'] == 'A', 'Type'] = 1
training_data_l1.loc[training_data_l1['Type'] == '~', 'Type'] = 1

In [6]:
# Peek at the data
training_data_l1.head()

Unnamed: 0,ID,Type,F0,F1,F2,F3,F4,F5,F6,F7,...,F178,F179,F180,F181,F182,F183,F184,F185,F186,F187
0,B00000,0,-5.0,8.0,3.0,0.0,0.0,3.0,0.077445,0.04176,...,0.153967,14.645908,2.897634,0.0,0.0,65.789474,0.0,73.619632,85.227273,0.92
1,B00001,0,-8.0,8.0,0.0,0.0,0.0,0.0,0.015513,0.017613,...,0.424268,9.350336,0.004927,0.0,0.0,58.365759,0.0,61.791967,64.102564,1.168
2,B00002,0,-8.0,8.0,0.0,0.0,0.0,0.0,0.011311,0.017416,...,0.188311,14.725601,2.380945,0.0,0.0,61.983471,0.0,62.860136,64.655172,0.984
3,B00003,1,25.0,1.0,26.0,0.0,0.0,7.0,0.517033,0.627826,...,0.117397,3.52568,-0.580148,29.0,0.0,66.225166,9.0,117.41683,197.368421,1.256
4,B00004,1,17.0,1.0,20.0,1.0,1.0,2.0,0.577912,0.803246,...,0.056104,4.242599,-1.011361,7.0,0.0,92.02454,6.0,124.223602,202.702703,1.096


In [7]:
# Split into labels (y) and input (X)

# Data from the 3rd feature column onwards are input
X = training_data_l1.values[:,2:]
# Classes/Labels are the type of AF
y = training_data_l1["Type"].values
# Label is a binary integer
y=y.astype('int')

In [8]:
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (13062, 188), y shape: (13062,)


In [9]:
# Deal with missing data

# Replace missing values with Nan
X[X == ''] = np.nan

# np.any(np.isnan(X))

# Replace Nan with median
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

In [10]:
pca = PCA(n_components=47)
pca.fit(X)
X = pca.transform(X)

In [11]:
# # Use scoring='f1_weighted’

# param_grid = {
#     'n_estimators': [100, 300, 500, 750, 800, 1200],
#     'criterion': ['gini', 'entropy']}

# search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='f1_weighted')
# search.fit(X, y)
# print("[INFO] grid search best parameters: {}".format(search.best_params_))

In [12]:
# random forest model creation
rfc = RandomForestClassifier(n_estimators=800, criterion='entropy')

# Train using KFold Cross Validation
kf = KFold(n_splits=5,shuffle=False)
kf.split(X)    
     
# Initialize the accuracy of the models to blank list. The accuracy of each model will be appended to this list
accuracy_model = []
f1_model = []
 
# Iterate over each train-test split
for train_index, test_index in kf.split(X):
    # Split train-test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model
    model = rfc.fit(X_train, y_train)
    # Append to accuracy_model the accuracy of the model
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True)*100)
    # Append to f1_model the f1 score of the model
    f1_model.append(f1_score(y_test, model.predict(X_test))*100)

In [13]:
print(accuracy_model)
print(f1_model)

[94.75698430922311, 93.64714887102947, 94.33384379785605, 94.21898928024503, 94.4104134762634]
[74.9542961608775, 63.436123348017624, 70.04048582995952, 69.49494949494948, 73.26007326007327]


In [14]:
predictions_from_first_layer = rfc.predict(X)
print(predictions_from_first_layer)

[0 0 0 ... 0 0 0]


In [15]:
# Check shape is correct
print(training_data.shape)
print(len(predictions_from_first_layer))

(13062, 190)
13062


In [16]:
# Split into two datasets for 2nd layer classification
training_data_l2_NaO = training_data.loc[(training_data['Type'] == 'N') | (training_data['Type'] == 'O')]
training_data_l2_AaN = training_data.loc[(training_data['Type'] == 'A') | (training_data['Type'] == '~')]

print(training_data_l2_NaO.shape)
print(training_data_l2_AaN.shape)
training_data.head()

(11578, 190)
(1484, 190)


Unnamed: 0,ID,Type,F0,F1,F2,F3,F4,F5,F6,F7,...,F178,F179,F180,F181,F182,F183,F184,F185,F186,F187
0,B00000,N,-5.0,8.0,3.0,0.0,0.0,3.0,0.077445,0.04176,...,0.153967,14.645908,2.897634,0.0,0.0,65.789474,0.0,73.619632,85.227273,0.92
1,B00001,N,-8.0,8.0,0.0,0.0,0.0,0.0,0.015513,0.017613,...,0.424268,9.350336,0.004927,0.0,0.0,58.365759,0.0,61.791967,64.102564,1.168
2,B00002,N,-8.0,8.0,0.0,0.0,0.0,0.0,0.011311,0.017416,...,0.188311,14.725601,2.380945,0.0,0.0,61.983471,0.0,62.860136,64.655172,0.984
3,B00003,~,25.0,1.0,26.0,0.0,0.0,7.0,0.517033,0.627826,...,0.117397,3.52568,-0.580148,29.0,0.0,66.225166,9.0,117.41683,197.368421,1.256
4,B00004,~,17.0,1.0,20.0,1.0,1.0,2.0,0.577912,0.803246,...,0.056104,4.242599,-1.011361,7.0,0.0,92.02454,6.0,124.223602,202.702703,1.096


In [17]:
# Binary class encoding
training_data_l2_NaO.loc[training_data_l2_NaO['Type'] == 'O', 'Type'] = 0
training_data_l2_NaO.loc[training_data_l2_NaO['Type'] == 'N', 'Type'] = 1
training_data_l2_NaO.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,ID,Type,F0,F1,F2,F3,F4,F5,F6,F7,...,F178,F179,F180,F181,F182,F183,F184,F185,F186,F187
0,B00000,1,-5.0,8.0,3.0,0.0,0.0,3.0,0.077445,0.04176,...,0.153967,14.645908,2.897634,0.0,0.0,65.789474,0.0,73.619632,85.227273,0.92
1,B00001,1,-8.0,8.0,0.0,0.0,0.0,0.0,0.015513,0.017613,...,0.424268,9.350336,0.004927,0.0,0.0,58.365759,0.0,61.791967,64.102564,1.168
2,B00002,1,-8.0,8.0,0.0,0.0,0.0,0.0,0.011311,0.017416,...,0.188311,14.725601,2.380945,0.0,0.0,61.983471,0.0,62.860136,64.655172,0.984
7,B00007,0,-4.0,11.0,7.0,0.0,1.0,4.0,0.174345,0.278268,...,0.113599,9.052018,1.816864,2.0,0.0,66.815145,3.0,79.681275,166.666667,0.992
8,B00008,0,-20.0,20.0,0.0,0.0,0.0,0.0,0.003003,0.004154,...,0.065263,10.29343,1.918406,1.0,0.0,69.444444,2.0,72.90401,202.702703,0.872


In [18]:
# Split into labels (y) and input (X)

# Data from the 3rd feature column onwards are input
X = training_data_l2_NaO.values[:,2:]
# Classes/Labels are the type of AF
y = training_data_l2_NaO["Type"].values
# Label is a binary integer
y=y.astype('int')

In [19]:
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (11578, 188), y shape: (11578,)


In [20]:
# Deal with missing data

# Replace missing values with Nan
X[X == ''] = np.nan

# np.any(np.isnan(X))

# Replace Nan with median
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

In [21]:
pca = PCA(n_components=47)
pca.fit(X)
X = pca.transform(X)

In [22]:
# random forest model creation
rfc_l2_NaO = RandomForestClassifier(n_estimators=800, criterion='entropy')

# Train using KFold Cross Validation
kf = KFold(n_splits=5,shuffle=False)
kf.split(X)    
     
# Initialize the accuracy of the models to blank list. The accuracy of each model will be appended to this list
accuracy_model = []
f1_model = []
 
# Iterate over each train-test split
for train_index, test_index in kf.split(X):
    # Split train-test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model
    model = rfc_l2_NaO.fit(X_train, y_train)
    # Append to accuracy_model the accuracy of the model
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True)*100)
    # Append to f1_model the f1 score of the model
    f1_model.append(f1_score(y_test, model.predict(X_test))*100)

In [23]:
print(accuracy_model)
print(f1_model)

[83.7651122625216, 83.98100172711571, 83.72193436960276, 83.41252699784017, 83.11015118790496]
[88.16120906801007, 88.49612403100775, 88.25179183546275, 88.0522713130056, 88.2617832482738]


In [24]:
# Binary class encoding
training_data_l2_AaN.loc[training_data_l2_AaN['Type'] == '~', 'Type'] = 0
training_data_l2_AaN.loc[training_data_l2_AaN['Type'] == 'A', 'Type'] = 1
training_data_l2_AaN.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,ID,Type,F0,F1,F2,F3,F4,F5,F6,F7,...,F178,F179,F180,F181,F182,F183,F184,F185,F186,F187
3,B00003,0,25.0,1.0,26.0,0.0,0.0,7.0,0.517033,0.627826,...,0.117397,3.52568,-0.580148,29.0,0.0,66.225166,9.0,117.41683,197.368421,1.256
4,B00004,0,17.0,1.0,20.0,1.0,1.0,2.0,0.577912,0.803246,...,0.056104,4.242599,-1.011361,7.0,0.0,92.02454,6.0,124.223602,202.702703,1.096
5,B00005,0,22.0,0.0,22.0,0.0,3.0,7.0,0.439205,0.620939,...,0.27335,6.135661,0.590243,14.0,0.0,47.84689,4.0,100.840336,197.368421,1.48
6,B00006,0,9.0,1.0,10.0,0.0,0.0,5.0,0.418228,0.591676,...,0.319184,7.349628,-0.104063,16.0,0.0,58.823529,3.0,114.394662,202.702703,1.48
9,B00009,0,16.0,0.0,16.0,0.0,0.0,3.0,0.272816,0.358785,...,0.362117,18.568379,-2.564082,5.0,0.0,57.251908,2.0,71.856287,202.702703,1.256


In [25]:
# Split into labels (y) and input (X)

# Data from the 3rd feature column onwards are input
X = training_data_l2_AaN.values[:,2:]
# Classes/Labels are the type of AF
y = training_data_l2_AaN["Type"].values
# Label is a binary integer
y=y.astype('int')

In [26]:
print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (1484, 188), y shape: (1484,)


In [27]:
# Deal with missing data

# Replace missing values with Nan
X[X == ''] = np.nan

# np.any(np.isnan(X))

# Replace Nan with median
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X = imputer.fit_transform(X)

In [28]:
pca = PCA(n_components=47)
pca.fit(X)
X = pca.transform(X)

In [29]:
# random forest model creation
rfc_l2_AaN = RandomForestClassifier(n_estimators=800, criterion='entropy')

# Train using KFold Cross Validation
kf = KFold(n_splits=5,shuffle=False)
kf.split(X)    
     
# Initialize the accuracy of the models to blank list. The accuracy of each model will be appended to this list
accuracy_model = []
f1_model = []
 
# Iterate over each train-test split
for train_index, test_index in kf.split(X):
    # Split train-test
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Train the model
    model = rfc_l2_AaN.fit(X_train, y_train)
    # Append to accuracy_model the accuracy of the model
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True)*100)
    # Append to f1_model the f1 score of the model
    f1_model.append(f1_score(y_test, model.predict(X_test))*100)

In [30]:
print(accuracy_model)
print(f1_model)

[92.92929292929293, 89.56228956228956, 90.57239057239057, 91.24579124579124, 92.56756756756756]
[95.56025369978859, 93.15673289183222, 94.46640316205534, 94.39655172413792, 95.3781512605042]


In [31]:
# Load in test data

# Run through layer 1 classifier (rfc)

# Run resulting class label 0s through to rfc_l2_NaO

#Run resulting class label 1s through to rfc_l2_AaN

In [38]:
# load test data
unprocessed_test_data = pd.read_csv('test_feat.csv')
# Peek at the data
unprocessed_test_data.head()

Unnamed: 0,ID,F0,F1,F2,F3,F4,F5,F6,F7,F8,...,F178,F179,F180,F181,F182,F183,F184,F185,F186,F187
0,C00000,-1.0,6.0,5.0,0.0,0.0,3.0,0.068267,0.107653,0.110862,...,0.092296,27.957417,4.718421,0.0,0.0,53.956835,0.0,55.970149,70.093458,1.152
1,C00001,1.0,4.0,5.0,0.0,0.0,5.0,0.098701,0.164816,0.168081,...,0.082007,27.345269,4.703704,0.0,0.0,54.054054,0.0,56.550424,79.787234,1.16
2,C00002,-6.0,10.0,2.0,-1.0,1.0,3.0,0.126967,0.106857,0.059632,...,0.247711,10.088347,2.053699,2.0,0.0,65.934066,0.0,72.859745,80.645161,0.936
3,C00003,-3.0,6.0,3.0,0.0,0.0,3.0,0.077065,0.050373,0.057065,...,0.447685,13.327618,-0.482786,0.0,0.0,58.823529,0.0,65.323898,72.115385,1.088
4,C00004,-2.0,3.0,1.0,0.0,0.0,1.0,0.071738,0.055568,0.069081,...,0.155647,16.22495,3.086384,0.0,0.0,58.479532,0.0,63.124671,69.444444,1.088


In [33]:
# # Drop rows with missing values
# print(type(unprocessed_test_data))
# print(unprocessed_test_data.shape)
# processed_test_data = unprocessed_test_data.dropna()
# print(processed_test_data.shape)

In [34]:
# # Save rows we removed for later
# removed_rows = pd.concat([unprocessed_test_data,processed_test_data]).drop_duplicates(keep=False)
# removed_rows.head()

In [39]:
# Deal with missing data

processed_test_data = unprocessed_test_data.fillna(unprocessed_test_data.mean())

In [40]:


# Data from the 2nd feature column onwards are input
test_X = processed_test_data.values[:,1:]
test_ids = processed_test_data['ID']
print(test_X.shape)
print(test_ids.shape)


(4000, 188)
(4000,)


In [44]:
pca = PCA(n_components=47)
pca.fit(test_X)
test_X = pca.transform(test_X)

In [45]:
predictions_ontest_first_layer = rfc.predict(test_X)
print(predictions_ontest_first_layer)

[0 0 0 ... 0 0 0]


In [46]:
# Add predictions from the first layer NOT USED NOW WOULD DO THSI WHEN ACTUALLY TESTING IT
# As column in the orginal training data
processed_test_data.insert(1, "layer_1_class", predictions_ontest_first_layer, True) 
processed_test_data.head()

Unnamed: 0,ID,layer_1_class,F0,F1,F2,F3,F4,F5,F6,F7,...,F178,F179,F180,F181,F182,F183,F184,F185,F186,F187
0,C00000,0,-1.0,6.0,5.0,0.0,0.0,3.0,0.068267,0.107653,...,0.092296,27.957417,4.718421,0.0,0.0,53.956835,0.0,55.970149,70.093458,1.152
1,C00001,0,1.0,4.0,5.0,0.0,0.0,5.0,0.098701,0.164816,...,0.082007,27.345269,4.703704,0.0,0.0,54.054054,0.0,56.550424,79.787234,1.16
2,C00002,0,-6.0,10.0,2.0,-1.0,1.0,3.0,0.126967,0.106857,...,0.247711,10.088347,2.053699,2.0,0.0,65.934066,0.0,72.859745,80.645161,0.936
3,C00003,0,-3.0,6.0,3.0,0.0,0.0,3.0,0.077065,0.050373,...,0.447685,13.327618,-0.482786,0.0,0.0,58.823529,0.0,65.323898,72.115385,1.088
4,C00004,0,-2.0,3.0,1.0,0.0,0.0,1.0,0.071738,0.055568,...,0.155647,16.22495,3.086384,0.0,0.0,58.479532,0.0,63.124671,69.444444,1.088


In [47]:
# Split data set by result class label
test_data_l2_NaO = processed_test_data.loc[(processed_test_data['layer_1_class'] == 0)]
test_data_l2_AaN = processed_test_data.loc[(processed_test_data['layer_1_class'] == 1)]

print(test_data_l2_NaO.shape)
print(test_data_l2_AaN.shape)

(3953, 190)
(47, 190)


In [48]:
# Input is after 1st and 2nd columns
test_X_l2_NaO = test_data_l2_NaO.values[:,2:]
#record IDs
test_ids_l2_NaO = test_data_l2_NaO['ID']
print(test_X_l2_NaO.shape)
print(test_ids_l2_NaO.shape)

# Input is after 1st and 2nd columns
test_X_l2_AaN = test_data_l2_AaN.values[:,2:]
#record IDs
test_ids_l2_AaN = test_data_l2_AaN['ID']
print(test_X_l2_AaN.shape)
print(test_ids_l2_AaN.shape)


(3953, 188)
(3953,)
(47, 188)
(47,)


In [51]:
pca = PCA(n_components=47)
pca.fit(test_X_l2_NaO)
test_X_l2_NaO = pca.transform(test_X_l2_NaO)

In [52]:
# Pass first subset through 2nd layer classifiers
predictions_ontest_NaO_2nd_layer = rfc_l2_NaO.predict(test_X_l2_NaO)
print(predictions_ontest_NaO_2nd_layer)

[1 0 0 ... 0 1 1]


In [53]:
pca = PCA(n_components=47)
pca.fit(test_X_l2_AaN)
test_X_l2_AaN = pca.transform(test_X_l2_AaN)

In [54]:
# Pass second subset through 2nd layer classifiers
predictions_ontest_AaN_2nd_layer = rfc_l2_AaN.predict(test_X_l2_AaN)
print(predictions_ontest_AaN_2nd_layer)

[1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1]


In [55]:
#Merge predictions with IDs

NaO_preds = list(zip(test_ids_l2_NaO, predictions_ontest_NaO_2nd_layer))
print(NaO_preds)

AaN_preds = list(zip(test_ids_l2_AaN, predictions_ontest_AaN_2nd_layer))
print(AaN_preds)



[('C00000', 1), ('C00001', 0), ('C00002', 0), ('C00003', 1), ('C00004', 1), ('C00005', 1), ('C00006', 1), ('C00007', 1), ('C00008', 1), ('C00009', 1), ('C00010', 1), ('C00011', 1), ('C00012', 0), ('C00013', 0), ('C00014', 0), ('C00015', 1), ('C00017', 1), ('C00018', 1), ('C00019', 1), ('C00020', 1), ('C00021', 1), ('C00022', 1), ('C00023', 1), ('C00024', 1), ('C00025', 1), ('C00026', 1), ('C00027', 1), ('C00028', 1), ('C00029', 1), ('C00031', 1), ('C00032', 1), ('C00033', 1), ('C00034', 1), ('C00035', 1), ('C00036', 1), ('C00037', 1), ('C00038', 1), ('C00039', 1), ('C00040', 0), ('C00041', 1), ('C00042', 1), ('C00043', 1), ('C00044', 0), ('C00045', 0), ('C00046', 0), ('C00047', 1), ('C00048', 1), ('C00049', 1), ('C00050', 1), ('C00051', 0), ('C00052', 0), ('C00053', 1), ('C00054', 1), ('C00055', 1), ('C00056', 1), ('C00057', 1), ('C00058', 0), ('C00059', 0), ('C00060', 1), ('C00061', 1), ('C00062', 0), ('C00063', 0), ('C00064', 1), ('C00066', 0), ('C00067', 0), ('C00068', 1), ('C00069'

In [56]:
# Get Normal & Other predictions
predictions_NaO = pd.DataFrame(NaO_preds, columns=['ID', 'Encoding'])
predictions_NaO.head()

def decode_NaO(row):
     # 'O'==0, 'N'==1
    if row['Encoding'] == 0:
        return 'O'
    else:
        return 'N'

predictions_NaO['Predicted'] = predictions_NaO.apply(decode_NaO, axis=1)
predictions_NaO.head()
   

Unnamed: 0,ID,Encoding,Predicted
0,C00000,1,N
1,C00001,0,O
2,C00002,0,O
3,C00003,1,N
4,C00004,1,N


In [57]:
# Get AF & Noisy predictions
predictions_AaN = pd.DataFrame(AaN_preds, columns=['ID', 'Encoding'])
predictions_AaN.head()

def decode_AaN(row):
    # 'A'==1, '~'==0
    if row['Encoding'] == 0:
        return '~'
    else:
        return 'A'
    
predictions_AaN['Predicted'] = predictions_AaN.apply(decode_AaN, axis=1)
predictions_AaN.head()

Unnamed: 0,ID,Encoding,Predicted
0,C00016,1,A
1,C00030,1,A
2,C00065,1,A
3,C00169,0,~
4,C00453,1,A


In [58]:
# # Randonly guess the rows that were removed because of missing values
# removed_rows_predictions = pd.DataFrame(removed_rows, columns=['ID'])
# guesses1 = [1]*removed_rows_predictions.shape[0]
# removed_rows_predictions['Encoding'] = guesses1
# guesse2 = ['A']*removed_rows_predictions.shape[0]
# removed_rows_predictions['Predicted'] = guesse2


# removed_rows_predictions.head()

NameError: name 'removed_rows' is not defined

In [59]:
# Merge Dataframes into one
frames = [predictions_NaO, predictions_AaN]
predictions = pd.concat(frames)
print(predictions.shape)
predictions.head()

(4000, 3)


Unnamed: 0,ID,Encoding,Predicted
0,C00000,1,N
1,C00001,0,O
2,C00002,0,O
3,C00003,1,N
4,C00004,1,N


In [60]:
# Ensure Sorted by ID
predictions.sort_values(by=['ID'], inplace=True)
print(predictions)

          ID  Encoding Predicted
0     C00000         1         N
1     C00001         0         O
2     C00002         0         O
3     C00003         1         N
4     C00004         1         N
...      ...       ...       ...
3948  C03995         1         N
3949  C03996         1         N
3950  C03997         0         O
3951  C03998         1         N
3952  C03999         1         N

[4000 rows x 3 columns]


In [None]:
# We have dropped all rows with missing values 
# Therefore have 3999 rows instead of the original 4000
# WHAT TO DO WHEN FOR SUBMISSION??

In [61]:
# Drop endoding column
results = pd.DataFrame(predictions, columns=['ID', 'Predicted'])
# Write to file
results.to_csv("submission_RandomForest_GridSearch_mean_PCA.csv", index=False)