In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics
from sklearn.metrics import accuracy_score 

import pickle
import joblib

# Load the preprocessed phenotypic table from the working directory and
# preview its last five rows.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# looks like a row index written out by an earlier to_csv call — confirm
# whether it should be read with index_col=0 or dropped.
df = pd.read_csv('Phenotypic_V1_0b_preprocessed1.csv')
df.tail(5)

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,anat_cnr,anat_efc,anat_fber,anat_fwhm,anat_qi1,anat_snr
1107,1108,51583,1108,51583,SBL,SBL_0051583,1,3.899774,1.697271,3.465151,3.31883,0.096813,5.43454
1108,1109,51584,1109,51584,SBL,SBL_0051584,1,2.757735,1.073076,7.633618,3.30937,0.104817,4.51625
1109,1110,51585,1110,51585,SBL,SBL_0051585,1,3.413469,1.358238,4.3357,3.32455,0.10949,4.93396
1110,1111,51606,1111,51606,MAX_MUN,MaxMun_a_0051606,1,7.839007,1.754363,12.270055,3.23217,0.083964,16.403174
1111,1112,51607,1112,51607,MAX_MUN,MaxMun_a_0051607,1,12.169297,2.818352,9.272107,3.51019,0.04431,23.565982


In [2]:
# Summary statistics of the numeric columns. Jupyter only echoes a bare
# expression when it is the LAST statement of a cell, so the result has to be
# printed explicitly here or it is silently discarded.
print(df.describe())

# Dimensions of the frame as loaded, before any cleaning.
print('Shape of initial dataset:')
print(df.shape)

# Count missing values per column to see which ones need handling.
print("Number of null values:")
print(df.isnull().sum())

Shape of initial dataset:
(1112, 13)
Number of null values:
Unnamed: 0     0
SUB_ID         0
X              0
subject        0
SITE_ID        0
FILE_ID        0
DX_GROUP       0
anat_cnr      13
anat_efc      13
anat_fber     13
anat_fwhm     13
anat_qi1      13
anat_snr      13
dtype: int64


In [3]:
# The six anat_* columns used as model inputs further down.
anat_cols = ['anat_cnr', 'anat_efc', 'anat_fber', 'anat_fwhm', 'anat_qi1', 'anat_snr']

# Treat empty strings as missing values in the input columns.
# The result is assigned back to df rather than calling
# df[col].replace(..., inplace=True): the chained in-place form works on a
# temporary Series, emits a warning in recent pandas, and leaves df unchanged
# once Copy-on-Write is enabled.
df[anat_cols] = df[anat_cols].replace('', np.nan)

# Drop every row that is missing at least one of the input columns
# (rows are removed, not imputed).
df = df.dropna(subset=anat_cols)

# Verify that no null values remain.
print("Number of null values:")
print(df.isnull().sum())

Number of null values:
Unnamed: 0    0
SUB_ID        0
X             0
subject       0
SITE_ID       0
FILE_ID       0
DX_GROUP      0
anat_cnr      0
anat_efc      0
anat_fber     0
anat_fwhm     0
anat_qi1      0
anat_snr      0
dtype: int64


In [4]:
# Keep only the rows in which every numeric column is >= 0.
# The check covers all numeric columns of df, not just the anat_* inputs.
# NOTE(review): presumably needed because MultinomialNB rejects negative
# feature values — confirm.
numeric_part = df.select_dtypes(include=[np.number])
non_negative_rows = (numeric_part >= 0).all(axis=1)
df = df[non_negative_rows]

In [5]:
# Build the feature matrix from the six anat_* columns and take DX_GROUP as
# the target label.
X = df[[
    'anat_cnr',
    'anat_efc',
    'anat_fber',
    'anat_fwhm',
    'anat_qi1',
    'anat_snr',
]]
y = df['DX_GROUP']

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y,
    test_size=0.33,
    random_state=65,
)

# Fit a multinomial naive Bayes classifier on the training split.
# NOTE(review): MultinomialNB is designed for count-like features, while the
# anat_* columns hold continuous values; GaussianNB may be the better fit —
# confirm before relying on these scores.
model = MultinomialNB()
model.fit(X_train, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(X_test)

In [6]:
# Score the fitted model on the held-out split. Predictions are computed once
# and shared by both metrics instead of calling model.predict twice.
test_predictions = model.predict(X_test)

# Per-class precision / recall / F1 and support.
print(metrics.classification_report(y_test, test_predictions))

# Overall fraction of correctly classified test rows.
print(accuracy_score(y_test, test_predictions))

              precision    recall  f1-score   support

           1       0.46      0.29      0.35       149
           2       0.56      0.72      0.63       184

    accuracy                           0.53       333
   macro avg       0.51      0.51      0.49       333
weighted avg       0.51      0.53      0.51       333

0.5285285285285285
