In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.decomposition import PCA

df = pd.read_csv('Phenotypic_V1_0b_preprocessed1.csv')
df.head()
#cnr: CONTRAST TO NOISE RATIO, EFC:Overview of extension, SNR: Signal to Noise Ratio, 
#qi1: model-free quality index, fwhm:Full Width at Half Maximum, fber:fiber-track

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,anat_cnr,anat_efc,anat_fber,anat_fwhm,anat_qi1,anat_snr
0,1,50002,1,50002,PITT,no_filename,1,10.201539,1.194664,16.223458,3.878,0.152711,12.072452
1,2,50003,2,50003,PITT,Pitt_0050003,1,7.165701,1.126752,10.460008,4.282238,0.161716,9.241155
2,3,50004,3,50004,PITT,Pitt_0050004,1,7.698144,1.226218,9.72575,3.881684,0.174186,9.323463
3,4,50005,4,50005,PITT,Pitt_0050005,1,9.071807,1.256278,11.198226,3.628667,0.119269,10.8142
4,5,50006,5,50006,PITT,Pitt_0050006,1,8.026798,1.407166,6.282055,3.674539,0.130647,10.123574


In [2]:
df.drop(df.columns[[0,1,2,3,4,5,6]],axis=1,inplace=True)
df.head()

Unnamed: 0,anat_cnr,anat_efc,anat_fber,anat_fwhm,anat_qi1,anat_snr
0,10.201539,1.194664,16.223458,3.878,0.152711,12.072452
1,7.165701,1.126752,10.460008,4.282238,0.161716,9.241155
2,7.698144,1.226218,9.72575,3.881684,0.174186,9.323463
3,9.071807,1.256278,11.198226,3.628667,0.119269,10.8142
4,8.026798,1.407166,6.282055,3.674539,0.130647,10.123574


In [3]:
#Checking for null values to fill
print("Number of null values:")
print(df.isnull().sum())

Number of null values:
anat_cnr     13
anat_efc     13
anat_fber    13
anat_fwhm    13
anat_qi1     13
anat_snr     13
dtype: int64


In [4]:
#Checking number of unique values and wrong entries like symbols -,?,#,*,etc.
for col in df.columns:
    print('{} : {}'.format(col,df[col].unique()))

anat_cnr : [10.20153877  7.16570147  7.69814438 ...  3.41346894  7.83900677
 12.16929687]
anat_efc : [1.19466382 1.12675161 1.22621772 ... 1.35823764 1.75436261 2.81835195]
anat_fber : [16.22345825 10.4600076   9.72575046 ...  4.33569983 12.27005481
  9.27210712]
anat_fwhm : [3.8780004  4.28223801 3.88168429 ... 3.32455    3.23217    3.51019   ]
anat_qi1 : [0.15271098 0.16171559 0.17418572 ... 0.10948991 0.08396437 0.04430978]
anat_snr : [12.07245188  9.24115456  9.32346277 ...  4.93395956 16.4031739
 23.56598215]


In [5]:
print("Length before dropping null columns:")
print(len(df))
print("Length after dropping null columns:")
df=df.dropna()
print(len(df))

Length before dropping null columns:
1112
Length after dropping null columns:
1099


In [6]:
X = df.drop('DX_GROUP', axis=1).copy()
X.head()

KeyError: "['DX_GROUP'] not found in axis"

In [None]:
y = df['DX_GROUP'].copy()
y.head()

In [None]:
#Mean, std, and quartiles of dataset
df.describe()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

In [None]:
clf_svm = SVC(random_state=43)
clf_svm.fit(X_train_scaled, y_train)

In [None]:
plot_confusion_matrix(clf_svm,
                      X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=["ASD","TD"], cmap='Blues')

In [None]:
param_grid = [
{'C': [0.5, 1, 10, 100],
 'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
 'kernel': ['rbf']},
]

optimal_params = GridSearchCV(
 SVC(),
 param_grid,
 cv=5,
 scoring='accuracy',
 verbose=0
)

optimal_params.fit(X_train_scaled, y_train)
print(optimal_params.best_params_)

In [None]:
clf_svm = SVC(random_state=43, C=1, gamma=0.1)
clf_svm.fit(X_train_scaled, y_train)

In [None]:
plot_confusion_matrix(clf_svm,
                      X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=["ASD","TD"], cmap="Blues")

In [None]:
from sklearn.metrics import classification_report

y_predict = clf_svm.predict(X_test_scaled)
print(classification_report(y_test, y_predict))