In [None]:
# Load the Parkinson's CSV file into a pandas DataFrame and show it.

import pandas as pd

df = pd.read_csv(filepath_or_buffer='./parkinsons.csv')
# Optional row shuffle (currently disabled):
# df = df.sample(frac=1).reset_index(drop=True)
display(df)

In [None]:
# Number of missing values in each column of df.
df.isna().sum()

In [None]:
# checking numerical features distribution

import matplotlib.pyplot as plt
import seaborn as sns

def plotCols(dataframe, plotsPerRow=7):
    """Draw one histogram per column of ``dataframe`` on a single grid figure.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are plotted, one subplot per column, in column order.
    plotsPerRow : int, default 7
        Number of subplots placed side by side in each row of the grid.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    totalPlots = len(dataframe.columns)
    # Ceiling division gives exactly the number of rows needed, so no blank
    # row is appended when the column count is a multiple of plotsPerRow.
    # At least one row is kept so the figure height is never zero.
    plotRows = max(1, -(-totalPlots // plotsPerRow))
    fig = plt.figure(figsize = (20, plotRows*2.5))
    for plotnumber, column in enumerate(dataframe.columns, start=1):
        ax = fig.add_subplot(plotRows, plotsPerRow, plotnumber)
        # Draw on the explicit axes instead of relying on pyplot's current axes.
        sns.histplot(dataframe[column], ax=ax)
        ax.set_xlabel(column)

    fig.tight_layout()
    plt.show()

plotCols(df)

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df.

fig, ax = plt.subplots(figsize = (15, 8))
sns.heatmap(
    data = df.corr(),
    annot = True,
    linewidths = 1,
    linecolor = 'lightgrey',
    ax = ax,
)
plt.show()

In [None]:
# Column labels of the loaded DataFrame.
df.columns

In [None]:
# Draw the per-column histograms of df (uses the plotCols helper defined in this notebook).
plotCols(df)

In [None]:
# Preview the first five rows of df.
df.head(n=5)

In [None]:
# Separate the target column ('status') from the predictor columns.
dep_col = 'status'
ind_col = df.columns[df.columns != dep_col].tolist()

y = df[dep_col]
x = df[ind_col]

In [None]:
from imblearn.over_sampling import SMOTE

oversample = SMOTE(sampling_strategy="minority", random_state=42, k_neighbors=10, n_jobs=-1)
x, y = oversample.fit_resample(x, y)

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
# Hyperparameter tuning of the decision tree with an exhaustive grid search.

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: the grid explores splitter='random' and max_features subsets,
# both of which are stochastic, so an unseeded search is not reproducible.
# The estimator is not fitted here; GridSearchCV clones it for every candidate.
dt = DecisionTreeClassifier(random_state=42)

# 3 * 2 * 7 * 4 * 5 * 2 = 1680 candidate settings, each scored with 5-fold CV.
grid_param = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random'],
    'max_depth' : [3, 5, 7, 10, 12, 15, 20],
    'min_samples_split' : [2, 3, 5, 7],
    'min_samples_leaf' : [1, 2, 3, 5, 7],
    'max_features' : ['sqrt', 'log2']
}

# verbose=1 prints a summary only, instead of one log line per fit.
grid_search_dt = GridSearchCV(dt, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_dt.fit(x_train, y_train)

# best parameters and best score
print(grid_search_dt.best_params_)
print(grid_search_dt.best_score_)
print(grid_search_dt.best_estimator_)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# dt = DecisionTreeClassifier(max_depth=5, max_features='log2', min_samples_split=7)
# GridSearchCV refits the winning configuration on all of x_train by default
# (refit=True), so best_estimator_ is already trained. Fitting it again could
# build a different tree than the one that was selected when no seed is fixed.
dt = grid_search_dt.best_estimator_

# Predict once per partition and reuse the results for every metric below.
dt_train_pred = dt.predict(x_train)
dt_test_pred = dt.predict(x_test)

# accuracy score, confusion matrix and classification report of decision tree

dt_acc = accuracy_score(y_test, dt_test_pred)

print(f"Training Accuracy of Decision Tree is {accuracy_score(y_train, dt_train_pred)}")
print(f"Test Accuracy of Decision Tree is {dt_acc} \n")

print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dt_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, dt_test_pred)}")

In [None]:
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# dt_acc = accuracy_score(y_test, dt.predict(x_test))

# print(f"Training Accuracy of Decision Tree is {accuracy_score(y_train, dt.predict(x_train))}")
# print(f"Testing Accuracy of Decision Tree is {dt_acc} \n")

# print(f"Confusion Matrix :- \n{confusion_matrix(y_test, dt.predict(x_test))}\n")
# print(f"Classification Report :- \n {classification_report(y_test, dt.predict(x_test))}")

In [None]:
# Hyperparameter tuning of the random forest with an exhaustive grid search.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# Fixed seed so bootstrap sampling and feature sub-sampling are reproducible.
# The estimator is not fitted here; GridSearchCV clones it for every candidate.
rf = RandomForestClassifier(random_state=42)

# 5 * 3 * 4 * 2 * 2 = 240 candidate settings, each scored with 5-fold CV.
grid_param = {
    'n_estimators' : [100, 120, 150, 200, 250],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'max_depth' : [7, 10, 13, 15],
    'class_weight': ['balanced',
                     'balanced_subsample'
                     ],
    # 'min_samples_leaf' : [1, 3, 5, 7],
    # 'min_samples_split' : [2, 3, 5, 7],
    'max_features' : ['sqrt', 'log2']
}

# verbose=1 prints a summary only, instead of one log line per fit.
grid_search_rf = GridSearchCV(rf, grid_param, cv = 5, n_jobs = -1, verbose = 1)
grid_search_rf.fit(x_train, y_train)
# best parameters and best score
print(grid_search_rf.best_params_)
print(grid_search_rf.best_score_)
print(grid_search_rf.best_estimator_)

In [None]:
# from sklearn.metrics import accuracy_score
# from sklearn.tree import DecisionTreeClassifier

# dt = DecisionTreeClassifier(max_depth=5, max_features='sqrt', min_samples_split=7)
# dt.fit(x_train, y_train)
# dt_acc = accuracy_score(y_test, dt.predict(x_test))
# print(dt_acc)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hand-picked hyperparameters; the seed makes the forest (and therefore the
# reported metrics and any model saved later) identical on every run.
# NOTE(review): these values are hard-coded rather than taken from
# grid_search_rf.best_estimator_ — confirm that is intentional.
rf = RandomForestClassifier(class_weight='balanced_subsample', max_depth=10, random_state=42)
rf.fit(x_train, y_train)

# Predict once per partition and reuse the results for every metric below.
rf_train_pred = rf.predict(x_train)
rf_test_pred = rf.predict(x_test)

print(f"Training Accuracy of Random Forest Classifier is {accuracy_score(y_train, rf_train_pred)}")
print(f"Test Accuracy of Random Forest Classifier is {accuracy_score(y_test, rf_test_pred)} \n")
print(f"Confusion Matrix :- \n{confusion_matrix(y_test, rf_test_pred)}\n")
print(f"Classification Report :- \n {classification_report(y_test, rf_test_pred)}")

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Shuffled 5-fold cross-validation of the decision tree on x / y.
# cross_val_score fits a fresh clone of dt on each fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_results = cross_val_score(estimator=dt, X=x, y=y, cv=kf)

print(f'Cross-Validation Results (Accuracy) DT: {cross_val_results}')
print(f'Mean Accuracy: {cross_val_results.mean()}')

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Shuffled 5-fold cross-validation of the random forest on x / y.
# cross_val_score fits a fresh clone of rf on each fold.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_results = cross_val_score(estimator=rf, X=x, y=y, cv=kf)

print(f'Cross-Validation Results (Accuracy): {cross_val_results}')
print(f'Mean Accuracy: {cross_val_results.mean()}')

In [None]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error

def error_report(model, x_eval=None, y_eval=None):
    """Print the MSE, RMSE and MAE of ``model``'s predictions.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    x_eval : optional
        Features to predict on. Defaults to the notebook-level ``x_test``.
    y_eval : optional
        True targets to compare against. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        The three metrics are printed, one per line.
    """
    # NOTE(review): the models passed in this notebook are classifiers; if
    # `status` is coded 0/1, MSE and MAE both equal the misclassification
    # rate — confirm these regression metrics are what is wanted.
    if x_eval is None:
        x_eval = x_test
    if y_eval is None:
        y_eval = y_test

    # Predict once and reuse the result for all three metrics.
    predictions = model.predict(x_eval)

    mse = mean_squared_error(y_eval, predictions)
    print(f"Mean Squared Error: {mse}")
    rmse = root_mean_squared_error(y_eval, predictions)
    print(f"Root Mean Squared Error: {rmse}")
    mae = mean_absolute_error(y_eval, predictions)
    print(f"Mean Absolute Error: {mae}")

error_report(dt)
print()
error_report(rf)

In [None]:
# Serialize the decision-tree model (dt) to disk with pickle.
import pickle

# Destination file for the pickled decision-tree model.
model_pkl_file = "./parkinsons-dt.pkl"

with open(model_pkl_file, mode='wb') as file:
    pickle.dump(dt, file)

In [None]:
# Serialize the random-forest model (rf) to disk with pickle.
import pickle

# Destination file for the pickled random-forest model.
model_pkl_file = "./parkinsons-rf.pkl"

with open(model_pkl_file, mode='wb') as file:
    pickle.dump(rf, file)