In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import seaborn as sns
from tkinter import *
from tkinter import messagebox
import sys 
import urllib
import urllib.request

In [None]:
# We have two datasets:
#   * dataset.csv          - a set of symptoms and the particular disease recorded against them
#   * Symptom-severity.csv - a set of symptoms and a weightage factor for each, i.e. how severe a symptom is
# Each one is loaded into its own DataFrame.
# Note that from our Medictor Django application this csv file will be auto-updated
# from user suggestions at scheduled intervals.

df = pd.read_csv('../input/disease-symptom-description-dataset/dataset.csv')
df1 = pd.read_csv('../input/disease-symptom-description-dataset/Symptom-severity.csv')

# Source code for the Django project (kept as a comment: a bare URL in a code
# cell is a SyntaxError and stops the cell from running):
# https://github.com/GetAyanatGit/Medictor

In [None]:
# Let's get a count of the cells holding null data, per column.
# `isnull` is an alias of `isna`, and a cell only displays its last expression,
# so a single call gives the same output as calling both.

df.isna().sum()


In [None]:
# Let's have a look at the overall size of the table: (number of rows, number of columns)
df.shape

In [None]:
# We can see that from symptom 6 onwards there is a significant number of null values, so we will ignore them (they are simply replaced with 0 in the cleaning step below)

In [None]:
# Cleaning the data: trim surrounding whitespace from every text cell, then zero-fill the gaps

cols = df.columns

# Lay all cells out as one long 1-D Series so the `.str` accessor can trim them in a single pass
flat_cells = pd.Series(df[cols].values.ravel()).str.strip()

# Fold the trimmed cells back into the original (rows, columns) layout; missing entries become 0
df = pd.DataFrame(flat_cells.values.reshape(df.shape), columns=cols).fillna(0)
df.tail()

In [None]:
# Earlier we mentioned that the second dataset holds a weightage against each symptom.
# Here every symptom name in the table is encoded as that symptom's weight.

# Work on an explicit copy: `df.values` can be a view of the frame's own buffer, so
# writing into it would silently change `df` as well (and the view is read-only when
# pandas Copy-on-Write is active, which makes the assignment below raise).
vals = df.to_numpy(copy=True)
symptoms = df1['Symptom'].unique()

# Build the symptom -> weight lookup once (first listed weight wins for a repeated
# symptom) instead of re-filtering df1 on every loop iteration.
first_weight = df1.drop_duplicates(subset='Symptom').set_index('Symptom')['weight']

for symptom in symptoms:
    vals[vals == symptom] = first_weight.loc[symptom]

d = pd.DataFrame(vals, columns=cols)

# Weightage of these three isn't available in our dataset-2, hence as of now they are encoded as 0
df = d.replace(['dischromic _patches', 'spotting_ urination', 'foul_smell_of urine'], 0)

df.head()

In [None]:
# Now let's have a look at the different symptoms; we will need this list for the option inputs in the front-end
symptoms

In [None]:
# Per column: True only when every single cell in that column equals 0
df[cols].eq(0).all()

In [None]:
# Number of rows recorded for each distinct value in the 'Disease' column
df['Disease'].value_counts()

In [None]:
# Distinct disease names; these may be referenced later from the front end

df['Disease'].unique()

In [None]:
# These are our X in prediction (X, Y): every column except the first one
# NOTE(review): assumes the first column is 'Disease' (the label used below) — confirm
data = df.iloc[:,1:].values
data

In [None]:
# These are our Y in prediction (X, Y): the 'Disease' value of each row
labels = df['Disease'].values
labels

In [None]:
# Train/test split: 85% of the rows go to training, the remainder is held out for evaluation.
# A fixed random_state pins the shuffle, so the split (and every metric computed from it
# further down) is reproducible across kernel restarts instead of changing on each run.
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, shuffle=True, train_size=0.85, random_state=42
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

In [None]:
# We have chosen a Support Vector Classifier (SVC) model for this project
model = SVC() # create an instance of the model class; no arguments are passed, so library defaults apply

# Hyper-parameter tuning:
# left blank for now

# Train the model on the training split
model.fit(x_train, y_train)

In [None]:
# Held-out feature rows produced by the split; these are what the model predicts on next
x_test

In [None]:
# Predict a label for every row of the held-out test features

preds = model.predict(x_test)
preds

In [None]:
# Model metrics (macro F1-score, accuracy) and a confusion-matrix heatmap.
#
# confusion_matrix orders its rows/columns by the `labels` argument (or, when omitted,
# by the sorted labels that occur in y_test/preds) — not by order of first appearance
# in the data. Using one explicit, sorted class list for both the matrix and the
# DataFrame keeps every heatmap tick on the correct row/column, and keeps the matrix
# the expected size even when a disease happens to be absent from the test split.
class_names = sorted(df['Disease'].unique())
conf_mat = confusion_matrix(y_test, preds, labels=class_names)
df_cm = pd.DataFrame(conf_mat, index=class_names, columns=class_names)
print('F1-score% =', f1_score(y_test, preds, average='macro')*100, '|', 'Accuracy% =', accuracy_score(y_test, preds)*100)

# Rows of the matrix are the actual diseases, columns are the predicted ones
ax = sns.heatmap(df_cm)
ax.set(xlabel='Predicted disease', ylabel='Actual disease');

In [None]:
# Dump the trained model to a ".sav" file for use in ML-based front-end applications (optional)

# import pickle
# pickle.dump(model, open("svc_ml_model.sav", "wb"))