In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
# from sklearn.externals import joblib

Importing the CSV file

In [3]:
# Read the cleaned disease/symptom table and drop the "Unnamed: 0" column
# (presumably a row index saved with the CSV — TODO confirm).
dataset = pd.read_csv("Disease_Cleaned.csv").drop(columns=["Unnamed: 0"])
dataset.head()

Unnamed: 0,Disease,redness_of_eyes,visual_disturbances,nausea,constipation,unsteadiness,altered_sensorium,continuous_feel_of_urine,spotting_ urination,mucoid_sputum,...,swollen_blood_vessels,excessive_hunger,throat_irritation,itching,skin_rash,belly_pain,swelled_lymph_nodes,acidity,red_sore_around_nose,dark_urine
0,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
4,Fungal infection,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


Label-encoding the target variable

In [4]:
# Fit a LabelEncoder on the disease names, then overwrite the column with the
# resulting integer codes. `le` stays in scope so codes can be decoded later.
le = LabelEncoder().fit(dataset["Disease"])
dataset["Disease"] = le.transform(dataset["Disease"])
dataset["Disease"].iloc[:20]


0     15
1     15
2     15
3     15
4     15
5     15
6     15
7     15
8     15
9     15
10     4
11     4
12     4
13     4
14     4
15     4
16     4
17     4
18     4
19     4
Name: Disease, dtype: int32

Extracting X and y, and splitting them into training and testing sets

In [5]:
# Separate the feature columns (X) from the encoded target column (y).
X = dataset.drop(columns=["Disease"])
y = dataset["Disease"]
y.head()

0    15
1    15
2    15
3    15
4    15
Name: Disease, dtype: int32

In [31]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

Feature scaling using StandardScaler

In [32]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Hyperparameter optimization using RandomizedSearchCV

In [25]:
# Search space sampled by RandomizedSearchCV for the random forest.

# Number of trees in the forest: 10 evenly spaced values from 200 to 2000.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3 (it now raises an
# invalid-parameter error). For classifiers it was an alias of 'sqrt', so it is
# replaced by 'log2' to keep two distinct, valid options.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited depth).
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or on all rows (False)
bootstrap = [True, False]
# Assemble the parameter distributions, keyed by RandomForestClassifier argument name
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [26]:
# Tune on the training split only: fitting the search on the full X, y would let
# the held-out test rows influence the chosen hyperparameters (data leakage).
# The base estimator is seeded so repeated runs of the search are reproducible.
rf = RandomForestClassifier(random_state = 42)
# 100 sampled parameter combinations, each scored with 3-fold cross-validation,
# using all available cores (n_jobs = -1).
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

In [37]:
# Show the best hyperparameter combination found by the randomized search.
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': True}

Training Classifier

In [40]:
# Custom evaluation helper
def evaluate(model, test_features, test_labels):
    """Print and return the classification accuracy of ``model`` as a percentage.

    The labels are categorical class codes, so a prediction is either right or
    wrong. Arithmetic on the codes (absolute or percentage error) has no meaning
    and divides by zero for class 0, so accuracy is computed as the share of
    exact matches.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(test_features)``.
    test_features : array-like
        Feature rows passed to ``model.predict``.
    test_labels : array-like
        True class labels, one per feature row.

    Returns
    -------
    float
        Percentage of predictions equal to the true label (0-100).
    """
    # Compare as plain arrays so a pandas index on the labels cannot affect alignment.
    predictions = np.asarray(model.predict(test_features))
    labels = np.asarray(test_labels)
    error_rate = float(np.mean(predictions != labels))
    accuracy = 100.0 * (1.0 - error_rate)
    print('Model Performance')
    print('Misclassification rate: {:0.4f}.'.format(error_rate))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

# Random forest using the split/leaf/feature/depth/bootstrap values reported by
# the randomized search, with the entropy split criterion.
# NOTE(review): n_estimators is 100 here while the search reported 400 — confirm
# this is intentional.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_depth=30,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
)
classifier.fit(X_train, y_train)

# evaluate() prints its own report; `accuracy` is then replaced by the value
# returned from classifier.score(), which is what gets printed last.
accuracy = evaluate(classifier, X_test, y_test)
accuracy = classifier.score(X_test, y_test)
print(accuracy)

Model Performance
Average Error: 0.0000 degrees.
Accuracy = 100.00%.
1.0


Testing and predictions

In [35]:
# Predict encoded class labels for the held-out test features.
y_pred = classifier.predict(X_test)

Decoding

In [36]:
# Map the integer predictions back to disease names using the fitted LabelEncoder.
predictions_test = le.inverse_transform(y_pred)
predictions_test

array(['Chronic cholestasis', 'Urinary tract infection', 'Tuberculosis',
       'Hyperthyroidism', 'Hyperthyroidism', 'Pneumonia', 'hepatitis A',
       'Malaria', 'AIDS', 'Peptic ulcer diseae', 'Hypothyroidism',
       'Cervical spondylosis', 'Osteoarthristis', 'Dengue', 'Dengue',
       'Pneumonia', 'Varicose veins', 'Dimorphic hemmorhoids(piles)',
       'Urinary tract infection', 'Common Cold',
       '(vertigo) Paroymsal  Positional Vertigo', 'Allergy', 'Impetigo',
       'Arthritis', 'Alcoholic hepatitis', 'Hypertension ', 'Typhoid',
       'Bronchial Asthma', 'Malaria', 'Common Cold', 'Varicose veins',
       'Hepatitis C', 'Bronchial Asthma', 'Gastroenteritis',
       'Gastroenteritis', 'Hypothyroidism', 'Heart attack',
       'Hypertension ', 'Hepatitis B', 'Common Cold',
       'Dimorphic hemmorhoids(piles)', 'Jaundice', 'Psoriasis',
       'Chicken pox', 'Alcoholic hepatitis', 'Hyperthyroidism',
       'Chronic cholestasis', 'Psoriasis', 'Hepatitis D', 'Dengue',
       'Drug

Confusion Matrix

In [46]:
# Cross-tabulate actual vs. predicted encoded labels (rows = actual, columns = predicted).
# Left as the cell's last expression so the notebook renders it as a table on Run All.
pd.crosstab(y_test, y_pred, rownames=['Actual Disease'], colnames=['Predicted Disease'])

Predicted Diseae  0   1   2   3   4   6   8   9   11  17  20  21  27  28  29  \
Actual Diseae                                                                  
0                 21   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
1                  0  21   3   0   0   0   0   0   0   0   0   0   0   0   0   
2                  0   0  23   0   0   0   0   0   0   0   0   0   0   0   0   
3                  0   0   0  25   0   0   0   0   0   2   0   0   0   0   0   
4                  0   0   7   0  23   0   0   0   0   0   0   0   0   0   0   
5                  0   0  23   0   0   0   0   0   0   0   0   0   0   0   0   
6                  0   2   1   0   0  27   0   0   0   0   0   0   0   0   0   
7                  0   0  21   0   0   0   0   0   0   0   0   0   0   0   0   
8                  0   0   0   0   0   0  27   0   0   0   0   0   0   0   0   
9                  0   0   0   0   0   0   0  21   0   0   0   0   0   0   0   
10                 0   5   0   0   0   0