In [153]:
import numpy as np
import pandas as pd
import pickle
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

Reading the dataset

In [154]:
# Load the hair-loss records from the CSV file in the working directory.
hair_data = pd.read_csv(filepath_or_buffer="Hairloss.csv")

In [155]:
# Preview the first five rows of the raw frame.
hair_data.head(n=5)

Unnamed: 0,Chemotherapy_Regimen,Drug_Dosage(mg),Age,Gender,Hypertension,Family_History,Hair_Loss_Severity
0,ABVD,300,35,1,1,0,Partial
1,ABVD,500,60,0,1,1,Partial
2,ABVD,500,89,0,0,1,Complete
3,ABVD,400,45,1,1,1,Complete
4,CMF,100,22,0,0,0,No


In [156]:
# (rows, columns) of the loaded frame.
hair_data.shape

(935, 7)

In [157]:
# Column dtypes and non-null counts, to spot missing values and string columns.
hair_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Chemotherapy_Regimen  935 non-null    object
 1   Drug_Dosage(mg)       935 non-null    int64 
 2   Age                   935 non-null    int64 
 3   Gender                935 non-null    int64 
 4   Hypertension          935 non-null    int64 
 5   Family_History        935 non-null    int64 
 6   Hair_Loss_Severity    935 non-null    object
dtypes: int64(5), object(2)
memory usage: 51.3+ KB


In [158]:
# Gender is excluded from the features used for modelling.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
hair_data = hair_data.drop(columns="Gender", errors="ignore")

In [159]:
# Frequency of each level in the two treatment-related columns.
for column in ('Chemotherapy_Regimen', 'Drug_Dosage(mg)'):
    print(hair_data[column].value_counts(), end='\n\n')

# Strip surrounding whitespace from the target labels so that otherwise
# identical labels are counted (and later encoded) as one class.
severity = hair_data['Hair_Loss_Severity'].str.strip()
hair_data['Hair_Loss_Severity'] = severity

print(severity.value_counts(), end='\n\n')

ABVD    361
CMF     294
BEP     280
Name: Chemotherapy_Regimen, dtype: int64

200    247
100    231
400    138
500    133
300     90
150     90
350      3
250      2
450      1
Name: Drug_Dosage(mg), dtype: int64

No          509
Partial     278
Complete    148
Name: Hair_Loss_Severity, dtype: int64



In [160]:
# String-valued columns that must be label-encoded before modelling.
object_cols = ['Chemotherapy_Regimen', 'Hair_Loss_Severity']
# Populated by the encoding step: column name -> fitted LabelEncoder.
label_codes = dict()

In [161]:
# Resulting integer codes (LabelEncoder assigns them in sorted label order):
#   Chemotherapy_Regimen -> 0: ABVD, 1: BEP, 2: CMF
#   Hair_Loss_Severity   -> 0: Complete, 1: No, 2: Partial

for column_name in object_cols:
    # One encoder per column, kept so the mapping can be reused or inverted later.
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(hair_data[column_name])
    hair_data[column_name] = encoded_values
    label_codes[column_name] = encoder

# Persist the fitted encoders next to the notebook.
with open("Label_codes.pkl", mode='wb') as pickle_file:
    pickle.dump(label_codes, pickle_file)

In [162]:
# Preview the frame after label encoding (string columns are now integer codes).
hair_data.head(n=5)

Unnamed: 0,Chemotherapy_Regimen,Drug_Dosage(mg),Age,Hypertension,Family_History,Hair_Loss_Severity
0,0,300,35,1,0,2
1,0,500,60,1,1,2
2,0,500,89,0,1,0
3,0,400,45,1,1,0
4,2,100,22,0,0,1


Create feature set X and target set Y

In [163]:
# Target is the encoded severity label; every remaining column is a feature.
target_column = "Hair_Loss_Severity"
Y = hair_data[target_column]
X = hair_data.drop(columns=[target_column])


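Feature scaling (currently disabled: a StandardScaler is created, but the scaling calls are commented out, so the feature values are used unscaled)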

In [164]:
# Feature scaling is currently switched off: a StandardScaler is instantiated,
# but every fit_transform call below is commented out, so the features are
# passed on unscaled.
ss = StandardScaler()
# NOTE(review): if re-enabled as written, these calls would fit the scaler on the
# full dataset, i.e. before the train/test split performed later in the notebook.
#X['Chemotherapy_Regimen'] = ss.fit_transform(X[['Chemotherapy_Regimen']])
#X['Drug_Dosage(mg)'] = ss.fit_transform(X[['Drug_Dosage(mg)']])
#X['Age'] = ss.fit_transform(X[['Age']])
X_scaled = X  # same object as X, not a scaled copy
X_scaled

Unnamed: 0,Chemotherapy_Regimen,Drug_Dosage(mg),Age,Hypertension,Family_History
0,0,300,35,1,0
1,0,500,60,1,1
2,0,500,89,0,1
3,0,400,45,1,1
4,2,100,22,0,0
...,...,...,...,...,...
930,0,400,80,0,1
931,1,200,16,0,1
932,2,500,18,1,1
933,0,300,20,1,1


Split into training and testing data

In [165]:
# Hold out 10% of the rows for testing; stratify on the target so each split
# keeps the same class proportions, and fix the seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.1,
    random_state=42,
    stratify=Y,
)

Training the classifier using SVC

In [166]:
# Support-vector classifier with a linear kernel, fitted on the training split.
classifier = svm.SVC(kernel="linear")
classifier.fit(X=X_train, y=Y_train)

Calculating the accuracy of the SVC model on the training data and on the test data

In [167]:
# Placeholder; overwritten once both models have been scored on the test split.
max_data_accuracy = 0.0

# Accuracy of the SVC on the data it was trained on.
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# NOTE: svm_accuracy holds the *training* accuracy here; the next cell reassigns
# it to the test accuracy.
X_train_prediction = classifier.predict(X_train)
svm_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print(f"Accuracy of model on training data: {svm_accuracy*100:.2f} %")

Accuracy of model on training data: 86.09 %


In [168]:
# Accuracy of the SVC on the held-out test split (a fraction in [0, 1]).
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
X_test_prediction = classifier.predict(X_test)
svm_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print(f"Accuracy of model on test data: {svm_accuracy*100:.2f} %")

Accuracy of model on test data: 88.30 %


Check accuracy of Random Forest model

In [169]:
# Random forest with 100 trees; the seed is fixed so the fit is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, Y_train)

# Accuracy on the data the forest was trained on.
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# NOTE: rf_accuracy holds the *training* accuracy here; the next cell reassigns
# it to the test accuracy.
X_train_prediction = rf_model.predict(X_train)
rf_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print(f"Accuracy of model on training data: {rf_accuracy*100:.2f} %")

Accuracy of model on training data: 98.69 %


In [170]:
# Accuracy of the random forest on the held-out test split (a fraction in [0, 1]).
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
X_test_prediction = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print(f"Accuracy of model on test data: {rf_accuracy*100:.2f} %")
# Best test accuracy of the two models, expressed as a percentage (0-100),
# unlike svm_accuracy / rf_accuracy which are fractions.
max_data_accuracy = max(svm_accuracy*100, rf_accuracy*100)

Accuracy of model on test data: 90.43 %


In [171]:
# max_data_accuracy is already a percentage, so it is formatted as-is.
summary_line = f"The maximum accuracy of the 2 models is: {max_data_accuracy:.2f} %"
print(summary_line)

The maximum accuracy of the 2 models is: 90.43 %


Saving the trained model

In [172]:
# Persist whichever model scored higher on the held-out test split.
# The two test accuracies are compared directly: max_data_accuracy is a
# percentage (0-100) while svm_accuracy is a fraction (0-1), so testing those
# two for equality never selected the SVM and the random forest was always saved.
# On a tie the SVM is kept, matching the intent of the original condition.
best_model = classifier if svm_accuracy >= rf_accuracy else rf_model
joblib.dump(best_model, 'saved_model.pkl')

['saved_model.pkl']

Downloading the saved model and the fitted label encoders

In [173]:
# Colab-only helper: triggers a browser download of each saved artifact.
from google.colab import files

for artifact in ('saved_model.pkl', 'Label_codes.pkl'):
    files.download(artifact)