In [48]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [49]:
# Read the BRCA patient table from its raw GitHub URL and print the first rows.
DATA_URL = 'https://raw.githubusercontent.com/amankharwal/Website-data/master/BRCA.csv'
data = pd.read_csv(DATA_URL)
print(data.head())

     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0  FEMALE  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0  FEMALE -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0  FEMALE  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0  FEMALE  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0  FEMALE  0.221550   1.90680   0.52045 -0.311990   

  Tumour_Stage                      Histology ER status PR status HER2 status  \
0          III  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
1           II             Mucinous Carcinoma  Positive  Positive    Negative   
2          III  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
3           II  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   
4           II  Infiltrating Ductal Carcinoma  Positive  Positive    Negative   

                  Surgery_type Date_of_Surgery Date_of_Last_Visit  \
0  Mo

In [50]:
# Report the number of missing values per column, then keep only the rows
# that have a value in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)
data = data.dropna(axis=0, how="any")

Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64


In [51]:
# Count how many patients fall into each Gender category.
gender_counts = data["Gender"].value_counts()
print(gender_counts)

FEMALE    313
MALE        4
Name: Gender, dtype: int64


In [52]:
# Pie chart of the number of patients in each tumour stage.
stage = data["Tumour_Stage"].value_counts()
# `transactions` holds the stage labels and `quantity` the matching counts.
transactions, quantity = stage.index, stage.values

figure = px.pie(
    data,
    values=quantity,
    names=transactions,
    title="Tumour Stages of Patients",
)
figure.show()

In [53]:
# Pie chart of the number of patients per histology type.
hist = data["Histology"].value_counts()
types, quantity = hist.index, hist.values  # labels and their counts
fig = px.pie(
    data,
    values=quantity,
    names=types,
    title='Histology of Patients',
)
fig.show()

In [54]:
# Pie chart of the number of patients per surgery type.
surgeries = data["Surgery_type"].value_counts()
types, quantity = surgeries.index, surgeries.values  # labels and their counts
fig = px.pie(
    data,
    values=quantity,
    names=types,
    title='Types of Surgery of Patients',
)
fig.show()

In [55]:
# Encode the categorical columns as integers, in place on `data`.
#
# The codes are the same ones this cell has always used. Two safeguards are
# added around the plain `Series.map` calls:
#   * the cell is safe to re-run: a column whose values are already all valid
#     codes is left alone (mapping the integers through the string dict a
#     second time would turn the whole column into NaN);
#   * a label that has no code raises immediately instead of silently
#     becoming NaN and failing later inside the model.
CATEGORY_CODES = {
    "Tumour_Stage": {"I": 1, "II": 2, "III": 3},
    "Histology": {"Infiltrating Ductal Carcinoma": 1,
                  "Infiltrating Lobular Carcinoma": 2,
                  "Mucinous Carcinoma": 3},
    "ER status": {"Positive": 1},
    "PR status": {"Positive": 1},
    "HER2 status": {"Positive": 1, "Negative": 2},
    "Gender": {"MALE": 0, "FEMALE": 1},
    "Surgery_type": {"Other": 1, "Modified Radical Mastectomy": 2,
                     "Lumpectomy": 3, "Simple Mastectomy": 4},
}

for column, codes in CATEGORY_CODES.items():
    # Already encoded by an earlier run of this cell: nothing to do.
    if data[column].isin(list(codes.values())).all():
        continue

    # Fail loudly on labels the mapping does not know about.
    unknown = set(data[column].dropna().unique()) - set(codes)
    if unknown:
        raise ValueError(
            "Column %r contains labels with no integer code: %s"
            % (column, sorted(map(str, unknown)))
        )

    data[column] = data[column].map(codes)

print(data.head())

     Patient_ID   Age  Gender  Protein1  Protein2  Protein3  Protein4  \
0  TCGA-D8-A1XD  36.0       1  0.080353   0.42638   0.54715  0.273680   
1  TCGA-EW-A1OX  43.0       1 -0.420320   0.57807   0.61447 -0.031505   
2  TCGA-A8-A079  69.0       1  0.213980   1.31140  -0.32747 -0.234260   
3  TCGA-D8-A1XR  56.0       1  0.345090  -0.21147  -0.19304  0.124270   
4  TCGA-BH-A0BF  56.0       1  0.221550   1.90680   0.52045 -0.311990   

   Tumour_Stage  Histology  ER status  PR status  HER2 status  Surgery_type  \
0             3          1          1          1            2             2   
1             2          3          1          1            2             3   
2             3          1          1          1            2             1   
3             2          1          1          1            2             2   
4             2          1          1          1            2             1   

  Date_of_Surgery Date_of_Last_Visit Patient_Status  
0       15-Jan-17          19-Ju

In [56]:
# Train a default SVC on an 80/20 split and report its test-set scores.
#
# Features and target are selected by column label instead of by position.
# The labels below are the columns that the former slices 1:13 and 15 picked
# out, in the same order, so the model sees the same inputs but no longer
# depends on the column order of the CSV.
FEATURE_COLUMNS = [
    "Age", "Gender",
    "Protein1", "Protein2", "Protein3", "Protein4",
    "Tumour_Stage", "Histology",
    "ER status", "PR status", "HER2 status",
    "Surgery_type",
]
TARGET_COLUMN = "Patient_Status"

array = data.to_numpy()  # kept in case later cells still index `array` by position
# Casting to float makes a leftover string or unencoded column fail here,
# rather than inside the estimator.
X = data[FEATURE_COLUMNS].to_numpy(dtype=float)
Y = data[TARGET_COLUMN].to_numpy()

Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, random_state=42, shuffle=True
)

model = SVC()
model.fit(Xtrain, Ytrain)
predictions = model.predict(Xtest)
print(accuracy_score(Ytest, predictions))
# NOTE(review): the class balance of Patient_Status is not shown in this
# notebook. Macro F1 is printed next to accuracy so that a model which only
# predicts the majority class would be visible. zero_division=0 scores a
# never-predicted class as 0 without emitting a warning.
print("macro F1:", f1_score(Ytest, predictions, average="macro", zero_division=0))

0.828125


In [57]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of `model` on the training split; the mean
# over the folds is printed.
result = cross_val_score(
    estimator=model,
    X=Xtrain,
    y=Ytrain,
    scoring='accuracy',
    cv=5,
)
mean_cv_accuracy = result.mean()
print(mean_cv_accuracy)

0.7984313725490196


In [58]:
from sklearn.model_selection import GridSearchCV

# Search over C (powers of ten from 1e-4 to 1e4) and class weighting,
# using 5-fold cross-validation on the training split.
param_grid = {
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4],
    'class_weight': [None, 'balanced'],
}
grid = GridSearchCV(model, param_grid, cv=5, verbose=1)
grid.fit(Xtrain, Ytrain)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                               1000.0, 10000.0],
                         'class_weight': [None, 'balanced']},
             verbose=1)

In [59]:
# Show the best cross-validated score and the parameters that produced it.
best_score = grid.best_score_
best_params = grid.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.798431 using {'C': 0.0001, 'class_weight': None}


In [60]:
# Evaluate the tuned search object on the held-out test split.
predictions = grid.predict(Xtest)
# accuracy_score takes (y_true, y_pred). The arguments were previously passed
# in the opposite order; accuracy is symmetric so the printed value is the
# same, but this order matches the baseline evaluation cell and stays correct
# if the metric is ever swapped for an asymmetric one.
print(accuracy_score(Ytest, predictions))

0.828125
