In [23]:
import numpy as np
import pandas as pd
import vpython as python
from time import time
from IPython.display import display 

data = pd.read_csv('lungs.csv')
display(data)
records = len(data.index)

# Number of patients with Cancer
lungs_yes = len(data[data['Control/Cancer'] == 'YES'])

# Number of patients who donot have cancer
lungs_no = len(data[data['Control/Cancer'] == 'NO'])

# Percentage of cancer patients
yes_percent = float(lungs_yes) / records *100


Unnamed: 0,L1_Score,L2_Score,L3_Score,L4_Score,L5_Score,L6_Score,L7_Score,L8_Score,L9_Score,L10_Score,...,gender,ethnicity,cancer,lung_cancer,contry_of_res,used_app_before,result,age_desc,relation,Control/Cancer
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,'United States',no,6,'18 and more',Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5,'18 and more',Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8,'18 and more',Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,'United States',no,6,'18 and more',Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,?,no,no,Egypt,no,2,'18 and more',?,NO
5,1,1,1,1,1,0,1,1,1,1,...,m,Others,yes,no,'United States',no,9,'18 and more',Self,YES
6,1,1,1,1,0,0,0,0,1,0,...,m,White-European,no,no,'New Zealand',no,5,'18 and more',Parent,NO
7,1,1,0,0,1,0,0,1,1,1,...,m,White-European,no,no,'United States',no,6,'18 and more',Self,NO
8,1,1,1,1,0,1,1,1,1,0,...,m,Asian,yes,yes,Bahamas,no,8,'18 and more','Health care professional',YES
9,1,1,1,1,1,1,1,1,1,1,...,m,White-European,no,no,'United States',no,10,'18 and more',Relative,YES


In [25]:
# Results


print "Total number of records: {}".format(records)
print "Individuals diagnosed with Cancer: {}".format(lungs_yes)
print "Individuals not diagnosed with Cancer: {}".format(lungs_no)
print "Percentage of individuals diagnosed with Cancer: {:.2f}%".format(yes_percent)
cancer_data = pd.read_csv('lungs.csv', na_values=['?'])
cancer_data.head(n=5)
cancer_data.loc[(cancer_data['age_desc'].isnull())
            |(cancer_data['relation'].isnull())| (cancer_data['result'].isnull())|(cancer_data['used_app_before'].isnull()) | (cancer_data['contry_of_res'].isnull()) | (cancer_data['lung_cancer'].isnull()) | (cancer_data['cancer'].isnull()) | (cancer_data['ethnicity'].isnull())| (cancer_data['gender'].isnull()) | (cancer_data['age'].isnull())]

cancer_data.dropna(inplace=True)
cancer_data.describe()
print(cancer_data.dtypes)

Total number of records: 704
Individuals diagnosed with Cancer: 188
Individuals not diagnosed with Cancer: 515
Percentage of individuals diagnosed with Cancer: 26.70%
L1_Score             int64
L2_Score             int64
L3_Score             int64
L4_Score             int64
L5_Score             int64
L6_Score             int64
L7_Score             int64
L8_Score             int64
L9_Score             int64
L10_Score            int64
age                float64
gender              object
ethnicity           object
cancer              object
lung_cancer         object
contry_of_res       object
used_app_before     object
result               int64
age_desc            object
relation            object
Control/Cancer      object
dtype: object


In [26]:

# Print the results
print "Total number of records: {}".format(records)
print "Individuals diagnosed with Cancer: {}".format(lungs_yes)
print "Individuals not diagnosed with Cancer: {}".format(lungs_no)
cancer_raw = cancer_data['Control/Cancer']
features_raw = cancer_data[['age', 'gender', 'cancer', 'lung_cancer', 'result','L1_Score','L2_Score','L3_Score','L4_Score','L5_Score','L6_Score','L7_Score','L8_Score',  'L9_Score','L10_Score']]


from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical = ['age','result']


features_minmax_transform = pd.DataFrame(data = features_raw)
features_minmax_transform[numerical] = scaler.fit_transform(features_raw[numerical])
features_minmax_transform

# Show an example of a record with scaling applied
display(features_minmax_transform.head(n = 5))

Total number of records: 704
Individuals diagnosed with Cancer: 188
Individuals not diagnosed with Cancer: 515


Unnamed: 0,age,gender,cancer,lung_cancer,result,L1_Score,L2_Score,L3_Score,L4_Score,L5_Score,L6_Score,L7_Score,L8_Score,L9_Score,L10_Score
0,0.02459,f,no,no,0.6,1,1,1,1,0,0,1,1,0,0
1,0.019126,m,no,yes,0.5,1,1,0,1,0,0,0,1,0,1
2,0.027322,m,yes,yes,0.8,1,1,0,1,1,0,1,1,1,1
3,0.04918,f,no,yes,0.6,1,1,0,1,0,0,1,1,0,1
5,0.051913,m,yes,no,0.9,1,1,1,1,1,0,1,1,1,1


In [27]:
#One-hot encode the 'features_minmax_transform' data using pandas.get_dummies()
features_final = pd.get_dummies(features_minmax_transform)
display(features_final.head(5))


# Encode the 'all_classes_raw' data to numerical values
cancer_classes = cancer_raw.apply(lambda x: 1 if x == 'YES' else 0)



# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print "{} total features after one-hot encoding.".format(len(encoded))

# Uncomment the following line to see the encoded feature names
print encoded
from sklearn.model_selection import train_test_split

np.random.seed(1234)

X_train, X_test, y_train, y_test = train_test_split(features_final, cancer_classes, train_size=0.80, random_state=1)


# Show the results of the split
print "Training set has {} samples.".format(X_train.shape[0])
print "Testing set has {} samples.".format(X_test.shape[0])


Unnamed: 0,age,result,L1_Score,L2_Score,L3_Score,L4_Score,L5_Score,L6_Score,L7_Score,L8_Score,L9_Score,L10_Score,gender_f,gender_m,cancer_no,cancer_yes,lung_cancer_no,lung_cancer_yes
0,0.02459,0.6,1,1,1,1,0,0,1,1,0,0,1,0,1,0,1,0
1,0.019126,0.5,1,1,0,1,0,0,0,1,0,1,0,1,1,0,0,1
2,0.027322,0.8,1,1,0,1,1,0,1,1,1,1,0,1,0,1,0,1
3,0.04918,0.6,1,1,0,1,0,0,1,1,0,1,1,0,1,0,0,1
5,0.051913,0.9,1,1,1,1,1,0,1,1,1,1,0,1,0,1,1,0


18 total features after one-hot encoding.
['age', 'result', 'L1_Score', 'L2_Score', 'L3_Score', 'L4_Score', 'L5_Score', 'L6_Score', 'L7_Score', 'L8_Score', 'L9_Score', 'L10_Score', 'gender_f', 'gender_m', 'cancer_no', 'cancer_yes', 'lung_cancer_no', 'lung_cancer_yes']
Training set has 487 samples.
Testing set has 122 samples.


In [28]:
from sklearn import neighbors
from sklearn.model_selection import cross_val_score

knn = neighbors.KNeighborsClassifier(n_neighbors=10)
cv_scores = cross_val_score(knn, features_final, cancer_classes, cv=10)

cv_scores.mean()

0.9753825136612022

In [29]:

from sklearn.model_selection import cross_val_score
cross_val_score(knn, features_final, cancer_classes, cv=10, scoring='roc_auc').mean()

0.9957098343213253

In [9]:
print(features_final)
knn.fit(X_train, y_train)


          age  result  L1_Score  L2_Score  L3_Score  L4_Score  L5_Score  \
0    0.024590     0.6         1         1         1         1         0   
1    0.019126     0.5         1         1         0         1         0   
2    0.027322     0.8         1         1         0         1         1   
3    0.049180     0.6         1         1         0         1         0   
5    0.051913     0.9         1         1         1         1         1   
6    0.128415     0.5         1         1         1         1         0   
7    0.032787     0.6         1         1         0         0         1   
8    0.000000     0.8         1         1         1         1         0   
9    0.043716     1.0         1         1         1         1         1   
12   0.002732     0.6         1         1         0         1         1   
13   0.038251     0.5         1         0         0         0         0   
15   0.054645     0.7         1         0         0         1         1   
16   0.103825     0.4    

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='uniform')

In [30]:
for n in range(10, 50):
    knn = neighbors.KNeighborsClassifier(n_neighbors=n)
    cv_scores = cross_val_score(knn, features_final, cancer_classes, cv=10)
    print (n, cv_scores.mean())


(10, 0.9753825136612022)
(11, 0.9704098360655738)
(12, 0.9687704918032786)
(13, 0.9737158469945355)
(14, 0.9720491803278689)
(15, 0.975327868852459)
(16, 0.967103825136612)
(17, 0.9703825136612021)
(18, 0.9687158469945356)
(19, 0.9670765027322403)
(20, 0.9654098360655737)
(21, 0.9670765027322403)
(22, 0.9654371584699453)
(23, 0.9654371584699453)
(24, 0.9637978142076502)
(25, 0.9654371584699453)
(26, 0.9671311475409835)
(27, 0.9687704918032786)
(28, 0.9671311475409835)
(29, 0.9687704918032786)
(30, 0.9704098360655738)
(31, 0.9687704918032786)
(32, 0.9704098360655738)
(33, 0.968743169398907)
(34, 0.9687704918032786)
(35, 0.967103825136612)
(36, 0.9671311475409835)
(37, 0.9687704918032786)
(38, 0.9687704918032786)
(39, 0.9654918032786884)
(40, 0.9671311475409835)
(41, 0.9654918032786884)
(42, 0.9671311475409835)
(43, 0.9687704918032786)
(44, 0.9687704918032786)
(45, 0.9671311475409835)
(46, 0.9654918032786884)
(47, 0.9638524590163934)
(48, 0.9622131147540983)
(49, 0.9622131147540983)


In [31]:
new_data=[[1,1,1,1,0,0,1,1,0,0,26,'f','no','no',6],
[1,1,0,1,0,0,0,1,0,1,24,'m','no','yes',5],
[1,1,0,1,1,0,1,1,1,1,27,'m','yes','yes',8],
[1,1,0,1,0,0,1,1,0,1,35,'f','no','yes',8],
[1,0,0,0,0,0,0,1,0,0,40,'f','no','no',2]]
new_d=pd.DataFrame(new_data,columns=['L1_Score','L2_Score','L3_Score','L4_Score','L5_Score','L6_Score','L7_Score','L8_Score','L9_Score','L10_Score','age','gender','cancer','lung_cancer','result'])

In [32]:
new_d

Unnamed: 0,L1_Score,L2_Score,L3_Score,L4_Score,L5_Score,L6_Score,L7_Score,L8_Score,L9_Score,L10_Score,age,gender,cancer,lung_cancer,result
0,1,1,1,1,0,0,1,1,0,0,26,f,no,no,6
1,1,1,0,1,0,0,0,1,0,1,24,m,no,yes,5
2,1,1,0,1,1,0,1,1,1,1,27,m,yes,yes,8
3,1,1,0,1,0,0,1,1,0,1,35,f,no,yes,8
4,1,0,0,0,0,0,0,1,0,0,40,f,no,no,2


In [33]:
test_data

Unnamed: 0,L1_Score,L2_Score,L3_Score,L4_Score,L5_Score,L6_Score,L7_Score,L8_Score,L9_Score,L10_Score,age,gender,cancer,lung_cancer,result
0,1,1,0,1,1,0,1,1,1,1,27,m,yes,yes,8


In [34]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numerical = ['age', 'result']
features_new = pd.DataFrame(data =new_d )
features_new[numerical] = scaler.fit_transform(features_new[numerical])
features_new
display(features_new.head(n = 5))


Unnamed: 0,L1_Score,L2_Score,L3_Score,L4_Score,L5_Score,L6_Score,L7_Score,L8_Score,L9_Score,L10_Score,age,gender,cancer,lung_cancer,result
0,1,1,1,1,0,0,1,1,0,0,0.125,f,no,no,0.666667
1,1,1,0,1,0,0,0,1,0,1,0.0,m,no,yes,0.5
2,1,1,0,1,1,0,1,1,1,1,0.1875,m,yes,yes,1.0
3,1,1,0,1,0,0,1,1,0,1,0.6875,f,no,yes,1.0
4,1,0,0,0,0,0,0,1,0,0,1.0,f,no,no,0.0


In [35]:
new_final=pd.get_dummies(features_new)
new_final.iloc[0]

L1_Score           1.000000
L2_Score           1.000000
L3_Score           1.000000
L4_Score           1.000000
L5_Score           0.000000
L6_Score           0.000000
L7_Score           1.000000
L8_Score           1.000000
L9_Score           0.000000
L10_Score          0.000000
age                0.125000
result             0.666667
gender_f           1.000000
gender_m           0.000000
cancer_no          1.000000
cancer_yes         0.000000
lung_cancer_no     1.000000
lung_cancer_yes    0.000000
Name: 0, dtype: float64

In [36]:
array1=new_final.iloc[3]
array1

L1_Score           1.0000
L2_Score           1.0000
L3_Score           0.0000
L4_Score           1.0000
L5_Score           0.0000
L6_Score           0.0000
L7_Score           1.0000
L8_Score           1.0000
L9_Score           0.0000
L10_Score          1.0000
age                0.6875
result             1.0000
gender_f           1.0000
gender_m           0.0000
cancer_no          1.0000
cancer_yes         0.0000
lung_cancer_no     0.0000
lung_cancer_yes    1.0000
Name: 3, dtype: float64

In [37]:
knn.fit(X_train, y_train)
val=knn.predict([array1])
if(val==1):
    print("Lung cancer")
else:
    print("No Cancer")

Lung cancer
