In [1]:
import os
from datetime import datetime
import time
import threading
import json
from kafka import KafkaProducer
from kafka.errors import KafkaError
# SKLearn libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_io as tfio

In [27]:
# Column names for the CSV, in file order: sample id, nine cytology features, then the label.
columns = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]

In [28]:
# Read only the first 70 rows, supplying the column names explicitly via `names`.
# NOTE(review): "Bare Nuclei" comes back as object dtype in the info() summary —
# presumably it holds non-numeric placeholders; confirm before using it as a numeric feature.
cancer_df = pd.read_csv(
    'data/breast-cancer-wisconsin.data.csv',
    names=columns,
    nrows=70,
)
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample code number           70 non-null     int64 
 1   Clump Thickness              70 non-null     int64 
 2   Uniformity of Cell Size      70 non-null     int64 
 3   Uniformity of Cell Shape     70 non-null     int64 
 4   Marginal Adhesion            70 non-null     int64 
 5   Single Epithelial Cell Size  70 non-null     int64 
 6   Bare Nuclei                  70 non-null     object
 7   Bland Chromatin              70 non-null     int64 
 8   Normal Nucleoli              70 non-null     int64 
 9   Mitoses                      70 non-null     int64 
 10  Class                        70 non-null     int64 
dtypes: int64(10), object(1)
memory usage: 6.1+ KB


In [29]:
# Peek at the first five rows of the loaded frame.
cancer_df.head(n=5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [30]:
# Count how many samples fall into each value of the "Class" label.
cancer_df.loc[:, "Class"].value_counts()

Class
2    35
4    35
Name: count, dtype: int64

In [31]:
# Re-encode the label in one pass: 2 -> 0 and 4 -> 1; any other value is left unchanged.
cancer_df['Class'] = cancer_df['Class'].replace({2: 0, 4: 1})

In [32]:
# Hold out 40% of the rows for testing. The rows are shuffled, so pin random_state:
# without it every Restart & Run All produces a different split (and different
# Kafka messages downstream).
train_df, test_df = train_test_split(
    cancer_df,
    test_size=0.4,
    shuffle=True,
    random_state=42,
)

print("Number of training samples: ",len(train_df))
print("Number of testing sample: ",len(test_df))

# Separate the feature columns from the "Class" label for each split.
x_train_df = train_df.drop(["Class"], axis=1)
y_train_df = train_df["Class"]

x_test_df = test_df.drop(["Class"], axis=1)
y_test_df = test_df["Class"]

Number of training samples:  42
Number of testing sample:  28


In [33]:
def _to_csv_rows(frame):
    """Return one comma-separated string per row of a DataFrame/Series (no header, no index).

    splitlines() is used instead of split("\n") so that rows carry no trailing
    "\r" when to_csv emits "\r\n" line terminators; empty lines are dropped.
    """
    csv_text = frame.to_csv(index=False, header=False)
    return [row for row in csv_text.splitlines() if row]


# Features become the Kafka message payloads; labels become the message keys.
x_train = _to_csv_rows(x_train_df)
y_train = _to_csv_rows(y_train_df)

x_test = _to_csv_rows(x_test_df)
y_test = _to_csv_rows(y_test_df)

In [34]:
# Inspect the serialized training rows: one comma-joined string per sample, "Class" column excluded.
x_train

['1033078,4,2,1,1,2,1,2,1,1',
 '1043999,1,1,1,1,2,3,3,1,1',
 '1118039,5,3,4,1,8,10,4,9,1',
 '1115282,5,3,5,5,3,3,4,10,1',
 '1049815,4,1,1,1,2,1,3,1,1',
 '1110503,5,5,5,8,10,8,7,3,7',
 '1120559,8,3,8,3,4,9,8,9,8',
 '1071760,2,1,1,1,2,1,3,1,1',
 '1113038,8,2,4,1,5,1,5,4,4',
 '1105524,1,1,1,1,2,1,2,1,1',
 '1091262,2,5,3,3,6,7,7,5,1',
 '1074610,2,1,1,2,2,1,3,1,1',
 '1017122,8,10,10,8,7,10,9,7,1',
 '1017023,4,1,1,3,2,1,3,1,1',
 '1041801,5,3,3,3,2,3,4,4,1',
 '1033078,2,1,1,1,2,1,1,1,5',
 '1108370,9,5,8,1,2,3,2,1,5',
 '1112209,8,10,10,1,3,6,3,9,1',
 '1081791,6,2,1,1,1,1,7,1,1',
 '1072179,10,7,7,3,8,5,7,4,3',
 '1002945,5,4,4,5,7,10,3,2,1',
 '1018561,2,1,2,1,2,1,3,1,1',
 '1099510,10,4,3,1,3,3,6,5,2',
 '1016277,6,8,8,1,3,4,3,7,1',
 '1079304,2,1,1,1,2,1,2,1,1',
 '1111249,10,6,6,3,4,5,3,6,1',
 '1116998,10,4,2,1,3,2,4,3,10',
 '1050670,10,7,7,6,4,10,4,1,2',
 '1035283,1,1,1,1,1,1,3,1,1',
 '1018099,1,1,1,1,2,10,3,1,1',
 '1116192,1,1,1,1,2,1,2,1,1',
 '1102573,5,6,5,6,10,1,3,1,1',
 '1015425,3,1,1,1,2,2,

In [38]:
def error_callback(exc):
    """Errback for Kafka send futures: re-raise the failure with a descriptive message.

    Args:
        exc: The error reported for the failed send; its str() is embedded in the message.

    Raises:
        Exception: Always, carrying the stringified original error.
    """
    reason = str(exc)
    raise Exception('Error while sending data to kafka: {0}'.format(reason))


def write_to_kafka(topic_name, items, bootstrap_servers=None, delay_seconds=2):
    """Publish (message, key) string pairs to a Kafka topic, one record at a time.

    Args:
        topic_name: Name of the Kafka topic to publish to.
        items: Iterable of (message, key) pairs of str; both are UTF-8 encoded
            before sending.
        bootstrap_servers: Broker address list passed to KafkaProducer.
            Defaults to ['127.0.0.1:9092'].
        delay_seconds: Pause before each send, in seconds (default 2).

    Send failures are routed to error_callback via the send future's errback.
    """
    if bootstrap_servers is None:
        bootstrap_servers = ['127.0.0.1:9092']

    count = 0
    producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
    try:
        for message, key in items:
            time.sleep(delay_seconds)
            payload = message.encode('utf-8')
            print(payload)
            print(key)
            producer.send(topic_name,
                          key=key.encode('utf-8'),
                          value=payload).add_errback(error_callback)
            count += 1
        # Block until every buffered record has been handed to the broker.
        producer.flush()
    finally:
        # Always release the producer, even if encoding or sending fails part-way;
        # otherwise each call leaves an open producer behind.
        producer.close()
    print("Wrote {0} messages into topic: {1}".format(count, topic_name))

# Publish each split to its own topic, pairing every feature row with its label as the record key.
for topic, rows, labels in (
    ("cancer-train", x_train, y_train),
    ("cancer-test", x_test, y_test),
):
    write_to_kafka(topic, zip(rows, labels))

b'1033078,4,2,1,1,2,1,2,1,1'
0
b'1043999,1,1,1,1,2,3,3,1,1'
0
b'1118039,5,3,4,1,8,10,4,9,1'
1
b'1115282,5,3,5,5,3,3,4,10,1'
1
b'1049815,4,1,1,1,2,1,3,1,1'
0
b'1110503,5,5,5,8,10,8,7,3,7'
1
b'1120559,8,3,8,3,4,9,8,9,8'
1
b'1071760,2,1,1,1,2,1,3,1,1'
0
b'1113038,8,2,4,1,5,1,5,4,4'
1
b'1105524,1,1,1,1,2,1,2,1,1'
0
b'1091262,2,5,3,3,6,7,7,5,1'
1
b'1074610,2,1,1,2,2,1,3,1,1'
0
b'1017122,8,10,10,8,7,10,9,7,1'
1
b'1017023,4,1,1,3,2,1,3,1,1'
0
b'1041801,5,3,3,3,2,3,4,4,1'
1
b'1033078,2,1,1,1,2,1,1,1,5'
0
b'1108370,9,5,8,1,2,3,2,1,5'
1
b'1112209,8,10,10,1,3,6,3,9,1'
1
b'1081791,6,2,1,1,1,1,7,1,1'
0
b'1072179,10,7,7,3,8,5,7,4,3'
1
b'1002945,5,4,4,5,7,10,3,2,1'
0
b'1018561,2,1,2,1,2,1,3,1,1'
0
b'1099510,10,4,3,1,3,3,6,5,2'
1
b'1016277,6,8,8,1,3,4,3,7,1'
0
b'1079304,2,1,1,1,2,1,2,1,1'
0
b'1111249,10,6,6,3,4,5,3,6,1'
1
b'1116998,10,4,2,1,3,2,4,3,10'
1
b'1050670,10,7,7,6,4,10,4,1,2'
1
b'1035283,1,1,1,1,1,1,3,1,1'
0
b'1018099,1,1,1,1,2,10,3,1,1'
0
b'1116192,1,1,1,1,2,1,2,1,1'
0
b'1102573,5,6,5,6,10,1