In [42]:
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pymongo

In [43]:
# Connect to MongoDB and load data
client = MongoClient('url')
db = client['ml_project_preprocessed']  # Create or connect to a database
collection = db['sklearn_breast_cancer_preprocessed']  # Create or connect to a collection
data = collection.find() 
data_list = list(data)
df = pd.DataFrame(data_list)

In [44]:
# Drop MongoDB's default '_id' column
df = df.drop(labels=['_id'], axis=1)

# Separate features and target
X = df.drop(labels=['target'], axis=1)
Y = df[['target']]


In [45]:
# Define numerical pipeline
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, X.columns.tolist())
    ]
)

In [46]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

In [47]:
# Fit and transform the training data
X_train_transformed = preprocessor.fit_transform(X_train)

# Transform the test data
X_test_transformed = preprocessor.transform(X_test)

In [48]:
# Convert transformed data back to DataFrame
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test = pd.DataFrame(X_test_transformed, columns=feature_names)


In [49]:
# Display the first few rows
X_train.head(3)



Unnamed: 0,num_pipeline__mean radius,num_pipeline__mean texture,num_pipeline__mean smoothness,num_pipeline__mean compactness,num_pipeline__mean concavity,num_pipeline__mean symmetry,num_pipeline__mean fractal dimension,num_pipeline__radius error,num_pipeline__texture error,num_pipeline__smoothness error,num_pipeline__compactness error,num_pipeline__concavity error,num_pipeline__concave points error,num_pipeline__symmetry error,num_pipeline__fractal dimension error,num_pipeline__worst smoothness,num_pipeline__worst compactness,num_pipeline__worst concavity,num_pipeline__worst symmetry,num_pipeline__worst fractal dimension
0,-0.158032,1.262998,-0.107179,-0.380141,-0.029636,-0.716529,-0.581723,0.200904,0.152653,-0.391803,-0.610709,-0.19015,-0.329493,-0.712932,-0.591084,-0.261784,-0.388747,0.19498,-0.379281,-0.552506
1,-0.919404,-1.028057,-1.381996,-1.084917,-0.794176,-0.995625,-0.101365,-0.854119,-0.950339,-0.921428,-0.621712,-0.452777,-1.278622,-1.059075,-0.102634,-0.885775,-0.428762,-0.314498,-0.478646,0.386085
2,-0.811916,0.224687,3.367738,3.362908,1.976427,3.009768,4.84227,0.353123,-0.092904,0.671388,2.846662,0.929448,1.113915,4.969365,2.240851,3.322935,3.812789,1.924399,6.115303,4.725673


In [50]:
X_test.head(3)


Unnamed: 0,num_pipeline__mean radius,num_pipeline__mean texture,num_pipeline__mean smoothness,num_pipeline__mean compactness,num_pipeline__mean concavity,num_pipeline__mean symmetry,num_pipeline__mean fractal dimension,num_pipeline__radius error,num_pipeline__texture error,num_pipeline__smoothness error,num_pipeline__compactness error,num_pipeline__concavity error,num_pipeline__concave points error,num_pipeline__symmetry error,num_pipeline__fractal dimension error,num_pipeline__worst smoothness,num_pipeline__worst compactness,num_pipeline__worst concavity,num_pipeline__worst symmetry,num_pipeline__worst fractal dimension
0,1.17661,0.554238,-1.651991,-0.355475,0.281327,-0.109309,-1.325369,0.90049,0.180141,-0.504098,1.594596,1.096237,0.41605,0.139791,0.516813,-1.909119,-0.376242,-0.101359,-0.838642,-0.999683
1,0.480924,-0.037147,2.039651,2.468523,2.622919,2.172483,1.843878,0.449672,0.214958,0.743297,2.32603,2.523781,1.800231,4.438781,1.464786,0.96468,1.551337,1.925802,2.217249,1.101355
2,-1.220968,-0.418614,0.317517,-0.603267,-0.820165,0.343278,0.06249,-0.556688,1.481227,0.242238,-0.633295,-0.545643,-0.639325,0.089259,-0.416283,0.327779,-0.726996,-0.793127,0.016551,-0.410199


In [51]:
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
import numpy as np
def evaluate_model(true, predicted):
    conf_mat = confusion_matrix(true, predicted)
    acc_score = accuracy_score(true, predicted)
    class_report = classification_report(true, predicted)
    return conf_mat, acc_score, class_report

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


In [53]:
## Train multiple models
## Model Ecaluation
models={
    'LogisticRegression':LogisticRegression(),
    'KNeighborsClassifier':KNeighborsClassifier(),
    'SVC':SVC(),
    'GaussianNB':GaussianNB()
}
trained_model_list=[]
model_list=[]


for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    conf_mat, acc_score, class_report=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("confusion_matrix:",conf_mat)
    print("accuracy_score:",acc_score)
    print("classification_report",class_report)


    
    print('='*35)
    print('\n')



  y = column_or_1d(y, warn=True)
  return self._fit(X, y)


LogisticRegression
Model Training Performance
confusion_matrix: [[ 59   3]
 [  0 109]]
accuracy_score: 0.9824561403508771
classification_report               precision    recall  f1-score   support

           0       1.00      0.95      0.98        62
           1       0.97      1.00      0.99       109

    accuracy                           0.98       171
   macro avg       0.99      0.98      0.98       171
weighted avg       0.98      0.98      0.98       171



KNeighborsClassifier
Model Training Performance
confusion_matrix: [[ 57   5]
 [  3 106]]
accuracy_score: 0.9532163742690059
classification_report               precision    recall  f1-score   support

           0       0.95      0.92      0.93        62
           1       0.95      0.97      0.96       109

    accuracy                           0.95       171
   macro avg       0.95      0.95      0.95       171
weighted avg       0.95      0.95      0.95       171



SVC
Model Training Performance
confusion_matrix: [[ 

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [55]:
model_list

['LogisticRegression', 'KNeighborsClassifier', 'SVC', 'GaussianNB']

In [56]:
df.columns

Index(['mean radius', 'mean texture', 'mean smoothness', 'mean compactness',
       'mean concavity', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'smoothness error',
       'compactness error', 'concavity error', 'concave points error',
       'symmetry error', 'fractal dimension error', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst symmetry',
       'worst fractal dimension', 'target'],
      dtype='object')