In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn import datasets  # In case we need additional preprocessing or comparison

# Define the path to your CSV file
csv_path = r"D:\python\ML\6\cell_samples.csv"  # Replace 'your_file.csv' with the actual filename

# Load the dataset from CSV
df = pd.read_csv(csv_path)

# Display the first 5 rows
df.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Drop rows with missing values (if any)
df = df.dropna()

# Convert categorical data to numerical (if needed)
# Uncomment if your dataset has categorical columns
# df = pd.get_dummies(df)

# Separate features (X) and target (y)
X = df.iloc[:, :-1]  # All columns except the last one (features)
y = df.iloc[:, -1]   # Last column (target)

# Display dataset shape
print(f"Dataset shape: {df.shape}")
print(f"Feature shape: {X.shape}, Target shape: {y.shape}")

Missing values per column:
 ID             0
Clump          0
UnifSize       0
UnifShape      0
MargAdh        0
SingEpiSize    0
BareNuc        0
BlandChrom     0
NormNucl       0
Mit            0
Class          0
dtype: int64
Dataset shape: (699, 11)
Feature shape: (699, 10), Target shape: (699,)


In [5]:
# Convert '?' values to NaN for processing
df.replace('?', np.nan, inplace=True)

# Convert all columns to numeric (forcing non-numeric to NaN)
df = df.apply(pd.to_numeric, errors='coerce')

# Fill missing values with the median of each column
df.fillna(df.median(), inplace=True)

# Separate features and target
X = df.iloc[:, :-1]  # All columns except the last one (features)
y = df.iloc[:, -1]   # Last column (target)

# Standardize features using StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert back to DataFrame for readability
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Display the first 5 rows of the cleaned and standardized dataset
X_scaled_df.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit
0,-0.116239,0.206936,-0.699995,-0.743299,-0.633247,-0.549561,-0.686979,-0.179662,-0.611825,-0.343912
1,-0.111504,0.206936,0.283845,0.266875,0.768621,1.710106,1.799664,-0.179662,-0.284112,-0.343912
2,-0.091265,-0.503866,-0.699995,-0.743299,-0.633247,-0.549561,-0.410685,-0.179662,-0.611825,-0.343912
3,-0.089884,0.562336,1.595632,1.613773,-0.633247,-0.097628,0.141902,-0.179662,1.354454,-0.343912
4,-0.088674,-0.148465,-0.699995,-0.743299,0.067687,-0.549561,-0.686979,-0.179662,-0.611825,-0.343912


In [6]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split dataset: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Print dataset shapes
print(f"Training set: X_train shape = {X_train.shape}, y_train shape = {y_train.shape}")
print(f"Testing set: X_test shape = {X_test.shape}, y_test shape = {y_test.shape}")

Training set: X_train shape = (559, 10), y_train shape = (559,)
Testing set: X_test shape = (140, 10), y_test shape = (140,)


In [7]:
# Import SVM classifier
from sklearn.svm import SVC

# Initialize the SVM model with RBF kernel
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

# Train the model
svm_classifier.fit(X_train, y_train)

# Print training completion message
print("SVM model trained successfully!")

SVM model trained successfully!


In [8]:
# Import evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Make predictions on the test set
y_pred = svm_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Generate classification report
class_report = classification_report(y_test, y_pred)

# Print evaluation results
print(f"Model Accuracy: {accuracy:.2f}")
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Model Accuracy: 0.97

Confusion Matrix:
 [[94  1]
 [ 3 42]]

Classification Report:
               precision    recall  f1-score   support

           2       0.97      0.99      0.98        95
           4       0.98      0.93      0.95        45

    accuracy                           0.97       140
   macro avg       0.97      0.96      0.97       140
weighted avg       0.97      0.97      0.97       140



In [9]:
# Import joblib for saving the model
import joblib

# Save the trained model
joblib.dump(svm_classifier, "svm_model.pkl")

# Save the scaler (so new data can be standardized in the same way)
joblib.dump(scaler, "scaler.pkl")

print("Model and scaler saved successfully!")

Model and scaler saved successfully!
