In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Build a reproducible toy problem: 100 samples, 10 Gaussian features, binary labels.
# The sequence of np.random calls below determines the generated values, so the
# call order must stay exactly as written.
np.random.seed(42)
X = np.random.randn(100, 10)
y = np.random.randint(0, 2, 100)

# Draw a boolean mask (True with probability 0.1) over every cell and overwrite
# the selected cells with NaN to simulate missing measurements.
missing_mask = np.random.choice([True, False], size=X.shape, p=[0.1, 0.9])
X[missing_mask] = np.nan

# Wrap the raw arrays in labelled pandas objects for easier inspection
feature_names = [f'feature_{i}' for i in range(10)]
X = pd.DataFrame(X, columns=feature_names)
y = pd.Series(y, name='target')

print("Dataset with missing values:")
print(X.head())

# Method 1: listwise deletion - keep only the rows without any NaN, and select
# the labels that share the surviving row index.
X_dropped = X.dropna(axis=0, how='any')
y_dropped = y.loc[X_dropped.index]

print("\nDataset after removing rows with missing values:")
print(X_dropped.head())

# Method 2: mean imputation - each NaN is replaced by the mean of the observed
# values in the same column.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(X)  # result is re-labelled with X's columns below
X_imputed = pd.DataFrame(imputed_values, columns=X.columns)

print("\nDataset after imputing missing values with mean:")
print(X_imputed.head())

# Split the raw data (NaNs still present) BEFORE imputing. Fitting the imputer on
# the full dataset would let test-set values influence the column means used to
# fill the training features (data leakage) and bias the reported accuracy.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the column means from the training rows only, then apply those same
# means to both splits. Row index and column labels are restored so X_train and
# X_test remain DataFrames aligned with y_train and y_test.
split_imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    split_imputer.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    split_imputer.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

# Create and train the SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = svm_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy after handling missing values:", accuracy)


Dataset with missing values:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0   0.496714  -0.138264   0.647689        NaN  -0.234153  -0.234137   
1  -0.463418  -0.465730   0.241962  -1.913280  -1.724918  -0.562288   
2   1.465649  -0.225776   0.067528  -1.424748  -0.544383   0.110923   
3  -0.601707   1.852278  -0.013497  -1.057711   0.822545  -1.220844   
4   0.738467   0.171368  -0.115648  -0.301104  -1.478522        NaN   

   feature_6  feature_7  feature_8  feature_9  
0   1.579213   0.767435  -0.469474   0.542560  
1  -1.012831   0.314247  -0.908024  -1.412304  
2  -1.150994   0.375698  -0.600639  -0.291694  
3   0.208864  -1.959670  -1.328186   0.196861  
4  -0.460639   1.057122   0.343618  -1.763040  

Dataset after removing rows with missing values:
    feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
1   -0.463418  -0.465730   0.241962  -1.913280  -1.724918  -0.562288   
2    1.465649  -0.225776   0.067528  -1.424748  -0.544383   0.