In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Part 1: Load and Explore the Data
# Load the dataset (path is resolved relative to the current working directory)
data = pd.read_csv('DB_Voice_Features.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its summary straight to stdout and returns None, so
# it must not be wrapped in print() -- doing so emits a stray "None" line.
print("Dataset Overview:")
data.info()
print("\nFirst 5 rows of the dataset:")
print(data.head())

# Define features (X) and target (y).
# 'status' is the label being predicted; 'name' is dropped alongside it so that
# only the remaining columns are used as model inputs.
X = data.drop(columns=['name', 'status'])
y = data['status']

# Split the data into training and test sets: 30% held out for testing,
# stratified on y so both splits keep the same class proportions, with a fixed
# random_state so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Standardize the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split, so no test-set
# statistics influence the scaling. X_train / X_test are rebound to the scaled
# values from here on.
# NOTE(review): the scaling statistics come from the whole training split, so
# any cross-validation later run on X_train sees folds that were scaled using
# their own validation rows. Wrapping scaler + model in a Pipeline would avoid
# that; left as-is here to keep the downstream variable names unchanged.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Part 2: Apply Machine Learning Algorithms
# Initialize models. All use library defaults except logistic regression, whose
# iteration cap is raised to 1000.
models = {
    'k-NN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'LDA': LinearDiscriminantAnalysis()
}

# Test-set predictions are cached per model name as they are produced, so that
# Part (c) can reprint the confusion matrices without refitting the baseline
# models or re-running the grid searches.
baseline_predictions = {}
optimized_predictions = {}

# Display initial performance before optimization
print("\n--- Part (a) Initial Model Performance ---")
for name, model in models.items():
    # Train the model and predict on the held-out test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    baseline_predictions[name] = y_pred

    # Display the results
    print(f"\nModel: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

# Part 3: Apply Parameter Optimization using GridSearchCV
# Define parameter grids for each model (keys must match the keys of `models`)
param_grids = {
    'k-NN': {'n_neighbors': [3, 5, 7, 9, 11]},
    'Logistic Regression': {'C': [0.01, 0.1, 1, 10, 100]},
    'LDA': {'solver': ['svd', 'lsqr', 'eigen']}
}

# Display performance after parameter optimization
print("\n--- Part (b) Performance After Parameter Optimization ---")
for name, model in models.items():
    # 5-fold cross-validated search over this model's grid, scored by accuracy,
    # using all available cores.
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the best model from grid search and evaluate it on the test set
    best_model = grid_search.best_estimator_
    y_pred_optimized = best_model.predict(X_test)
    optimized_predictions[name] = y_pred_optimized

    # Display the results
    print(f"\nOptimized Model: {name}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy after Optimization: {accuracy_score(y_test, y_pred_optimized):.4f}")
    print("Confusion Matrix after Optimization:")
    print(confusion_matrix(y_test, y_pred_optimized))
    print("Classification Report after Optimization:")
    print(classification_report(y_test, y_pred_optimized))

# Part 4: Analyze the Confusion Matrix
# Comparing confusion matrices before and after optimization, side by side per
# model. The predictions cached in Parts (a) and (b) are reused here instead of
# training every model and running every grid search a second time.
print("\n--- Part (c) Confusion Matrix Analysis ---")
for name in models:
    # Before optimization
    y_pred = baseline_predictions[name]
    print(f"\nConfusion Matrix for {name} Before Optimization:")
    print(confusion_matrix(y_test, y_pred))

    # After optimization
    y_pred_optimized = optimized_predictions[name]
    print(f"Confusion Matrix for {name} After Optimization:")
    print(confusion_matrix(y_test, y_pred_optimized))


Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-