# Import Required Libraries
Import the necessary libraries such as pandas, numpy, matplotlib, seaborn, and scikit-learn.

In [None]:
# Import the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load Dataset
Load the dataset from 'assignment-2/yeast.csv' using pandas.

In [None]:
# Read the yeast data from its CSV file into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer='assignment-2/yeast.csv')

# Preview the top five rows (five is the default for head)
df.head(n=5)

# Data Preprocessing
Handle missing values, encode categorical variables, and normalize/standardize the data.

In [None]:
# Data Preprocessing
# Drops incomplete rows, integer-encodes the 'class' target and standardizes
# every other column. Leaves `df`, `scaler`, `X`, `y`, `X_scaled` and
# `class_names` in scope for the following cells.

# Handle missing values
df = df.dropna()  # Drop rows with missing values

# Encode categorical variables
# 'class' is the categorical target. Keep the code -> label mapping so that
# encoded predictions can still be translated back to the original labels.
class_categorical = df['class'].astype('category')
class_names = list(class_categorical.cat.categories)  # class_names[code] -> original label
df['class'] = class_categorical.cat.codes

# Pick the features by name instead of by position: the target is already
# addressed as 'class' above, so the split no longer depends on it being the
# last column of the CSV.
# NOTE(review): assumes every column other than 'class' is numeric --
# StandardScaler cannot scale string columns; confirm against the CSV.
feature_columns = [col for col in df.columns if col != 'class']

# Normalize/Standardize the data
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaled training features.
scaler = StandardScaler()
X = df[feature_columns].values
y = df['class'].values
X_scaled = scaler.fit_transform(X)

# Update the DataFrame with scaled features.
# Assigning by column name replaces each column with the scaled float values;
# positional `.iloc` assignment writes into the existing columns instead.
df[feature_columns] = X_scaled

# Display the first few rows of the preprocessed dataset
df.head()

# Exploratory Data Analysis
Perform EDA to understand the data distribution, relationships, and patterns using visualizations.

In [None]:
# Exploratory Data Analysis
# Summary statistics plus distribution/relationship plots for the
# preprocessed DataFrame `df`.

# Display basic statistics of the dataset.
# A bare expression in the middle of a notebook cell is not rendered (only
# the cell's last expression is), so the summary is printed explicitly.
print(df.describe())

# Plot histograms for each feature
df.hist(bins=15, figsize=(15, 10))
plt.suptitle('Histograms of Features')
plt.show()

# Plot a heatmap of the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix Heatmap')
plt.show()

# Pairplot to visualize relationships between features
sns.pairplot(df)
plt.suptitle('Pairplot of Features', y=1.02)
plt.show()

# Boxplot to visualize the distribution of features
plt.figure(figsize=(15, 10))
sns.boxplot(data=df)
plt.title('Boxplot of Features')
plt.xticks(rotation=90)
plt.show()

# Countplot to visualize the distribution of the target variable.
# Read the target straight from the DataFrame: the loose variable `y` is
# rebound by the model-training cell, so it is not a reliable source here.
plt.figure(figsize=(8, 6))
sns.countplot(x=df['class'])
plt.title('Distribution of Target Variable')
plt.show()

# Feature Engineering
Create new features or modify existing ones to improve model performance.

In [None]:
# Feature Engineering
# Adds interaction, polynomial and logarithmic features derived from the
# first four columns of `df`, keeping the 'class' target as the last column.

# Example: Create interaction terms between features
df['interaction_1'] = df.iloc[:, 0] * df.iloc[:, 1]
df['interaction_2'] = df.iloc[:, 2] * df.iloc[:, 3]

# Example: Create polynomial features
df['poly_1'] = df.iloc[:, 0] ** 2
df['poly_2'] = df.iloc[:, 1] ** 2

# Example: Create logarithmic features.
# The features were standardized earlier in the notebook, so they contain
# values <= -1 for which log(x + 1) is NaN or -inf. Shift each column so its
# minimum is 0 and use log1p, which is defined for every non-negative input.
df['log_1'] = np.log1p(df.iloc[:, 0] - df.iloc[:, 0].min())
df['log_2'] = np.log1p(df.iloc[:, 1] - df.iloc[:, 1].min())

# New columns are appended at the end, which would push 'class' out of the
# last position that the model-training cell reads as the target
# (`df.iloc[:, -1]`). Pop and re-insert it so it stays the last column.
df['class'] = df.pop('class')

# Display the first few rows of the dataset with new features
df.head()

# Model Training
Train a machine learning model using scikit-learn. Split the data into training and testing sets.

In [None]:
# Model Training
# Splits the data 80/20, fits a logistic-regression classifier and prints a
# first evaluation. Leaves `X_train`, `X_test`, `y_train`, `y_test`, `model`
# and `y_pred` in scope for the evaluation cell.

# Split the data into training and testing sets.
# Select the target by name: the feature-engineering cell appends columns to
# `df`, so the last column is not guaranteed to be 'class', and a positional
# split would both train on the wrong target and leak 'class' into X.
X = df.drop(columns='class').values  # Features
y = df['class'].values               # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Model Evaluation
Evaluate the model's performance using appropriate metrics and visualize the results.

In [None]:
# Model Evaluation
# Reports accuracy, weighted precision/recall/F1 and one-vs-rest ROC AUC for
# the fitted `model` on the held-out split, then plots per-class ROC curves
# and the confusion matrix.

# Import necessary libraries for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

# Column j of predict_proba corresponds to model.classes_[j]. Use that label
# set everywhere instead of assuming the labels are 0..n-1 and all present in
# the test split, and compute the probabilities only once.
class_labels = model.classes_
y_proba = model.predict_proba(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate precision
# zero_division=0 keeps the default score for classes that are never
# predicted, but makes that choice explicit instead of emitting a warning.
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
print(f'Precision: {precision:.2f}')

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
print(f'Recall: {recall:.2f}')

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print(f'F1 Score: {f1:.2f}')

# Calculate ROC AUC score
# `labels` tells the scorer which class each probability column belongs to.
roc_auc = roc_auc_score(y_test, y_proba, multi_class='ovr', labels=class_labels)
print(f'ROC AUC Score: {roc_auc:.2f}')

# Plot ROC curve
fpr = {}
tpr = {}
thresh = {}

n_class = len(class_labels)

# One-vs-rest curve per class: pair each probability column with its label.
for column, label in enumerate(class_labels):
    fpr[label], tpr[label], thresh[label] = roc_curve(y_test, y_proba[:, column], pos_label=label)

plt.figure(figsize=(10, 8))
for label in class_labels:
    plt.plot(fpr[label], tpr[label], linestyle='--', label=f'Class {label} vs Rest')
plt.plot([0, 1], [0, 1], color='black', linestyle='--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC Curve')
plt.legend(loc='best')
plt.show()

# Visualize the confusion matrix
# Fix the row/column order to the trained classes and show them as tick
# labels so each cell can be read as (actual class, predicted class).
plt.figure(figsize=(10, 8))
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()