Title: Introduction to Scikit-Learn & Machine Learning Models

Task 1: Installing and Setting Up Scikit-Learn

In [84]:
# Install scikit-learn for the running kernel. The %pip magic (rather than !pip)
# targets the kernel's own environment; restart the kernel afterwards if pip says so.
%pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()

# Show which fields the loaded object provides
print(iris.keys())

# Print each part of the dataset under its own heading:
# features (X), labels (y), feature names, and class (target) names
sections = (
    ("Feature data:\n", iris.data),
    ("Target labels:\n", iris.target),
    ("Feature names:\n", iris.feature_names),
    ("Target names:\n", iris.target_names),
)
for heading, values in sections:
    print(heading, values)


In [None]:
# Write your code from here
from sklearn.datasets import load_iris

# Load the Iris dataset (done once; the cell previously repeated this whole
# sequence twice, which printed every output below two times)
iris = load_iris()

# Display keys to understand the structure
print(iris.keys())

# Features (X)
print("Feature data:\n", iris.data)

# Labels (y)
print("Target labels:\n", iris.target)

# Feature names
print("Feature names:\n", iris.feature_names)

# Target names (i.e., class labels)
print("Target names:\n", iris.target_names)


In [None]:
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()

# Report the class of the object returned by load_iris
# (for example: <class 'sklearn.utils._bunch.Bunch'>)
container_type = type(iris)
print(container_type)

# Report the field names that the object exposes
field_names = iris.keys()
print(field_names)


Title: Building a Simple ML Model in Scikit-Learn

Task 1: Simple Linear Regression
Implement linear regression with a small dataset

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Small hand-made dataset: one feature (hours studied) and one target (exam score).
# X must be 2-D, i.e. one row per sample and one column per feature.
X = [[1], [2], [3], [4], [5], [6], [7], [8]]
y = [52, 55, 61, 64, 70, 73, 79, 82]

# Create and fit the linear regression model (ordinary least squares)
model = LinearRegression()
model.fit(X, y)

# Learned line: y = slope * x + intercept
print("Slope (coefficient):", model.coef_[0])
print("Intercept:", model.intercept_)

# Goodness of fit on the training points themselves; with this few samples
# there is no separate test split, so these numbers describe fit, not generalization
y_fit = model.predict(X)
print("Mean squared error:", mean_squared_error(y, y_fit))
print("R^2 score:", r2_score(y, y_fit))

# Use the fitted line to predict an unseen input
new_hours = [[9]]
print("Predicted score for 9 hours:", model.predict(new_hours)[0])


Task 2: Decision Tree Classifier
Build a decision tree model with the Iris dataset:

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report

# Iris data: feature matrix X and class labels y
iris = load_iris()
X, y = iris.data, iris.target

# Tabular copy of the data for inspection only; the model below does not use it
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Build the classifier and fit it on the training portion (fit returns the estimator)
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out samples
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)

# Draw the fitted tree, labelling nodes with feature and class names
plt.figure(figsize=(12, 8))
plot_tree(
    model,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
)
plt.title("Decision Tree Trained on Iris Dataset")
plt.show()


Task 3: K-Nearest Neighbors Classifier
Use the KNN algorithm on the digits dataset:

In [None]:
# Import required libraries
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Digits dataset: `data` holds the flattened images, `target` the digit labels
digits = load_digits()
X = digits.data
y = digits.target

# Optional: preview the first ten digit images in a 2 x 5 grid
fig, axes = plt.subplots(2, 5, figsize=(10, 4))
for ax, image, label in zip(axes.ravel(), digits.images, digits.target):
    ax.imshow(image, cmap='gray')
    ax.set_title(f"Label: {label}")
    ax.axis('off')
fig.suptitle("Sample Digits from Dataset")
plt.show()

# Split dataset into train and test sets (30% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create and train the KNN classifier.
# k=3 is chosen explicitly here; scikit-learn's own default for n_neighbors is 5.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict on test data
y_pred = knn.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Title: Training a Classification Model

Task 1: Logistic Regression
Train a logistic regression model

In [None]:
# Import libraries
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Breast Cancer data: feature matrix X and class labels y
data = load_breast_cancer()
X, y = data.data, data.target

# Tabular copy of the data for inspection only; the model below does not use it
df = pd.DataFrame(X, columns=data.feature_names).assign(target=y)

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit logistic regression with the 'liblinear' solver (picked here because the
# dataset is small) and a generous iteration cap
model = LogisticRegression(max_iter=10000, solver='liblinear').fit(X_train, y_train)

# Class predictions for the held-out samples
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=data.target_names)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)


Task 2: Support Vector Machine
Train a Support Vector Classifier on the Iris dataset

In [None]:
# Import required libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Iris data: feature matrix X and class labels y
iris = load_iris()
X, y = iris.data, iris.target

# Tabular copy of the data for inspection only; the model below does not use it
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a support vector classifier with a linear kernel
# (other kernels such as 'rbf' or 'poly' can be substituted to experiment)
model = SVC(kernel='linear').fit(X_train, y_train)

# Class predictions for the held-out samples
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)


Task 3: Naive Bayes Classifier
Train a Gaussian Naive Bayes model

In [None]:
# Import required libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Iris data: feature matrix X and class labels y
iris = load_iris()
X, y = iris.data, iris.target

# Tabular copy of the data for inspection only; the model below does not use it
df = pd.DataFrame(X, columns=iris.feature_names).assign(target=y)

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit Gaussian Naive Bayes on the training portion (fit returns the estimator)
model = GaussianNB().fit(X_train, y_train)

# Class predictions for the held-out samples
y_pred = model.predict(X_test)

# Score the predictions against the true test labels
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)


Title: Understanding Model Performance & Hyperparameter Tuning

Task 1: Using Confusion Matrix
Evaluate a model with a confusion matrix:

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Iris data: feature matrix X and class labels y
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a linear-kernel SVM on the training portion (fit returns the estimator)
model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Class predictions for the held-out samples
y_pred = model.predict(X_test)

# Tabulate true labels against predicted labels
cm = confusion_matrix(y_test, y_pred)

# Render the matrix with the class names as tick labels
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=iris.target_names,
)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix for SVM Classifier on Iris Dataset")
plt.show()


Task 2: Cross-validation Score
Perform cross-validation with k-fold:

In [None]:
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Iris data: feature matrix X and class labels y
iris = load_iris()
X, y = iris.data, iris.target

# Unfitted linear-kernel SVM; cross_val_score fits copies of it on each fold
model = SVC(kernel='linear', random_state=42)

# Evaluate the model with 5 folds over the full dataset, one score per fold
scores = cross_val_score(model, X, y, cv=5)

# Report the per-fold scores and their average
mean_accuracy = scores.mean()
print("Cross-validation scores for each fold:", scores)
print("Mean cross-validation accuracy:", mean_accuracy)


Task 3: Hyperparameter Tuning using Grid Search
Optimize hyperparameters using GridSearchCV

In [None]:
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

# Load dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split data so the tuned model can be evaluated on samples the search never saw
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create the model
model = SVC()

# Define the hyperparameter grid as one sub-grid per kernel family.
# gamma has no effect on the linear kernel, so crossing it with 'linear'
# (as a single flat grid would) only adds duplicate candidates and a
# meaningless gamma entry in best_params_ when the linear kernel wins.
C_VALUES = [0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': C_VALUES},
    {'kernel': ['rbf', 'poly'], 'C': C_VALUES, 'gamma': ['scale', 'auto']},
]

# Setup GridSearchCV: 5-fold CV per candidate, all CPU cores,
# verbose=1 prints a one-line summary instead of a line per fit
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=1)

# Fit GridSearch to training data only
grid_search.fit(X_train, y_train)

# Best parameters found
print("Best hyperparameters:", grid_search.best_params_)

# Evaluate best estimator (refit on the full training set by GridSearchCV) on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Classification report for best model:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
