In [2]:
pip install -U scikit-learn

Collecting scikit-learnNote: you may need to restart the kernel to use updated packages.

  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.1 MB 1.1 MB/s eta 0:00:10

In [5]:
# Code for Testing
import sklearn
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=0)
X = [[1, 2, 3], [11, 12, 13]]
y = [0, 1]
clf.fit(X, y)
print(clf.predict([[4, 5, 6], [14, 15, 16]]))

[0 1]


# Load Dataset from iris

In [7]:
from sklearn.datasets import load_iris #buil-in Dataset of Scikit learn
idata=load_iris()
X = idata.data #Features(sepal length,sepal width,petal length,petal width)
y = idata.target # Labels (species of iris)

# Preprocess the Data

In [8]:
from sklearn.preprocessing import StandardScaler
# Standardize the features(mean=0, variance=1)
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)

In [22]:
StandardScaler()

# Split The Data

In [13]:
from sklearn.model_selection import train_test_split
# split data into 70% for train and 30% for testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Choose The Model

In [14]:
'''Classification: KNeighborsClassifier, LogisticRegression, RandomForestClassifier.

Regression: LinearRegression, Ridge, Lasso.

Clustering: KMeans, DBSCAN.'''

'Classification: KNeighborsClassifier, LogisticRegression, RandomForestClassifier.\n\nRegression: LinearRegression, Ridge, Lasso.\n\nClustering: KMeans, DBSCAN.'

In [15]:
from sklearn.neighbors import KNeighborsClassifier

# Create a KNN classifier with 3 neighbors

knn = KNeighborsClassifier(n_neighbors=3)

In [26]:
knn # there is three neighbors

# Train The Model

In [27]:
knn.fit(X_train, y_train) #fit the model for trianing

# Evaluate the Model

In [29]:
# evaluate the model’s performance - Accuracy And Mean Squared Error (MSE)

from sklearn.metrics import accuracy_score
# Make Prediction on the test set

y_prediction=knn.predict(X_test)

# And Calculate the Accuracy
accuracy=accuracy_score(y_test, y_prediction)
print(f'Accuracy: {accuracy:.2f}')

# 1.0 = 100% Accuracy and 0.63 = 63% Accuracy

Accuracy: 1.00


# Tune the Model

In [30]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameters to tune 
param_grid = {'n_neighbors':[3,5,7,9]}

# Perform Grid Search
grid_search = GridSearchCV(knn,param_grid,cv=5)
grid_search.fit(X_train, y_train)

# Best Parameters
print("Best Parameters: ",grid_search.best_params_)

Best Parameters:  {'n_neighbors': 3}


# Advance Concepts

## PipeLines

In [32]:
# Combinne preprocessing and modeling into a single pipeline for cleaner code.

from sklearn.pipeline import Pipeline

#Create a pipeline

pipeline = Pipeline([('scaler',StandardScaler()),
                    ('knn',KNeighborsClassifier())])

# Train the Pipeline

pipeline.fit(X_train, y_train)

## Cross-Validation

In [38]:
# Evaluate the model’s performance more robustly using cross-validation.

from sklearn.model_selection import cross_val_score

# perform 5- fold cross-validation

scores = cross_val_score(knn, X, y, cv=5)
print("Cross-Validaton Score : ",scores)

Cross-Validaton Score :  [0.96666667 0.96666667 0.93333333 0.96666667 1.        ]


## Dimensionality Reduction

In [40]:
# Reduce the number of features using techniques like PCA (Principal Component Analysis).

from sklearn.decomposition import PCA

#reduce to 2 components

pca = PCA(n_components=2)
X_reduced=pca.fit_transform(X)

In [41]:
X_reduced

array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898],
       [-2.72871654,  0.32675451],
       [-2.28085963,  0.74133045],
       [-2.82053775, -0.08946138],
       [-2.62614497,  0.16338496],
       [-2.88638273, -0.57831175],
       [-2.6727558 , -0.11377425],
       [-2.50694709,  0.6450689 ],
       [-2.61275523,  0.01472994],
       [-2.78610927, -0.235112  ],
       [-3.22380374, -0.51139459],
       [-2.64475039,  1.17876464],
       [-2.38603903,  1.33806233],
       [-2.62352788,  0.81067951],
       [-2.64829671,  0.31184914],
       [-2.19982032,  0.87283904],
       [-2.5879864 ,  0.51356031],
       [-2.31025622,  0.39134594],
       [-2.54370523,  0.43299606],
       [-3.21593942,  0.13346807],
       [-2.30273318,  0.09870885],
       [-2.35575405, -0.03728186],
       [-2.50666891, -0.14601688],
       [-2.46882007,  0.13095149],
       [-2.56231991,  0.36771886],
       [-2.63953472,

## Ensemble Methods