# **Basics of Machine Learning**

---


## Importing the Dataset

In [4]:
import pandas as pd
import numpy as np

import os

# Where the raw iris data file lives. The default is the Colab upload location;
# set the IRIS_DATA_PATH environment variable to run the notebook elsewhere.
DATA_PATH = os.environ.get('IRIS_DATA_PATH', '/content/iris.data')

# The file is read with header=None, so the column names are supplied explicitly.
columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
iris = pd.read_csv(DATA_PATH, header = None, names = columns)

iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
# Preview the first rows (head() shows five by default) to sanity-check the parse.
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
# Column dtypes, non-null counts and memory usage of the frame.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [8]:
# Total number of cells in the frame (rows x columns).
iris.size

750

In [9]:
# (number of rows, number of columns).
iris.shape

(150, 5)

## Preprocessing the Data

In [10]:
# Count missing values per column.
iris.isnull().sum()

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


In [11]:
# Same missing-value count via isna(); pandas documents isnull() as an alias of isna(),
# so this repeats the previous check.
iris.isna().sum()

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


In [12]:
# Complementary view: number of present (non-missing) values per column.
iris.notnull().sum()

Unnamed: 0,0
sepal_length,150
sepal_width,150
petal_length,150
petal_width,150
species,150


In [13]:
# Number of rows that exactly repeat an earlier row (first occurrences are not counted).
iris.duplicated().sum()

np.int64(3)

In [14]:
# Remove exact duplicate rows from `iris` itself, keeping the first occurrence of each.
# The surviving rows keep their original index labels (the index is not renumbered).
iris.drop_duplicates(keep="first", inplace=True)
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


## Encoding the Target Feature

In [15]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to turn the species names into integer class codes.
label_enc = LabelEncoder()

In [16]:
# Learn the species -> integer mapping and apply it in one step, then overwrite
# the text column with the resulting codes.
species_codes = label_enc.fit_transform(iris['species'])
iris['species'] = species_codes
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


## Identifying Variables and Splitting Data

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# The four measurements are the model inputs; the encoded species is the target.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
X = iris[feature_columns]
y = iris["species"]

In [19]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Feature rows assigned to the training split.
X_train

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
137,6.4,3.1,5.5,1.8
47,4.6,3.2,1.4,0.2
121,5.6,2.8,4.9,2.0
27,5.2,3.5,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
73,6.1,2.8,4.7,1.2
108,6.7,2.5,5.8,1.8
14,5.8,4.0,1.2,0.2
94,5.6,2.7,4.2,1.3


In [21]:
# Feature rows held out for testing.
X_test

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
127,6.1,3.0,4.9,1.8
53,5.5,2.3,4.0,1.3
140,6.7,3.1,5.6,2.4
19,5.1,3.8,1.5,0.3
106,4.9,2.5,4.5,1.7
12,4.8,3.0,1.4,0.1
78,6.0,2.9,4.5,1.5
31,5.4,3.4,1.5,0.4
83,6.0,2.7,5.1,1.6
9,4.9,3.1,1.5,0.1


In [22]:
# Encoded species labels for the training rows.
y_train

Unnamed: 0,species
137,2
47,0
121,2
27,0
4,0
...,...
73,1
108,2
14,0
94,1


In [23]:
# Encoded species labels for the held-out test rows.
y_test

Unnamed: 0,species
127,2
53,1
140,2
19,0
106,2
12,0
78,1
31,0
83,1
9,0


## Normalizing the Data

In [24]:
from sklearn.preprocessing import MinMaxScaler

In [25]:
# Rescales every feature to the [0, 1] interval (the scaler's default range).
scaler = MinMaxScaler(feature_range=(0, 1))

In [26]:
# Learn the per-feature min/max from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
import pandas as pd

In [28]:
# Put the scaled values back into DataFrames, restoring the feature names and
# the row labels of the corresponding split.
feature_names = X.columns
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=feature_names)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=feature_names)

In [29]:
# First rows of the scaled training features.
X_train_scaled.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
137,0.583333,0.458333,0.758621,0.708333
47,0.083333,0.5,0.051724,0.041667
121,0.361111,0.333333,0.655172,0.791667
27,0.25,0.625,0.068966,0.041667
4,0.194444,0.666667,0.051724,0.041667


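## Linear Regression

Note: `species` is a categorical label encoded as 0/1/2. Fitting a linear regression treats those codes as a continuous quantity, so the R² and MSE values below are illustrative only; the classifiers in the following sections are the appropriate models for this target.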

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Least-squares linear model fitted on the scaled training features.
# NOTE(review): the target is the integer-encoded species, so the class codes
# are being treated as a continuous quantity here.
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# R^2 (coefficient of determination) on the training split.
model.score(X_train_scaled, y_train)

0.9316793967423741

In [None]:
# Continuous predictions for the held-out rows.
y_pred = model.predict(X_test_scaled)

In [None]:
# R^2 (coefficient of determination) on the held-out test split.
model.score(X_test_scaled, y_test)

0.9171490867007299

In [None]:
# Average squared gap between the predicted values and the encoded test labels.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.054865715918183296


In [None]:
# Show the fitted weights (one per scaled feature) followed by the intercept.
for label, value in (("Coefficients:", model.coef_), ("Intercept:", model.intercept_)):
    print(label, value)

Coefficients: [-0.10399321 -0.01105892  0.4579683   0.41616253]
Intercept: 1.0256410256410253


## Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [31]:
# Logistic regression classifier with an iteration cap of 200, fitted on the
# scaled training features. Rebinds `model` to this new estimator.
model = LogisticRegression(max_iter=200)
model.fit(X=X_train_scaled, y=y_train)

In [32]:
# Mean accuracy on the training split.
model.score(X_train_scaled, y_train)

0.9316239316239316

In [33]:
# Predicted class codes for the held-out rows.
y_pred = model.predict(X_test_scaled)

In [34]:
# Mean accuracy on the held-out test split.
model.score(X_test_scaled, y_test)

0.9666666666666667

In [35]:
# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n {conf_matrix}")

Confusion Matrix:
 [[11  0  0]
 [ 0 10  0]
 [ 0  1  8]]


In [36]:
# Per-class precision, recall, F1 and support for the test predictions.
class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(f"Classification Report:\n {class_rep}")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.91      1.00      0.95        10
           2       1.00      0.89      0.94         9

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.96        30
weighted avg       0.97      0.97      0.97        30



## Naive Bayes

In [37]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [38]:
# Gaussian Naive Bayes classifier fitted on the scaled training features.
# Rebinds `model` to this new estimator.
model = GaussianNB()
model.fit(X=X_train_scaled, y=y_train)

In [39]:
# Mean accuracy on the training split.
model.score(X_train_scaled, y_train)

0.9658119658119658

In [40]:
# Predicted class codes for the held-out rows.
y_pred = model.predict(X_test_scaled)

In [42]:
# Mean accuracy on the held-out test split.
model.score(X_test_scaled, y_test)

0.9666666666666667

In [43]:
# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n {conf_matrix}")

Confusion Matrix:
 [[11  0  0]
 [ 0 10  0]
 [ 0  1  8]]


In [44]:
# Per-class precision, recall, F1 and support for the test predictions.
class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(f"Classification Report:\n {class_rep}")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.91      1.00      0.95        10
           2       1.00      0.89      0.94         9

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.96        30
weighted avg       0.97      0.97      0.97        30



## Decision Tree

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [46]:
# Decision tree classifier; the fixed seed keeps the fitted tree repeatable between runs.
model = DecisionTreeClassifier(random_state=42)

In [47]:
# Fit the tree on the scaled training features.
model.fit(X_train_scaled, y_train)

In [48]:
# Predicted class codes for the held-out rows.
y_pred = model.predict(X_test_scaled)

In [49]:
# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n {conf_matrix}")

Confusion Matrix:
 [[11  0  0]
 [ 0  9  1]
 [ 0  1  8]]


In [50]:
# Per-class precision, recall, F1 and support for the test predictions.
class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(f"Classification Report:\n {class_rep}")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.90      0.90      0.90        10
           2       0.89      0.89      0.89         9

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



## K-Nearest Neighbors

In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [53]:
# k-nearest-neighbours classifier that uses the 5 closest training points per prediction.
model = KNeighborsClassifier(n_neighbors=5)

In [54]:
# Fit the classifier on the scaled training features.
model.fit(X_train_scaled, y_train)

In [55]:
# Predicted class codes for the held-out rows.
y_pred = model.predict(X_test_scaled)

In [56]:
# Rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n {conf_matrix}")

Confusion Matrix:
 [[11  0  0]
 [ 0  9  1]
 [ 0  1  8]]


In [57]:
# Per-class precision, recall, F1 and support for the test predictions.
class_rep = classification_report(y_true=y_test, y_pred=y_pred)
print(f"Classification Report:\n {class_rep}")

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.90      0.90      0.90        10
           2       0.89      0.89      0.89         9

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



## K-Means Clustering

In [58]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [80]:
# Number of clusters to look for.
n_clusters = 3
# Seeded so centroid initialisation is repeatable between runs.
# NOTE(review): n_init is left at the library default, which has changed between
# scikit-learn releases; consider pinning it if results must match across versions.
kmeans = KMeans(random_state=42, n_clusters=n_clusters)

In [81]:
# Learn the cluster centroids from the scaled training features (labels are not used).
kmeans.fit(X_train_scaled)

In [82]:
# Assign each held-out row to one of the fitted clusters.
cluster_labels = kmeans.predict(X_test_scaled)

In [83]:
# Inertia is an attribute of the fitted model, so it describes the data passed
# to fit() (the training split), not the test rows.
inertia = kmeans.inertia_
print('Inertia Score: {:.2f}'.format(inertia))

Inertia Score: 5.58


In [84]:
# Silhouette score of the cluster assignments on the held-out test rows.
silhouette_avg = silhouette_score(X_test_scaled, cluster_labels)
print('Silhouette Score: {:.2f}'.format(silhouette_avg))

Silhouette Score: 0.57
