In [95]:
import sklearn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [96]:
# Load the Iris dataset that ships with scikit-learn.
dataset = load_iris()

In [97]:
# Feature matrix (independent variables) and class labels (target variable).
X, y = dataset.data, dataset.target

In [98]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12345,
)

In [99]:
# Random forest made of 100 decision trees; seeded so results are reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=12345,
)

In [100]:
# Fit the model on the training split.
model.fit(X=X_train, y=y_train)

In [101]:
# Predict class labels for the held-out test samples.
predictions = model.predict(X=X_test)

In [102]:
# Compare the predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [103]:
# Show the accuracy as a percentage with two decimal places.
accuracy_report = f'Accuracy: {100*accuracy:.2f}%'
print(accuracy_report)

Accuracy: 93.33%


In [104]:
# Example: k-nearest neighbours (KNN) on the Iris dataset

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Features and labels.
iris = load_iris()
X, y = iris.data, iris.target

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12345
)

# KNN classifier configured with 3 neighbours, fitted on the training split.
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

# Score the predictions made for the held-out samples.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {100*accuracy:.2f}%")

Accuracy: 96.67%


In [116]:
from sklearn.datasets import load_iris

# Reload Iris and pull out the feature matrix and the label vector.
data = load_iris()
X, y = data.data, data.target

In [122]:
# Inspect the container types first, then the dimensions, of features and labels.
for array in (X, y):
    print(type(array))
for array in (X, y):
    print(array.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(150, 4)
(150,)


In [127]:
# Preview the first ten samples: features, a separating blank line, then labels.
first_features, first_labels = X[:10], y[:10]
print(first_features, '\n')
print(first_labels)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]] 

[0 0 0 0 0 0 0 0 0 0]


In [119]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: learn the scaling from X, then apply it to X.
# NOTE(review): the scaler is fitted on the full dataset, before any train/test
# split; if X_scaled is later used for evaluation, fit on the training split only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [128]:
# Preview the first ten standardized samples.
scaled_preview = X_scaled[:10]
print(scaled_preview)

[[-0.90068117  1.01900435 -1.34022653 -1.3154443 ]
 [-1.14301691 -0.13197948 -1.34022653 -1.3154443 ]
 [-1.38535265  0.32841405 -1.39706395 -1.3154443 ]
 [-1.50652052  0.09821729 -1.2833891  -1.3154443 ]
 [-1.02184904  1.24920112 -1.34022653 -1.3154443 ]
 [-0.53717756  1.93979142 -1.16971425 -1.05217993]
 [-1.50652052  0.78880759 -1.34022653 -1.18381211]
 [-1.02184904  0.78880759 -1.2833891  -1.3154443 ]
 [-1.74885626 -0.36217625 -1.34022653 -1.3154443 ]
 [-1.14301691  0.09821729 -1.2833891  -1.44707648]]


In [129]:
# Linear regression

from sklearn.linear_model import LinearRegression

# Build the estimator with default settings (hyperparameters may be passed to the constructor).
model = LinearRegression()
# Fit on the full dataset; no train/test split is applied in this cell.
model.fit(X=X, y=y)
# Predict for the same samples the model was fitted on.
predictions = model.predict(X=X)

In [130]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

# 3-neighbour classifier, fitted on and then queried with the full dataset.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X, y=y)
predictions = knn.predict(X=X)

In [131]:
# Split the dataset into training and testing sets

from sklearn.model_selection import train_test_split

# test_size is the fraction of the whole dataset held out for testing (30% here);
# random_state seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12345,
)

In [None]:
# performance metrics
#
# A metric compares the true test labels with predictions made for those same
# test samples, so each model below is fitted on the training split and asked
# to predict X_test. Reusing predictions made for the full dataset would fail:
# y_test and the predictions would differ in length and scikit-learn raises
# ValueError for inconsistent numbers of samples.

# accuracy for classification
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn_eval = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
class_predictions = knn_eval.predict(X_test)
accuracy = accuracy_score(y_test, class_predictions)  # first argument is the real labels, second is the predictions

# Mean Squared Error (MSE) for regression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

reg_eval = LinearRegression().fit(X_train, y_train)
reg_predictions = reg_eval.predict(X_test)
mse = mean_squared_error(y_test, reg_predictions)  # same argument order: real values first, predictions second