<a href="https://colab.research.google.com/github/Mohammadakhavan75/Introductio-to-Machine-Learning/blob/master/Introduction_to_Machine_Learning_HandsOn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data preprocessing**

Standardization

In [0]:
from sklearn import preprocessing
import numpy as np

In [0]:
# Toy 3x3 float matrix used as training data by the scalers below.
x_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)

In [0]:
# Standardize x_train with sklearn's one-shot helper; the following cells
# inspect the per-column mean and standard deviation of the result.
x_scaled = preprocessing.scale(x_train)

In [0]:
x_scaled

In [0]:
x_scaled.mean(axis=0)

In [0]:
x_scaled.std(axis=0)

In [0]:
# Fit a reusable StandardScaler on x_train so the learned statistics
# (mean_, scale_) can be applied to other data later.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
scaler

In [0]:
print(scaler.mean_, scaler.scale_)

In [0]:
scaler.transform(x_train)

In [0]:
# Same toy 3x3 float matrix again, this time as input for min-max scaling.
x_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)

In [0]:
# Rescale every feature of x_train with MinMaxScaler's default settings.
# The array is named `x_train` (lowercase); `X_train` does not exist in this
# notebook and would raise NameError.
min_max_scaler = preprocessing.MinMaxScaler()
x_train_minmax = min_max_scaler.fit_transform(x_train)

In [0]:
x_train_minmax

Non-linear transformation and normalization

In [0]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
# return_X_y=True returns the (features, target) pair directly.
# Note: this rebinds the notebook-wide names `x` and `y`.
x, y = load_iris(return_X_y=True)

In [0]:
# Fit a quantile transformer (fixed random_state for reproducibility) on the
# iris features and transform them; the next cells compare percentiles of the
# first column before and after the transform.
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
x_trans = quantile_transformer.fit_transform(x)

In [0]:
np.percentile(x[:, 0], [0, 25, 50, 75, 100])

In [0]:
np.percentile(x_trans[:, 0], [0, 25, 50, 75, 100])

In [0]:
 # NOTE: Box-Cox can only be applied to strictly positive data.
 # standardize=False is passed so the transformer is asked not to standardize
 # its output after the power transform.
 pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)

In [0]:
# Reproducible 3x3 log-normal sample (seed 616) used as strictly positive
# input for the Box-Cox transformer.
_lognormal_rng = np.random.RandomState(616)
x_lognormal = _lognormal_rng.lognormal(size=(3, 3))

In [0]:
x_lognormal

In [0]:
pt.fit_transform(x_lognormal)

In [0]:
# Same quantile transform as before, but requesting a normal output
# distribution. Overwrites the earlier quantile_transformer / x_trans.
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)
x_trans = quantile_transformer.fit_transform(x)

In [0]:
# Small 3x3 example (plain nested list); rebinds the notebook-wide name `x`.
x = [
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
]
# Normalize with the L2 norm and display the result.
x_normalized = preprocessing.normalize(x, norm='l2')
x_normalized

In [0]:
# Estimator-style equivalent of preprocessing.normalize; fit on x, then echo
# the fitted object.
normalizer = preprocessing.Normalizer().fit(x)
normalizer

In [0]:
normalizer.transform(x)

Encoding categorical features

In [0]:
# Encode string categories as ordinal codes, one code per category per column.
enc = preprocessing.OrdinalEncoder()
# Two samples with three categorical features each (rebinds `x`).
x = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(x)
# Transform a new sample built only from categories seen during fit.
enc.transform([['female', 'from US', 'uses Safari']])

In [0]:
x = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox'], ['male', 'from Asia', 'uses Edge']]

In [0]:
# Refit the encoder on the extended three-sample list and encode that same list.
enc.fit(x)
enc.transform(x)

Missing values

In [0]:
!wget https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv

In [0]:
from pandas import read_csv
# load the dataset (header=None: the first CSV line is data, not column names,
# so the columns get integer labels)
dataset = read_csv('pima-indians-diabetes.csv', header=None)
# summarize the dataset
print(dataset.describe())

In [0]:
print(dataset.head(20))

In [0]:
import pandas as pd

# Remove pandas' row and column display limits so frames are shown in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [0]:
print(dataset.describe())

In [0]:
dataset[[1]]

In [0]:
# count the zero entries (used here as stand-ins for missing values) in
# columns 1-5, one total per column
num_missing = dataset[[1, 2, 3, 4, 5]].eq(0).sum()
# report the results
print(num_missing)

In [0]:
from numpy import nan

In [0]:
dataset.head(20)

In [0]:
dataset[[1,2,3,4,5]].replace(0, nan)

In [0]:
# Treat zeros in columns 1-5 as missing by converting them to NaN.
# NOTE: this mutates `dataset` in place; reload the CSV to get the raw data back.
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)
# drop rows with missing values
dataset.dropna(inplace=True)

In [0]:
dataset.head(20)

In [0]:
dataset = read_csv('pima-indians-diabetes.csv', header=None)

In [0]:
dataset[[1,2,3,4,5]] = dataset[[1,2,3,4,5]].replace(0, nan)

In [0]:
# Mean imputation: fill each NaN with the mean of its own column (in place).
dataset.fillna(dataset.mean(), inplace=True)

In [0]:
dataset.head(20)

In [0]:
print(dataset.isnull().sum())

In [0]:
dataset = read_csv('pima-indians-diabetes.csv', header=None)

In [0]:
from sklearn.impute import SimpleImputer

In [0]:
# Imputer configured to replace NaN entries using the 'mean' strategy.
imputer = SimpleImputer(missing_values=nan, strategy='mean')

In [0]:
from numpy import isnan

# The freshly reloaded `dataset` still encodes missing measurements as 0, so
# mark them as NaN first. Work on a copy so re-running this cell never depends
# on an already-modified `dataset`.
dataset_nan = dataset.copy()
dataset_nan[[1,2,3,4,5]] = dataset_nan[[1,2,3,4,5]].replace(0, nan)
values = dataset_nan.values
# Replace every NaN using the imputer's configured strategy.
transformed_values = imputer.fit_transform(values)
# count the total number of NaN values left after imputation
print('Missing: %d' % isnan(transformed_values).sum())

## **Seaborn**

Scatter plot

In [0]:
import seaborn as sns

In [0]:
sns.set()

In [0]:
# Load seaborn's bundled "tips" example dataset; it is used by the scatter,
# bar and box plots below.
tips = sns.load_dataset("tips")

In [0]:
tips

In [0]:
# Basic scatter plot: tip versus total bill.
ax = sns.scatterplot(x="total_bill", y="tip", data=tips)

In [0]:
# Same plot, with both colour and marker size driven by the "time" column.
ax = sns.scatterplot(x="total_bill", y="tip", data=tips, hue="time", size="time")

In [0]:
# Colour and marker size driven by the "size" column; sizes=(20, 200) sets the
# marker-size range.
ax = sns.scatterplot(x="total_bill", y="tip", data=tips, hue="size", size="size", sizes=(20, 200))

In [0]:
# Colour and marker style both driven by the "time" column.
ax = sns.scatterplot(x="total_bill", y="tip", data=tips, hue="time", style="time")

In [0]:
# seaborn's iris DataFrame. Note that `iris` is this DataFrame, while the
# scikit-learn Bunch loaded later in the notebook is called `iris_data`.
iris = sns.load_dataset("iris")
# Columns are passed as vectors here rather than as column names with data=.
ax = sns.scatterplot(x=iris.sepal_length, y=iris.sepal_width, hue=iris.species, style=iris.species)

Line plot

In [0]:
fmri = sns.load_dataset("fmri")

In [0]:
fmri

In [0]:
# Line plot of signal over timepoint for the whole fmri dataset.
ax = sns.lineplot(x="timepoint", y="signal", data=fmri)

In [0]:
# One coloured line per value of "event".
ax = sns.lineplot(x="timepoint", y="signal", hue="event", data=fmri)

In [0]:
# "event" mapped to both colour and line style.
ax = sns.lineplot(x="timepoint", y="signal", hue="event", style="event", data=fmri)

In [0]:
# Two grouping variables: colour by "region", line style by "event".
ax = sns.lineplot(x="timepoint", y="signal", hue="region", style="event", data=fmri)

In [0]:
# Distinguish events with markers (markers=True) and turn dashes off
# (dashes=False).
ax = sns.lineplot(x="timepoint", y="signal", hue="event", style="event", markers=True, dashes=False, data=fmri)

In [0]:
# Load the "dots" example dataset and keep only rows whose align column
# equals 'dots'.
dots = sns.load_dataset("dots").query("align == 'dots'")

In [0]:
dots

In [0]:
# Firing rate over time: colour by "coherence", line style by "choice".
ax = sns.lineplot(x="time", y="firing_rate", hue="coherence", style="choice", data=dots)

In [0]:
# Explicit 6-colour "mako_r" palette for the hue variable.
# presumably 6 matches the number of distinct coherence levels — verify against the data.
palette = sns.color_palette("mako_r", 6)
ax = sns.lineplot(x="time", y="firing_rate", hue="coherence", style="choice", palette=palette, data=dots)

In [0]:
import numpy as np, pandas as pd
# 100 month-end timestamps. An explicit MonthEnd offset object is used instead
# of the lowercase "m" alias, so the cell does not depend on how a given pandas
# release spells its frequency aliases.
index = pd.date_range("1 1 2000", periods=100, freq=pd.offsets.MonthEnd(), name="date")
# Four random walks (cumulative sums of standard-normal steps), one per column.
data = np.random.randn(100, 4).cumsum(axis=0)
wide_df = pd.DataFrame(data, index, ["a", "b", "c", "d"])
# Wide-form input: seaborn draws one line per column.
ax = sns.lineplot(data=wide_df)

Histogram

In [0]:
# distribution plot
x = np.random.normal(size=100)
sns.distplot(x);

In [0]:
sns.distplot(x, kde=False)

In [0]:
sns.distplot(x, hist=False)

In [0]:
sns.distplot(x, bins=20, kde=False)

In [0]:
# Parameters of a 2-D normal distribution: mean vector and covariance matrix.
mean = [0, 1]
cov = [(1, .5), (.5, 1)]
# Draw 200 samples and wrap them in a two-column frame for jointplot.
data = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data, columns=["x", "y"])

In [0]:
sns.jointplot(x="x", y="y", data=df)

In [0]:
# Draw 1000 samples and transpose so x and y are the two coordinate vectors.
# This rebinds the notebook-wide names `x` and `y`.
x, y = np.random.multivariate_normal(mean, cov, 1000).T
# The "white" axes style applies only inside the with-block.
with sns.axes_style("white"):
    sns.jointplot(x=x, y=y, kind="hex", color="k")

In [0]:
sns.jointplot(x=x, y=y, kind="kde", color="k");

In [0]:
# Reload seaborn's iris DataFrame and plot pairwise relationships between its
# columns. The trailing ';' suppresses the returned object's repr.
iris = sns.load_dataset("iris")
sns.pairplot(iris);

In [0]:
sns.pairplot(iris, hue="species")

In [0]:
ax = sns.barplot(x="day", y="total_bill", data=tips)

In [0]:
ax = sns.barplot(x="day", y="total_bill", hue="sex", data=tips)

In [0]:
# 10x12 matrix of uniform random values in [0, 1) (unseeded), shown as a heatmap.
uniform_data = np.random.rand(10, 12)
ax = sns.heatmap(uniform_data)

In [0]:
ax = sns.heatmap(uniform_data, vmin=0, vmax=1)

In [0]:
flights = sns.load_dataset("flights")

In [0]:
flights

In [0]:
# Reshape long -> wide: one row per month, one column per year, passenger
# counts as values. Keyword arguments are used because newer pandas releases
# no longer accept index/columns/values positionally.
# NOTE: this rebinds `flights`; reload the dataset before re-running this cell.
flights = flights.pivot(index="month", columns="year", values="passengers")

In [0]:
flights

In [0]:
ax = sns.heatmap(flights)

In [0]:
ax = sns.heatmap(flights, annot=True, fmt="d")

In [0]:
ax = sns.boxplot(x=tips["total_bill"])

In [0]:
ax = sns.boxplot(x="day", y="total_bill", data=tips)

# **Loading data**

In [0]:
from sklearn.datasets import load_iris
# scikit-learn's iris dataset object (features in .data, labels in .target).
# Kept under the name `iris_data`, distinct from seaborn's `iris` DataFrame.
iris_data = load_iris()

In [0]:
iris_data

In [0]:
# Feature matrix and label vector for the classifiers below
# (rebinds the notebook-wide names `x` and `y`).
x = iris_data.data
y = iris_data.target

In [0]:
x, y

In [0]:
x.shape, y.shape

In [0]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples for testing; shuffling with a fixed random_state
# keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, shuffle=True)

# **K-nearest neighbor**

In [0]:
from sklearn.neighbors import KNeighborsClassifier

In [0]:
# k-NN classifier that votes among the 3 nearest training samples.
neigh = KNeighborsClassifier(n_neighbors=3)

In [0]:
neigh.fit(x_train, y_train)

In [0]:
y_pred = neigh.predict(x_test)

In [0]:
y_pred

In [0]:
# Manual accuracy: count the positions where the prediction equals the true
# label, then divide by the number of test samples.
counter = sum(1 for truth, guess in zip(y_test, y_pred) if truth == guess)
print(counter/len(y_test))

In [0]:
from sklearn.metrics import accuracy_score
# Library equivalent of the manual loop above; arguments are (true, predicted).
accuracy_score(y_test, y_pred)

In [0]:
neigh.predict_proba(x_test)

# **Support-vector machine**

In [0]:
from sklearn.svm import SVC
# Support-vector classifier with scikit-learn's default hyperparameters.
clf = SVC()

In [0]:
clf.fit(x_train, y_train)

In [0]:
y_pred = clf.predict(x_test)

In [0]:
y_pred

In [0]:
accuracy_score(y_test, y_pred)

# **Linear regression**

In [0]:
from sklearn import datasets, linear_model
import numpy as np

In [0]:
# Diabetes regression dataset as a (features, target) pair; rebinds `x` and `y`.
x, y = datasets.load_diabetes(return_X_y=True)

In [0]:
x, y

In [0]:
x.shape, y.shape

In [0]:
x[:5]

In [0]:
x[:5, 2]

In [0]:
x[:5, np.newaxis, 2]

In [0]:
# Keep only feature column 2; np.newaxis keeps the result 2-D with a single
# column, as shown in the two preview cells above.
# NOTE(review): this overwrites `x`, so running the cell a second time fails on
# the already-reduced array — re-run the load cell first.
x = x[:, np.newaxis, 2]

In [0]:
# Same 67/33 split settings as for iris, now on the single-feature diabetes
# data. Relies on train_test_split having been imported in an earlier cell.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, shuffle=True)

In [0]:
regr = linear_model.LinearRegression()

In [0]:
regr.fit(x_train, y_train)

In [0]:
y_pred = regr.predict(x_test)

In [0]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [0]:
# The mean squared error
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))

In [0]:
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

In [0]:
import matplotlib.pyplot as plt

# Held-out points in black with the fitted regression line in red.
fig, ax = plt.subplots()
ax.scatter(x_test, y_test, color='black')
ax.plot(x_test, y_pred, color='red', linewidth=3)

# Hide the tick marks on both axes.
ax.set_xticks(())
ax.set_yticks(())

plt.show()

# **Decision tree**

In [0]:
# Switch back from the diabetes data to the iris features and labels
# (`iris_data` comes from the "Loading data" section).
x = iris_data.data
y = iris_data.target

In [0]:
from sklearn.model_selection import train_test_split
# Recreate the reproducible 67/33 iris split used earlier in the notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, shuffle=True)

In [0]:
from sklearn.tree import DecisionTreeClassifier
# Fix random_state so the fitted tree, and the accuracy and plots derived from
# it, are the same on every run. fit() returns the estimator itself.
clf = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)

In [0]:
y_pred = clf.predict(x_test)

In [0]:
accuracy_score(y_test, y_pred)

In [0]:
from sklearn import tree
# Plot the classifier that was already fitted and evaluated above. Refitting
# inside the plot call could produce a tree that differs from the one scored.
# The trailing ';' hides the returned list's repr.
tree.plot_tree(clf);

In [0]:
import graphviz 
# Export the fitted tree as DOT source; out_file=None returns it as a string.
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
# Render the graph to disk using "iris" as the output file name.
graph.render("iris") 

In [0]:
# Labelled, colour-filled export of the fitted tree. Feature and class names
# come from the scikit-learn Bunch `iris_data`; `iris` is seaborn's DataFrame
# and has no feature_names / target_names attributes.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=iris_data.feature_names,
    class_names=iris_data.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
# Last expression: the notebook renders the graph inline.
graph

# **Random forest**

In [0]:
from sklearn.ensemble import RandomForestClassifier
# A three-tree forest varies noticeably between runs; fix random_state (as the
# AdaBoost cell does) so the reported accuracy is reproducible.
clf = RandomForestClassifier(n_estimators=3, random_state=0)

In [0]:
clf.fit(x_train, y_train)

In [0]:
y_pred = clf.predict(x_test)

In [0]:
# Arguments in (true, predicted) order, matching the other accuracy_score
# calls in this notebook.
accuracy_score(y_test, y_pred)

# **Boosting**

In [0]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble of 10 estimators, seeded for reproducibility.
clf = AdaBoostClassifier(n_estimators=10, random_state=0)

In [0]:
clf.fit(x_train, y_train)

In [0]:
y_pred = clf.predict(x_test)

In [0]:
# Arguments in (true, predicted) order, matching the other accuracy_score
# calls in this notebook.
accuracy_score(y_test, y_pred)