# Data Preprocessing

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [None]:
df = pd.read_csv('Data.csv')
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

## Missing data

In [None]:
from sklearn.impute import SimpleImputer
# Replace every NaN with the mean of its column, learning and applying the means in one call
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:,:] = imputer.fit_transform(X[:,:])

## Encoding categorical data

### Encoding the independent variable (considering multicollinearity)

In [None]:
def sum_prev(l_in):
    """Return the running totals of ``l_in``, each reduced by one.

    When ``l_in`` holds the sizes of consecutive groups of columns, the
    result is the zero-based index of the last column of every group.

    An empty input yields an empty list (the previous version raised
    IndexError on ``l_in[0]``).
    """
    from itertools import accumulate
    return [total - 1 for total in accumulate(l_in)]

# One-hot encode the chosen columns, then drop the last dummy column of each
# encoded feature so the remaining dummies are not perfectly collinear
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
columns_to_encode = [0, 2, 3] # Change here
# sparse_threshold=0 forces a dense result; np.array() on a sparse matrix yields a 0-d object array
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), columns_to_encode)], remainder='passthrough', sparse_threshold=0)
X = np.array(ct.fit_transform(X))
# Read the category counts from the fitted encoder itself, so they always match the columns it produced
category_counts = [len(categories) for categories in ct.named_transformers_['encoder'].categories_]
# The encoded columns come first in the output, so running totals - 1 are the last dummy of each feature
last_dummy_columns = sum_prev(category_counts)
X = np.delete(X, last_dummy_columns, 1)

### Encoding the independent variable (not considering multicollinearity)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# sparse_threshold=0 forces a dense result; np.array() on a sparse matrix yields a 0-d object array
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [column_index_to_encode])], remainder='passthrough', sparse_threshold=0)
X = np.array(ct.fit_transform(X))

### Encoding the dependent variable

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

## Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Oversampling and undersampling

In [None]:
from imblearn.over_sampling import SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
sm = SMOTE()
# Oversample the training split only, so the test split keeps the real class balance.
# fit_resample is the current name of the method formerly called fit_sample.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())

## Feature Scaling

### Scaling features

In [None]:
from sklearn.preprocessing import StandardScaler
ss_X = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)

### Scaling labels

In [None]:
from sklearn.preprocessing import StandardScaler
# StandardScaler expects 2-D input, so turn the label vectors into single-column matrices
y_train = y_train.reshape(len(y_train), 1)
y_test = y_test.reshape(len(y_test), 1)
ss_y = StandardScaler()
y_train = ss_y.fit_transform(y_train)
# Reuse the training mean/std: re-fitting on the test labels would put them on a different scale
y_test = ss_y.transform(y_test)

### Returning to original value

In [None]:
ss_X.inverse_transform(X_train)
ss_X.inverse_transform(X_test)
ss_y.inverse_transform(y_train)
ss_y.inverse_transform(y_test)

# Hyperparameter tuning using grid search

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'param_1': [arg_1, arg_2], 'param_2': [arg_1, arg_2]}
grid = GridSearchCV(ml_model(), param_grid, cv=num_for_k_folds, verbose=2)
grid.fit(X_train, y_train)
grid.best_params_
grid.best_estimator_
grid_predictions = grid.predict(X_test)

# Predictions

## Predicting using test set

y_pred = model.predict(X_test)

## Predicting a value

In [None]:
print(model.predict([[x1_value, x2_value]]))

## Printing predicted values and actual values side-by-side

In [None]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), axis=1))

## Predicting using test set (with feature scaling)

In [None]:
# Predict on the test features (not the full X) and map the scaled predictions back to original units.
# inverse_transform needs a 2-D array, hence the reshape to a single column.
y_pred = ss_y.inverse_transform(model.predict(X_test).reshape(-1, 1))

## Predicting a value (with feature scaling)

In [None]:
print(model.predict(ss_X.transform([[x1_value, x2_value]])))

## Printing predicted values and actual values side-by-side (with feature scaling)

In [None]:
y_test = ss_y.inverse_transform(y_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), axis=1))

# Evaluating Model Performance

Metrics: https://scikit-learn.org/stable/modules/classes.html

## Regression

### Regression Error Metrics

In [None]:
from sklearn import metrics
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_absolute_error(y_test, y_pred))
print(metrics.mean_squared_error(y_test, y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

### Plotting the residuals

In [None]:
import seaborn as sns # used by this cell but never imported elsewhere in the notebook
# ravel both sides so an (n, 1) minus (n,) subtraction cannot broadcast into an (n, n) matrix;
# histplot(kde=True) replaces the deprecated distplot
sns.histplot(np.ravel(y_test) - np.ravel(y_pred), kde=True) # If it's a correct model choice, it should be normally distributed
plt.xlabel('Residuals')

### Plotting the actual and predicted values

In [None]:
import seaborn as sns # used by this cell but never imported elsewhere in the notebook
# ravel so column-vector labels (n, 1) are accepted as plain 1-D series
sns.scatterplot(x=np.ravel(y_test), y=np.ravel(y_pred)) # If the model fitted well, it should be a straight line
plt.xlabel('y_test')
plt.ylabel('y_pred')

## Classification

In [None]:
from sklearn import metrics
# Build the confusion matrix once and reuse it for both the unpacking and the printout
cm = metrics.confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel() # four-way unpacking only works for a 2x2 (binary) matrix
print('Confusion matrix:\n', cm)
for label, score in (('Accuracy:', metrics.accuracy_score),
                     ('Precision:', metrics.precision_score),
                     ('Recall:', metrics.recall_score),
                     ('F1-Score:', metrics.f1_score)):
    print(label, score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred)) # Better for multiclass problem

# Visualisation

## Regression

### Visualising the testing set results

In [None]:
plt.scatter(X_test, y_test, color='red') # Use train for training set
plt.plot(X_train, regression.predict(X_train), color='blue')
plt.title('y vs x (Test set)')
plt.xlabel('x')
plt.ylabel('y')

### Visualising the test set results (higher resolution)

In [None]:
# Dense, evenly spaced x values so the fitted curve is drawn smoothly (assumes a single feature)
X_grid_train = np.arange(X.min(), X.max(), 0.1)
# Column vector, as predict() expects; the original used the undefined name X_grid here
X_grid_train = X_grid_train.reshape(len(X_grid_train), 1)
plt.scatter(X_test, y_test, color='red') # Use train for training set
plt.plot(X_grid_train, regression.predict(X_grid_train), color='blue')
plt.title('y vs x (Test set)')
plt.xlabel('x')
plt.ylabel('y')

### Visualising the test set results (higher resolution, with feature scaling)

In [None]:
# Grid in original (unscaled) units -- assumes X still holds the unscaled single feature; TODO confirm
X_grid_train = np.arange(X.min(), X.max(), 0.1)
X_grid_train = X_grid_train.reshape(len(X_grid_train), 1)
plt.scatter(ss_X.inverse_transform(X_test), ss_y.inverse_transform(y_test), color='red') # Use train for training set
# Scale the grid before predicting, then bring the predictions back to original units
y_grid_scaled = regression.predict(ss_X.transform(X_grid_train))
plt.plot(X_grid_train, ss_y.inverse_transform(y_grid_scaled.reshape(-1, 1)), color='blue')
plt.title('y vs x (Test set)')
plt.xlabel('x')
plt.ylabel('y')

## Classification

### Visualising the testing set results (higher resolution, slower)

In [None]:
# Draw the classifier's decision regions over the first two features, with the data points on top
from matplotlib.colors import ListedColormap
# Points are plotted in original units, so undo the feature scaling first
X_set, y_set = ss_X.inverse_transform(X_test), y_test # X_train, y_train for training set
# Grid covering the data range plus a fixed margin on each axis (10 and 1000 are hard-coded here)
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 10, stop = X_set[:, 0].max() + 10, step = 0.25),
                     np.arange(start = X_set[:, 1].min() - 1000, stop = X_set[:, 1].max() + 1000, step = 0.25))
# Classify every grid point (re-scaled for the model) and colour the plane by predicted class
plt.contourf(X1, X2, classifier.predict(ss_X.transform(np.array([X1.ravel(), X2.ravel()]).T)).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# One scatter call per class so each class gets its own colour and legend entry
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('y vs x (Test set)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

### Visualising the training set results (lower resolution, faster)

In [None]:
# Draw the classifier's decision regions over the first two (scaled) features of the training set
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train # X_test, y_test for test set
# Grid covering the data range plus a margin of 1 on each axis
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Classify every grid point and colour the plane by predicted class
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# One scatter call per class so each class gets its own colour and legend entry
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
# This cell plots the training split, so the title must say so
plt.title('y vs x (Training set)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

# Regression models

## Simple Linear Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

No need for feature scaling

Linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(X_train, y_train)

### Getting attributes

In [None]:
print(regression.coef_)
print(regression.intercept_)
# The model in this section is named `regression`, and X_train is a NumPy array without .columns,
# so the feature names come from df (every column except the target)
pd.DataFrame(np.ravel(regression.coef_), df.columns[:-1], columns=['Coefficient'])

## Multiple Linear Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

No need for feature scaling

Linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(X_train, y_train)

### Getting attributes

In [None]:
print(regression.coef_)
print(regression.intercept_)
pd.DataFrame(regression.coef_, df.columns[:-1], columns=['Coefficient'])

## Polynomial Linear Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

No need for feature scaling

Non-linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree = number_of_polynomials)
X_train_poly = poly_features.fit_transform(X_train)
regression = LinearRegression()
regression.fit(X_train_poly, y_train)

### Predicting a value

In [None]:
# Reuse the already-fitted feature expansion; a transformer should not be re-fitted at prediction time
print(regression.predict(poly_features.transform([[x1_value, x2_value]])))

### Getting linear equation

In [None]:
print(regression.coef_)
print(regression.intercept_)
# There is one coefficient per generated polynomial term (not per original column),
# so label the rows with the term names produced by the fitted expansion
pd.DataFrame(np.ravel(regression.coef_), poly_features.get_feature_names_out(), columns=['Coefficient'])

### Visualising the test set results (higher resolution)

In [None]:
# Dense, evenly spaced x values so the fitted curve is drawn smoothly (assumes a single feature)
X_grid_train = np.arange(X.min(), X.max(), 0.1)
X_grid_train = X_grid_train.reshape(len(X_grid_train), 1)
plt.scatter(X_test, y_test, color='red')
# poly_features is the expansion fitted in the Fitting cell (poly_reg was never defined)
plt.plot(X_grid_train, regression.predict(poly_features.transform(X_grid_train)), color='blue')
plt.title('y vs x (Test set)')
plt.ylabel('y')
plt.xlabel('x')

### Hyperparameter tuning using grid search

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import cross_val_score 
degrees = [2, 3, 4, 5, 6] # Change degree "hyperparameter" here
normalizes = [True, False] # Change normalize hyperparameter here
best_score = float('-inf') # R^2 can be negative, so 0 is not a safe starting floor
best_degree = 0
best_normalize = None
for degree in degrees:
    for normalize in normalizes:
        poly_features = PolynomialFeatures(degree = degree)
        X_train_poly = poly_features.fit_transform(X_train)
        # LinearRegression no longer accepts normalize=...; scaling in a pipeline is the replacement
        if normalize:
            polynomial_regression = make_pipeline(StandardScaler(), LinearRegression())
        else:
            polynomial_regression = LinearRegression()
        polynomial_regression.fit(X_train_poly, y_train)
        scores = cross_val_score(polynomial_regression, X_train_poly, y_train, cv=5) # Change k-fold cv value here
        # Compare the mean over folds: the single best fold rewards lucky splits
        mean_score = scores.mean()
        if mean_score > best_score:
            best_score = mean_score
            best_degree = degree
            best_normalize = normalize

## Support Vector Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

Needs feature scaling

Non-linear

Continuous

### Fitting

In [None]:
from sklearn.svm import SVR
# 'kernal_name' was not a valid kernel; 'rbf' matches the SVM classifier section of this notebook
regressor = SVR(kernel='rbf') # Change kernel here: 'linear', 'poly', 'rbf' or 'sigmoid'
regressor.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
# One dict per kernel family so only the parameters that kernel actually uses are searched
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
     'kernel': ['rbf', 'sigmoid']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
     'kernel': ['linear']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
     'degree': [2, 3, 4, 5, 6],
     'kernel': ['poly']}
]
# This is the regression section: search the regressor (SVR), not the classifier (SVC)
grid = GridSearchCV(SVR(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in depth)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
# One dict per kernel family so only the parameters that kernel actually uses are searched
param_grid = [
    {'C': [0.1, 1, 10, 100, 1000], 
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1],
     'kernel': ['rbf']},
    {'C': [0.1, 1, 10, 100, 1000],
     'kernel': ['linear']},
    {'C': [0.1, 1, 10, 100, 1000],
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1],
     'degree': [2, 3],
     'kernel': ['poly']}
]
# This is the regression section: search the regressor (SVR), not the classifier (SVC)
grid = GridSearchCV(SVR(), param_grid, verbose=2)

## Decision Tree Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

No need for feature scaling

Non-linear

Non-continuous

### Fitting

In [None]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
# Fit on the training split only, like the other models here, so the test set stays unseen
regressor.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
param_grid = {
    # 'mse'/'mae' were renamed to 'squared_error'/'absolute_error' in scikit-learn
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'max_depth': [None, 5, 10, 20, 30, 40, 50, 80, 90, 100, 110],
    # None means "use all features", which is what 'auto' used to mean for tree regressors
    'max_features': [2, 3, 5, 10, None, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 15, 100],
    'min_samples_split': [2, 5, 8, 10, 12, 15, 20]
}
# A single tree has no n_estimators, and the estimator searched must be the tree itself
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [10, 40, 70, 100],
    'max_features': [2, 3, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 6, 10, 14],
    'min_samples_split': [2, 6, 10, 14],
    'n_estimators': [100, 300]
}
grid = GridSearchCV(RandomForestRegressor(), param_grid, verbose=2)

## Random Forest Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

No need for feature scaling

Non-linear

Non-continuous

### Fitting

In [None]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=200)
regressor.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    # 'gini'/'entropy' are classification criteria; a regressor needs regression criteria
    'criterion': ['squared_error', 'absolute_error'],
    'bootstrap': [True, False],
    'max_depth': [None, 5, 10, 20, 30, 40, 50, 80, 90, 100, 110],
    # None means "use all features", which is what 'auto' used to mean for forest regressors
    'max_features': [2, 3, 5, 10, None, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 15, 100],
    'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
    'n_estimators': [100, 200, 300, 500, 800, 1000, 1500, 2500]
}
grid = GridSearchCV(RandomForestRegressor(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [10, 40, 70, 100],
    'max_features': [2, 3, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 6, 10, 14],
    'min_samples_split': [2, 6, 10, 14],
    'n_estimators': [100, 300]
}
grid = GridSearchCV(RandomForestRegressor(), param_grid, verbose=2)

# Supervised learning

## Logistic Regression Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

Needs feature scaling

Linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
# Regularisation strengths shared by every penalised sub-grid
C_values = list(np.logspace(0, 4, 10)) + [0.0001, 0.01, 0.05, 0.2, 10, 1000]
# One dict per penalty so each solver is only paired with penalties it supports
# (the penalties are the letter-L names 'l1'/'l2', not the digits '11'/'12')
param_grid = [
    {'penalty': ['l2'], 'C': C_values, 'solver': ["newton-cg", "lbfgs", "liblinear", 'sag', 'saga']},
    {'penalty': ['l1'], 'C': C_values, 'solver': ["liblinear", 'saga']},
    # The dual formulation exists only for l2 with liblinear
    {'penalty': ['l2'], 'C': C_values, 'solver': ["liblinear"], 'dual': [True]},
    # elasticnet is saga-only and needs an l1/l2 mixing ratio
    {'penalty': ['elasticnet'], 'C': C_values, 'solver': ['saga'], 'l1_ratio': [0.25, 0.5, 0.75]},
    # No penalty: C is irrelevant. Older scikit-learn releases spell this as the string 'none'.
    {'penalty': [None], 'solver': ["newton-cg", "lbfgs", 'sag', 'saga']}
]
grid = GridSearchCV(LogisticRegression(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    # Letter-L names; '11'/'12' (digits) are not valid penalties
    'penalty': ['l1', 'l2'], 
    'C': [0.001, 0.01, 1, 10, 100],
    # liblinear and saga support both l1 and l2 (newton-cg and lbfgs do not support l1)
    'solver': ["liblinear", "saga"]
}
grid = GridSearchCV(LogisticRegression(), param_grid, verbose=2)

## KNN Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    
Needs feature scaling

Non-linear

Continuous

### Elbow method to find optimal number of neighbors (K)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Test-set error rate for every odd K below 40; the "elbow" of the curve suggests a good K
k_values = range(1, 40, 2)
error_rate = []

for i in k_values:
    knn_elbow = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    y_pred = knn_elbow.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    error_rate.append(1 - accuracy)

plt.figure(figsize=(10,6))
plt.plot(k_values, error_rate, color='blue', ls='--', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

### Fitting

In [None]:
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
# Two sub-grids: named metrics on their own, and Minkowski with an explicit power p
param_grid = [
    {'n_neighbors': list(range(1, 31, 2)),
     'weights': ['uniform', 'distance'],
     'metric': ['euclidean', 'manhattan', 'minkowski']},
    {'n_neighbors': list(range(1, 31, 2)),
     'weights': ['uniform', 'distance'],
     'p': [3, 4, 5], # the comma was missing here, which made the whole cell a SyntaxError
     'metric': ['minkowski']}
]
grid = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_neighbors': list(range(1, 25, 2)),
    'p': [1, 2],
    'weights': ['uniform', 'distance']
}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=2)

## SVM Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    
Needs feature scaling

Non-linear (unless using the linear kernel)

Continuous

### Fitting

In [None]:
from sklearn.svm import SVC
# The larger the C value the more the model will overfit (misclassified training points are penalised harder)
# The larger the gamma value the more the model will overfit (each training point's influence becomes more local)
classifier = SVC(kernel='rbf')
classifier.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
     'kernel': ['rbf', 'sigmoid']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
     'kernel': ['linear']},
    {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
     'degree': [2, 3, 4, 5, 6],
     'kernel': ['poly']}
]
grid = GridSearchCV(SVC(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'C': [ 0.1, 1, 10, 100, 1000], 
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1],
     'kernel': ['rbf']},
    {'C': [0.1, 1, 10, 100, 1000],
     'kernel': ['linear']},
    {'C': [0.1, 1, 10, 100, 1000],
     'gamma': [0.0001, 0.001, 0.01, 0.1, 1],
     'degree': [2, 3],
     'kernel': ['poly']}
]
grid = GridSearchCV(SVC(), param_grid, verbose=2)

## Decision Tree Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

No need for feature scaling

Non-linear

Non-continuous

### Fitting

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
# Fit on the training split only, like the other models here, so the test set stays unseen
classifier.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20, 30, 40, 50, 80, 90, 100, 110],
    # None means "use all features"; the old 'auto' option is no longer accepted
    'max_features': [2, 3, 5, 10, None, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 15, 100],
    'min_samples_split': [2, 5, 8, 10, 12, 15, 20]
    # n_estimators is a forest parameter; a single tree rejects it, so it is not searched here
}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 40, 70, 100],
    'max_features': [2, 3, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 6, 10, 14],
    'min_samples_split': [2, 6, 10, 14]
    # n_estimators is a forest parameter; a single tree rejects it, so it is not searched here
}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, verbose=2)

## Random Forest Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

No need for feature scaling

Non-linear

Non-continuous

### Fitting

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Named `classifier` like every other classification model here, so the shared
# prediction and visualisation cells pick it up (it was stored in `regressor`)
classifier = RandomForestClassifier(n_estimators=200)
classifier.fit(X_train, y_train)

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'max_depth': [None, 5, 10, 20, 30, 40, 50, 80, 90, 100, 110],
    # 'auto' was an alias of 'sqrt' for forest classifiers and is no longer accepted
    'max_features': [2, 3, 5, 10, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 15, 100],
    'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
    'n_estimators': [100, 200, 300, 500, 800, 1000, 1500, 2500]
}
grid = GridSearchCV(RandomForestClassifier(), param_grid, verbose=2)

### Hyperparameter tuning using grid search (less in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 40, 70, 100],
    'max_features': [2, 3, 'sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 6, 10, 14],
    'min_samples_split': [2, 6, 10, 14],
    'n_estimators': [100, 300]
}
grid = GridSearchCV(RandomForestClassifier(), param_grid, verbose=2)

## Naive Bayes

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

Needs feature scaling

Non-linear

Continuous

Few hyperparameters to tune (for MultinomialNB, mainly the smoothing parameter `alpha`)

### Fitting

In [None]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

### Hyperparameter tuning using grid search (less in-depth)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'alpha': [0.001, 0.01, 1.0]
}
grid = GridSearchCV(MultinomialNB(), param_grid, verbose=2)

# Unsupervised learning

## k-Means

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

Needs feature scaling

### Elbow method to find optimal number of clusters

In [None]:
from sklearn.cluster import KMeans
# Within-cluster sum of squares for k = 2..20; the "elbow" of the curve suggests a good k
k_values = range(2, 21)
wcss = []

for k in k_values:
    kmeans_elbow = KMeans(n_clusters=k).fit(X)
    wcss.append(kmeans_elbow.inertia_)

plt.figure(figsize=(10,6))
plt.plot(k_values, wcss, ls='--', marker='o', markerfacecolor='red', markersize=10)
plt.title('WCSS vs k')
plt.xlabel('k')
plt.ylabel('WCSS')

### Fitting

In [None]:
kmeans = KMeans(n_clusters=8) # Change to the k found with the elbow method
# Fit on X, the same array used by the elbow search and by the cluster plot, so labels_ lines up with it
kmeans.fit(X)

### Visualising the clusters

In [None]:
# Enough colours for the 8 clusters fitted above; the modulo keeps larger k from raising IndexError
colors = ['red', 'blue', 'green', 'cyan', 'magenta', 'orange', 'purple', 'brown']

# One scatter call per cluster (first two features) so each gets its own colour and legend entry
for i in range(kmeans.n_clusters):
    plt.scatter(X[kmeans.labels_ == i, 0], X[kmeans.labels_ == i, 1], c=colors[i % len(colors)], s=50, label='Cluster ' + str(i))

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=150, c='black', label='Centroids')  
plt.title('Clusters')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()

## Hierarchical Clustering

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering

Needs feature scaling

### Dendrogram to find optimal number of clusters

In [None]:
import scipy.cluster.hierarchy as sch
dendogram = sch.dendrogram(sch.linkage(X, metric='euclidean', method='ward'), no_labels=True)
plt.title('Dendogram')
plt.xlabel('Rows')
plt.ylabel('Euclidean distance')

### Fitting

In [None]:
from sklearn.cluster import AgglomerativeClustering
# Ward linkage only works with Euclidean distance, which is already the default,
# so the `affinity` argument (no longer accepted by current scikit-learn) is simply dropped
hc = AgglomerativeClustering(n_clusters=3, linkage='ward')
y_pred = hc.fit_predict(X)

### Visualising the clusters

In [None]:
colors = ['red', 'blue', 'green', 'cyan', 'magenta']

for i in range(hc.n_clusters):
    plt.scatter(X[hc.labels_ == i, 0], X[hc.labels_ == i, 1], c=colors[i], s=50, label='Cluster ' + str(i))

plt.title('Clusters')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()

# Reinforcement Learning

## Upper Confidence Bound

### Fitting

In [None]:
import math

# Upper Confidence Bound: each round, play the option whose average reward plus
# exploration bonus is highest, then record the reward observed in df for that round
N, d = df.shape[0], df.shape[1]
options_selected = []
numbers_of_selections = [1] * d # starts at 1 so the divisions below never hit zero
sums_of_rewards = [0] * d
total_reward = 0

for n in range(N):
    upper_bounds = []
    for i in range(d):
        average_reward = sums_of_rewards[i] / numbers_of_selections[i]
        delta_i = math.sqrt(3/2 * math.log(n+1) / numbers_of_selections[i])
        upper_bounds.append(average_reward + delta_i)
    # First option with the strictly largest positive bound; option 0 when no bound exceeds 0
    option, max_upper_bound = 0, 0
    for i, upper_bound in enumerate(upper_bounds):
        if upper_bound > max_upper_bound:
            option, max_upper_bound = i, upper_bound
    options_selected.append(option)
    numbers_of_selections[option] += 1
    reward = df.iloc[n, option]
    sums_of_rewards[option] += reward
    total_reward += reward

### Visualising the options selected

In [None]:
plt.hist(options_selected)
plt.title('Histogram of options selections')
plt.xlabel('Options')
plt.ylabel('Number of times each option was selected by UCB')

## Thompson Sampling

### Fitting

In [None]:
import random

# Thompson Sampling: each round, draw one Beta sample per option from its
# success/failure counts, play the option with the largest draw, then update its counts
N, d = df.shape[0], df.shape[1]
options_selected = []
number_of_rewards_1 = [0] * d
number_of_rewards_0 = [0] * d
total_reward = 0

for n in range(N):
    # One draw per option, in option order
    draws = [random.betavariate(number_of_rewards_1[i] + 1, number_of_rewards_0[i] + 1) for i in range(d)]
    # First option with the strictly largest positive draw; option 0 when no draw exceeds 0
    option, max_random_beta = 0, 0
    for i, random_beta in enumerate(draws):
        if random_beta > max_random_beta:
            option, max_random_beta = i, random_beta
    options_selected.append(option)
    reward = df.iloc[n, option]
    counts = number_of_rewards_1 if reward == 1 else number_of_rewards_0
    counts[option] += 1
    total_reward += reward

### Visualising the options selected

In [None]:
plt.hist(options_selected)
plt.title('Histogram of ads selections')
plt.xlabel('Ads')
plt.ylabel('Number of times each ad was selected by the algorithm')

# Association Rule Learning

## Apriori

https://pypi.org/project/apyori/

### Visualising items' frequency

In [None]:
df.stack().value_counts(normalize=True)[:10].plot(kind='bar', title='Relative Frequency')
df.stack().value_counts().apply(lambda item: item / df.shape[0])[:10].plot(kind='bar', title='Frequency')

### Formatting the dataset (model's input is a list of lists)

In [None]:
# One list of items per row of df, with the empty (NaN) cells removed
transactions = [df.iloc[i].dropna().tolist() for i in range(df.shape[0])]

### Fitting

In [None]:
from apyori import apriori
rules = apriori(transactions=transactions, 
                min_support=0.2, # how frequent is your item(s) in the dataset 
                min_confidence=0.6, # how often your rule will work
                min_lift=2, # how better off you are compared to pure randomness
                min_length=0, 
                max_length=2)
results = list(rules)

### Organizing the output

In [None]:
def inspect(results):
    """Flatten apriori records into one row per rule.

    Each record is indexed as (items, support, rules) and each rule as
    (left-hand side, right-hand side, confidence, lift).

    Returns a list of tuples
    (item set, left-hand side, right-hand side, support, confidence, lift).
    """
    rows = []
    for record in results:
        items, support = tuple(record[0]), record[1]
        for rule in record[2]:
            rows.append((items, tuple(rule[0]), tuple(rule[1]), support, rule[2], rule[3]))
    return rows
results_df = pd.DataFrame(inspect(results),
                          columns = ['Item Set', 'Left Hand Side', 'Right Hand Side', 'Support', 'Confidence', 'Lift'])

## Eclat

https://pypi.org/project/apyori/

### Visualising items' frequency

In [None]:
df.stack().value_counts(normalize=True)[:10].plot(kind='bar', title='Relative Frequency')
df.stack().value_counts().apply(lambda item: item / df.shape[0])[:10].plot(kind='bar', title='Frequency')

### Formatting the dataset (model's input is a list of lists)

In [None]:
# One list of items per row of df, with the empty (NaN) cells removed
transactions = [df.iloc[i].dropna().tolist() for i in range(df.shape[0])]

### Fitting

In [None]:
# !pip install apyori
from apyori import apriori
rules = apriori(transactions=transactions, 
                min_support=0.2, # how frequent is your item(s) in the dataset
                min_length=0, 
                max_length=2)
results = list(rules)

### Organizing the output

In [None]:
def inspect(results):
    """Reduce each record to an (item set, support) pair.

    Each record is indexed as (items, support, ...); anything after the
    support is ignored.
    """
    return [(tuple(record[0]), record[1]) for record in results]
results_df = pd.DataFrame(inspect(results), 
                          columns = ['Item Set', 'Support'])

# Natural Language Processing

## Bag of Words and TF-IDF

Example using Linear SVM, Kernel SVM and Naive Bayes

### Text process

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def text_process(document):
    """Turn a raw document into a list of stemmed, lower-case tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    removed (except 'not', which is kept), and the surviving words are
    reduced to their Porter stems.
    """
    # Build the stop-word set once per call; the original rebuilt it for every word
    removed_words = set(stopwords.words('english'))
    removed_words.discard('not') # keep the negation
    stemmer = PorterStemmer()
    words = re.sub('[^a-zA-Z]', ' ', document).lower().split()
    return [stemmer.stem(word) for word in words if word not in removed_words]

### Hyperparameter tuning using grid search (more in-depth)

In [None]:
# Keys are '<pipeline step>__<parameter>', matching the step names used in the pipelines
linear_SVM_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2), (1 ,3), (1, 4), (2, 2)],
    'bag_of_words__max_df': [0.25, 0.5, 0.75, 0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05, 0.1, 0.15, 0.2],
    'bag_of_words__binary': [True, False],
    'estimator__kernel': ['linear'],
    'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

kernel_SVM_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2), (1 ,3), (1, 4), (2, 2)],
    'bag_of_words__max_df': [0.25, 0.5, 0.75, 0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05, 0.1, 0.15, 0.2],
    'bag_of_words__binary': [True, False],
    'estimator__kernel': ['rbf'],
    'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'estimator__gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

# nb_param_grid is the name the Fitting cell reads; the old name is kept as an alias
nb_param_grid = naive_bayes_SVM_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2), (1 ,3), (1, 4), (2, 2)],
    'bag_of_words__max_df': [0.25, 0.5, 0.75, 0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05, 0.1, 0.15, 0.2],
    'bag_of_words__binary': [True, False],
    'estimator__alpha': [0.001, 0.01, 1.0]
}

### Hyperparameter tuning using grid search (less in-depth)

In [None]:
# Keys are '<pipeline step>__<parameter>', matching the step names used in the pipelines
linear_SVM_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2)],
    'bag_of_words__max_df': [0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05],
    'estimator__kernel': ['linear'],
    'estimator__C': [0.1, 1, 10, 100, 1000]
}

kernel_SVM_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2)],
    'bag_of_words__max_df': [0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05],
    # The duplicate 'estimator__kernel': ['linear'] entry was silently overridden by this one
    'estimator__kernel': ['rbf'],
    'estimator__C': [0.1, 1, 10, 100, 1000],
    'estimator__gamma': [0.0001, 0.001, 0.01, 0.1, 1]
}

nb_param_grid = {
    'bag_of_words__ngram_range': [(1, 1), (1, 2)],
    'bag_of_words__max_df': [0.85, 1.0],
    'bag_of_words__min_df': [0.01, 0.05],
    # MultinomialNB has no kernel parameter, so only alpha is searched for the estimator
    'estimator__alpha': [0.001, 0.01, 1.0]
}

### Creating a pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Text-classification pipeline: token counts -> TF-IDF weighting -> SVC.
# The step names are the prefixes used by the `*_SVM_param_grid` keys.
# NOTE(review): scikit-learn documents `ngram_range` as applying only when
# `analyzer` is not callable; with analyzer=text_process the
# 'bag_of_words__ngram_range' grid entries probably have no effect - confirm.
SVM_pipe = Pipeline(steps=[
    ('bag_of_words', CountVectorizer(analyzer=text_process)),
    ('tf_idf', TfidfTransformer()),
    ('estimator', SVC()),
])

# Text-classification pipeline: token counts -> TF-IDF weighting ->
# Multinomial Naive Bayes. Step names match the prefixes of the NB grid keys.
nb_pipe = Pipeline(steps=[
    ('bag_of_words', CountVectorizer(analyzer=text_process)),
    ('tf_idf', TfidfTransformer()),
    ('estimator', MultinomialNB()),
])

### Fitting

In [None]:
from sklearn.model_selection import GridSearchCV
# Every search below uses 2-fold cross-validation and verbose=2 progress
# output; each one is fitted on the training split only.

# Linear-kernel SVM search.
linear_SVM_grid = GridSearchCV(
    estimator=SVM_pipe,
    param_grid=linear_SVM_param_grid,
    cv=2,
    verbose=2,
)
linear_SVM_grid.fit(X_train, y_train)

# RBF-kernel SVM search (same pipeline, different grid).
kernel_SVM_grid = GridSearchCV(
    estimator=SVM_pipe,
    param_grid=kernel_SVM_param_grid,
    cv=2,
    verbose=2,
)
kernel_SVM_grid.fit(X_train, y_train)

# Naive Bayes search.
nb_grid = GridSearchCV(
    estimator=nb_pipe,
    param_grid=nb_param_grid,
    cv=2,
    verbose=2,
)
nb_grid.fit(X_train, y_train)

### Getting attributes (if using Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Rebuild the text pipeline with a Random Forest as the final step.
rf_pipe = Pipeline([
    ('bag_of_words', CountVectorizer(analyzer=text_process)),
    ('tf_idf', TfidfTransformer()),
    ('estimator', RandomForestClassifier()),
])

# Apply the tuned hyperparameters. `rf_grid` is assumed to be a fitted
# GridSearchCV over a pipeline with these same step names (it is not defined
# in this cell); its best_params_ keys then have the '<step>__<param>' form
# that Pipeline.set_params expects.
rf_pipe.set_params(**rf_grid.best_params_)
rf_pipe.fit(X_train, y_train)

vectorizer = rf_pipe.named_steps['bag_of_words']
forest = rf_pipe.named_steps['estimator']

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One row per vocabulary term, holding the forest's importance for it.
feature_importance = pd.DataFrame(forest.feature_importances_,
                                  index=feature_names,
                                  columns=['importance'])
# Show the 20 most important terms.
feature_importance.sort_values('importance', ascending=False).head(20)

# Recommender System

## Preprocess

### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows of df as test data. No random_state is passed,
# so the split is not reproducible between runs.
train_data, test_data = train_test_split(df, test_size=0.25)

### Formatting the dataset

In [None]:
# Matrix dimensions: number of distinct user ids x number of distinct item ids
# found in the full dataset.
n_users = df.user_id_col_name.nunique()
n_items = df.item_id_col_name.nunique()
matrix_shape = (n_users, n_items)

# Dense user x item matrix of the training ratings; cells that receive no
# rating keep their initial value of 0.
train_data_matrix = np.zeros(matrix_shape)
for row in train_data.itertuples():
    # The "-1" is used if the user_id and/or item_id starts at 1
    user_idx = row[user_col_position] - 1
    item_idx = row[item_col_position] - 1
    train_data_matrix[user_idx, item_idx] = row[rating_col_index]

# Same layout for the held-out ratings.
test_data_matrix = np.zeros(matrix_shape)
for row in test_data.itertuples():
    # The "-1" is used if the user_id and/or item_id starts at 1
    user_idx = row[user_col_position] - 1
    item_idx = row[item_col_position] - 1
    test_data_matrix[user_idx, item_idx] = row[rating_col_index]

## Collaborative Filtering Memory-Based User-Based

### Calculating similarity

Using Cosine similarity:

<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_u^{cos}(u_k,u_a)=\frac{u_k&space;\cdot&space;u_a&space;}{&space;\left&space;\|&space;u_k&space;\right&space;\|&space;\left&space;\|&space;u_a&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{k,m}x_{a,m}}{\sqrt{\sum&space;x_{k,m}^2\sum&space;x_{a,m}^2}}"/>

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). The prediction step uses this matrix as similarity
# weights, so convert it; otherwise the least similar users weigh the most.
user_similarity = 1 - pairwise_distances(train_data_matrix, metric='cosine')

### Predicting

User-based:
<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?\hat{x}_{k,m}&space;=&space;\bar{x}_{k}&space;&plus;&space;\frac{\sum\limits_{u_a}&space;sim_u(u_k,&space;u_a)&space;(x_{a,m}&space;-&space;\bar{x_{u_a}})}{\sum\limits_{u_a}|sim_u(u_k,&space;u_a)|}"/>

In [None]:
def predict_user(ratings, similarity):
    """Predict ratings with user-based collaborative filtering.

    Each prediction is the user's own mean rating plus the similarity-weighted
    average of the other users' deviations from their own means.

    Args:
        ratings: 2-D array, one row per user and one column per item.
        similarity: 2-D square array with one row/column per user.

    Returns:
        2-D array with the same shape as ``ratings``.
    """
    # Keep the per-user mean as a column so it broadcasts across items.
    user_means = ratings.mean(axis=1)[:, np.newaxis]
    centered = ratings - user_means
    # Normalizer: total absolute similarity weight per user, as a column.
    weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
    return user_means + similarity.dot(centered) / weight_totals
# Predicted rating for every user/item cell, computed from the training matrix.
user_prediction = predict_user(train_data_matrix, user_similarity)

### Evaluating

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    """Return the root-mean-squared error over the rated cells only.

    Only positions where ``ground_truth`` is non-zero are compared; all other
    cells are ignored.

    Args:
        prediction: 2-D array of predicted ratings.
        ground_truth: 2-D array of actual ratings, indexed like ``prediction``.

    Returns:
        The RMSE as a float.
    """
    rated = ground_truth.nonzero()
    predicted_ratings = prediction[rated].flatten()
    actual_ratings = ground_truth[rated].flatten()
    return sqrt(mean_squared_error(predicted_ratings, actual_ratings))
# Score the user-based predictions against the held-out ratings
# (rmse only looks at cells where test_data_matrix is non-zero).
print('Collaborative Filtering Memory-Based User-Based RMSE: ' + str(rmse(user_prediction, test_data_matrix)))

## Collaborative Filtering Memory-Based Item-Based

### Calculating similarity

Using Cosine similarity:

<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_i^{cos}(i_m,i_b)=\frac{i_m&space;\cdot&space;i_b&space;}{&space;\left&space;\|&space;i_m&space;\right&space;\|&space;\left&space;\|&space;i_b&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{a,m}x_{a,b}}{\sqrt{\sum&space;x_{a,m}^2\sum&space;x_{a,b}^2}}"/>

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). The prediction step uses this matrix as similarity
# weights, so convert it; otherwise the least similar items weigh the most.
# The transpose puts items on the rows so that items are compared pairwise.
item_similarity = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')

### Predicting

Item-based:
<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?\hat{x}_{k,m}&space;=&space;\frac{\sum\limits_{i_b}&space;sim_i(i_m,&space;i_b)&space;(x_{k,b})&space;}{\sum\limits_{i_b}|sim_i(i_m,&space;i_b)|}"/>

In [None]:
def predict_item(ratings, similarity):
    """Predict ratings with item-based collaborative filtering.

    Each prediction is the similarity-weighted sum of the user's ratings,
    divided by the total absolute similarity weight of the target item.

    Args:
        ratings: 2-D array, one row per user and one column per item.
        similarity: 2-D square array with one row/column per item.

    Returns:
        2-D array with the same shape as ``ratings``.
    """
    # Normalizer: per-item total absolute weight, as a row so it broadcasts
    # across users.
    weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
    return ratings.dot(similarity) / weight_totals
# Predicted rating for every user/item cell, computed from the training matrix.
item_prediction = predict_item(train_data_matrix, item_similarity)

### Evaluating

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    """Return the root-mean-squared error over the rated cells only.

    Only positions where ``ground_truth`` is non-zero are compared; all other
    cells are ignored.

    Args:
        prediction: 2-D array of predicted ratings.
        ground_truth: 2-D array of actual ratings, indexed like ``prediction``.

    Returns:
        The RMSE as a float.
    """
    rated = ground_truth.nonzero()
    prediction = prediction[rated].flatten()
    ground_truth = ground_truth[rated].flatten()
    # The return line was missing its two closing parentheses (SyntaxError).
    return sqrt(mean_squared_error(prediction, ground_truth))
# Score the item-based predictions against the held-out ratings
# (rmse only looks at cells where test_data_matrix is non-zero).
print('Collaborative Filtering Memory-Based Item-Based RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

## Collaborative Filtering Model-Based

### Building the machine learning algorithm

Using Singular Value Decomposition:

<img src="https://latex.codecogs.com/gif.latex?X=USV^T" title="X=USV^T" />

In [None]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Choose k: the number of singular values/vectors to keep in the truncated SVD.
# NOTE(review): per the SciPy docs, k must be smaller than both dimensions of
# train_data_matrix with the default solver - confirm for small datasets.
u, s, vt = svds(train_data_matrix, k = 20)
# svds returns the singular values as a 1-D array; expand them into a diagonal
# matrix so the three factors can be multiplied back together.
s_diag_matrix=np.diag(s)

### Predicting

Using the dot product of *`U`*, *`S`* and *`V^T`*.

In [None]:
# Rank-k reconstruction of the rating matrix: (U . S) . V^T
X_pred = u.dot(s_diag_matrix).dot(vt)

### Evaluating

In [None]:
# X_pred comes from the SVD (model-based) approach and rmse() returns a root
# mean squared error, so the label must say neither "User-based" nor "MSE".
print('Collaborative Filtering Model-Based RMSE: ' + str(rmse(X_pred, test_data_matrix)))