# Data Preprocessing

## Importing the libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [None]:
# Read the CSV, then split it column-wise: everything except the last
# column goes to X, the last column alone goes to y (both as NumPy arrays).
df = pd.read_csv('Data.csv')
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

## Missing data

In [None]:
from sklearn.impute import SimpleImputer
# Fill every NaN with the mean of its column. The result is written back
# through a full slice so X keeps its identity (and dtype) instead of being
# rebound to a new array.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, :] = imputer.fit(X[:, :]).transform(X[:, :])

## Encoding categorical data

### Encoding the independent variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the column at `column_index_to_encode` (a placeholder to be
# set by the user); remainder='passthrough' keeps all other columns as they are.
one_hot_step = ('encoder', OneHotEncoder(), [column_index_to_encode])
ct = ColumnTransformer(transformers=[one_hot_step], remainder='passthrough')
encoded = ct.fit_transform(X)
X = np.array(encoded)

### Encoding the dependent variable

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of distinct labels from y, then replace y by its integer codes.
# `le` is kept so the codes can be mapped back later with le.inverse_transform.
le = LabelEncoder().fit(y)
y = le.transform(y)

## Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set. No random_state is given, so the
# partition differs from run to run.
split_parts = train_test_split(X, y, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

## Feature Scaling

### Scaling features

In [None]:
from sklearn.preprocessing import StandardScaler
# The scaler learns its statistics from the training features only; the same
# fitted scaler is then applied to both the training and the test features.
ss_X = StandardScaler().fit(X_train)
X_train = ss_X.transform(X_train)
X_test = ss_X.transform(X_test)

### Scaling labels

In [None]:
from sklearn.preprocessing import StandardScaler
# StandardScaler works on 2-D input, so the label vectors become single columns.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
ss_y = StandardScaler()
# Fit on the training labels only ...
y_train = ss_y.fit_transform(y_train)
# ... and reuse those statistics for the test labels. Calling fit_transform
# here would refit ss_y on the test set, putting train and test labels on
# different scales and making ss_y.inverse_transform wrong for anything the
# model predicts.
y_test = ss_y.transform(y_test)

### Returning to original values

In [None]:
# Map the scaled features and labels back to their original units.
# Nothing is assigned here, so X_train / X_test / y_train / y_test themselves
# stay scaled; in a notebook only the value of the last expression is shown.
ss_X.inverse_transform(X_train)
ss_X.inverse_transform(X_test)
ss_y.inverse_transform(y_train)
ss_y.inverse_transform(y_test)

# Visualisation

## Regression

### Visualising the testing set results

In [None]:
# Red dots: the observations being inspected. Blue line: the model's
# predictions evaluated at the training inputs.
plt.scatter(X_test, y_test, color='red') # Use train for training set
fitted_on_train = regressor.predict(X_train)
plt.plot(X_train, fitted_on_train, color='blue')
plt.xlabel('x')
plt.ylabel('y')
plt.title('y vs x (Test set)')

### Visualising the test set results (higher resolution)

In [None]:
# Evaluate the model on a dense grid (step 0.1) spanning the observed feature
# range so the fitted curve looks smooth. Intended for a single-feature X.
# np.min/np.max return scalars even for a 2-D X (the builtin min/max would
# return 1-element arrays), and reshape(-1, 1) makes the grid a column without
# referring to the undefined name `X_grid` used previously.
X_grid_train = np.arange(np.min(X), np.max(X), 0.1).reshape(-1, 1)
plt.scatter(X_test, y_test, color='red') # Use train for training set
plt.plot(X_grid_train, regressor.predict(X_grid_train), color='blue')
plt.title('y vs x (Test set)')
plt.xlabel('x')
plt.ylabel('y')

### Visualising the test set results (higher resolution, with feature scaling)

In [None]:
# Smooth curve for a model trained on scaled data, drawn in original units.
# Assumes X still holds the unscaled single feature (the preprocessing cells
# only scale X_train / X_test) - confirm if X was scaled elsewhere.
X_grid_train = np.arange(np.min(X), np.max(X), 0.1).reshape(-1, 1)
plt.scatter(ss_X.inverse_transform(X_test), ss_y.inverse_transform(y_test), color='red') # Use train for training set
# The grid is in original units, so scale it before predicting, then bring the
# predictions back to original units. Predictions are reshaped to a column
# because inverse_transform expects 2-D input.
grid_pred_scaled = regressor.predict(ss_X.transform(X_grid_train))
grid_pred = ss_y.inverse_transform(np.reshape(grid_pred_scaled, (-1, 1)))
plt.plot(X_grid_train, grid_pred, color='blue')
plt.title('y vs x (Test set)')
plt.xlabel('x')
plt.ylabel('y')

## Classification

### Visualising the testing set results (higher resolution, slower)

In [None]:
from matplotlib.colors import ListedColormap
# Two-feature decision-region plot drawn in original units: the points are
# inverse-transformed for display and every grid point is transformed back
# with ss_X before it is handed to the classifier.
class_colours = ListedColormap(('red', 'green'))
X_set, y_set = ss_X.inverse_transform(X_test), y_test # X_train, y_train for training set
first_axis = np.arange(start=X_set[:, 0].min() - 10, stop=X_set[:, 0].max() + 10, step=0.25)
second_axis = np.arange(start=X_set[:, 1].min() - 1000, stop=X_set[:, 1].max() + 1000, step=0.25)
X1, X2 = np.meshgrid(first_axis, second_axis)
grid_points = np.array([X1.ravel(), X2.ravel()]).T
grid_labels = classifier.predict(ss_X.transform(grid_points)).reshape(X1.shape)
plt.contourf(X1, X2, grid_labels, alpha=0.75, cmap=class_colours)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# One scatter call per distinct label so each class gets its own legend entry.
for i, j in enumerate(np.unique(y_set)):
    in_class = y_set == j
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1], c=class_colours(i), label=j)
plt.title('Classifier Model (Test/Training set)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

### Visualising the training set results (lower resolution, faster)

In [None]:
from matplotlib.colors import ListedColormap
# Two-feature decision-region plot. No scaler is involved in this variant:
# the grid is built from X_set directly and passed to the classifier as-is.
X_set, y_set = X_train, y_train # X_train, y_train for training set
palette = ListedColormap(('red', 'green'))
margin, spacing = 1, 0.01
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - margin, stop=X_set[:, 0].max() + margin, step=spacing),
    np.arange(start=X_set[:, 1].min() - margin, stop=X_set[:, 1].max() + margin, step=spacing),
)
flat_grid = np.array([X1.ravel(), X2.ravel()]).T
regions = classifier.predict(flat_grid).reshape(X1.shape)
plt.contourf(X1, X2, regions, alpha=0.75, cmap=palette)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# Plot each class separately so it gets its own colour and legend entry.
for i, j in enumerate(np.unique(y_set)):
    rows = y_set == j
    plt.scatter(X_set[rows, 0], X_set[rows, 1], c=palette(i), label=j)
plt.title('Classifier Model (Test/Training set)')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

# Predictions

## Predicting using test set

# Predict on the test features; y_pred is what the printing and evaluation
# cells further down compare against y_test.
# NOTE(review): `model` is not assigned in the cells shown here (the model
# sections bind `regressor` / `classifier`) - substitute the fitted estimator.
y_pred = model.predict(X_test)

## Predicting a value

In [None]:
# predict() takes a 2-D input, hence the nested list: one row, one entry per
# feature. x1_value / x2_value are placeholders for the feature values.
single_sample = [[x1_value, x2_value]]
print(model.predict(single_sample))

## Printing predicted values and actual values side-by-side

In [None]:
# Show predictions (left column) next to the true values (right column),
# printed with two decimals.
np.set_printoptions(precision=2)
predicted_col = y_pred.reshape(len(y_pred), 1)
actual_col = y_test.reshape(len(y_test), 1)
print(np.hstack((predicted_col, actual_col)))

## Predicting using test set (with feature scaling)

In [None]:
# Predict on the (already scaled) test features, as the heading says, rather
# than on the full unscaled X, then convert the predictions back to the
# original label units. The predictions are reshaped to a column because
# inverse_transform expects 2-D input and many estimators return a 1-D array.
y_pred = ss_y.inverse_transform(np.reshape(model.predict(X_test), (-1, 1)))

## Predicting a value (with feature scaling)

In [None]:
# The raw feature values are scaled with ss_X before being passed to the model.
# NOTE(review): the printed value is not passed through ss_y - if the labels
# were scaled, confirm whether an inverse transform is wanted here.
scaled_sample = ss_X.transform([[x1_value, x2_value]])
print(model.predict(scaled_sample))

## Printing predicted values and actual values side-by-side (with feature scaling)

In [None]:
# Bring the true labels back to original units, then show predictions (left)
# next to them (right) with two decimals.
# y_test is overwritten here, so running this cell a second time would apply
# the inverse transform twice.
np.set_printoptions(precision=2)
y_test = ss_y.inverse_transform(y_test)
predicted_col = y_pred.reshape(len(y_pred), 1)
actual_col = y_test.reshape(len(y_test), 1)
print(np.hstack((predicted_col, actual_col)))

# Evaluating Model Performance

Metrics: https://scikit-learn.org/stable/modules/classes.html

## Regression

### Regression Error Metrics

In [None]:
from sklearn import metrics
# Printed in order: R^2, mean absolute error, mean squared error, and the
# root of the mean squared error.
print(metrics.r2_score(y_test, y_pred))
print(metrics.mean_absolute_error(y_test, y_pred))
mse = metrics.mean_squared_error(y_test, y_pred)
print(mse)
print(np.sqrt(mse))

### Plotting the residuals

In [None]:
import seaborn as sns  # the import cell at the top of the notebook does not load seaborn
# Flatten both arrays before subtracting: an (n, 1) column minus an (n,)
# vector would broadcast to an (n, n) matrix instead of n residuals.
residuals = np.ravel(y_test) - np.ravel(y_pred)
# histplot with a KDE overlay replaces the deprecated distplot.
sns.histplot(residuals, kde=True, stat='density') # If it's a correct model choice, it should be normally distributed
plt.xlabel('Residuals')

### Plotting the actual and predicted values

In [None]:
import seaborn as sns  # the import cell at the top of the notebook does not load seaborn
# scatterplot needs 1-D vectors; flattening makes the cell work whether the
# labels/predictions are plain vectors or (n, 1) columns.
sns.scatterplot(x=np.ravel(y_test), y=np.ravel(y_pred)) # If the model fitted well, it should be a straight line
plt.xlabel('y_test')
plt.ylabel('y_pred')

## Classification

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# cm is the confusion matrix; unpacking its flattened form into four names
# only works for a 2x2 matrix, i.e. a two-class problem.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
# Left as the final expression so a notebook displays the accuracy.
accuracy_score(y_test, y_pred)

# Regression models

## Simple Linear Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

No need for feature scaling

Linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares fit on the training split; the fitted estimator is
# bound to `regressor`, the name the plotting cells use.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Getting linear equation

In [None]:
# Slope(s) and intercept of the fitted line.
print(regressor.coef_)
print(regressor.intercept_)
# Label each coefficient with its feature name. X_train is a NumPy array here
# (it comes from df.iloc[...].values) and has no `.columns`, so the names are
# taken from the DataFrame instead: every column except the last (target) one.
# np.ravel flattens coef_ in case the model was fitted on a column-shaped y.
pd.DataFrame(np.ravel(regressor.coef_), index=df.columns[:-1], columns=['Coeff'])

## Multiple Linear Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

No need for feature scaling

Linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LinearRegression
# Same estimator as the single-feature case; fitted on the training split
# with however many feature columns X_train has.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Getting linear equation

In [None]:
# Coefficients and intercept of the fitted model, first raw, then as a table
# indexed by every DataFrame column except the last (target) one.
print(regressor.coef_)
print(regressor.intercept_)
feature_names = df.columns[:-1]
pd.DataFrame(data=regressor.coef_, index=feature_names, columns=['Coefficient'])

## Polynomial Linear Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

No need for feature scaling

Non-linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
# Expand the training features into polynomial terms, then fit a plain linear
# regression on the expanded matrix. `number_of_polynomials` is a placeholder
# for the polynomial degree.
poly_features = PolynomialFeatures(degree=number_of_polynomials)
X_train_poly = poly_features.fit(X_train).transform(X_train)
regressor = LinearRegression()
regressor.fit(X=X_train_poly, y=y_train)

### Predicting a value

In [None]:
# Expand the new sample with the transformer that was already fitted on the
# training data. transform() is used instead of fit_transform() so that
# predicting never refits a fitted preprocessing step.
print(regressor.predict(poly_features.transform([[x1_value, x2_value]])))

### Getting linear equation

In [None]:
# Coefficients and intercept of the regression fitted on the polynomial terms.
print(regressor.coef_)
print(regressor.intercept_)
# There is one coefficient per expanded polynomial term, not per raw column,
# and X_train is a NumPy array without `.columns`; the term names therefore
# come from the fitted transformer (e.g. '1', 'x0', 'x0^2', ...).
# np.ravel flattens coef_ in case the model was fitted on a column-shaped y.
pd.DataFrame(np.ravel(regressor.coef_), index=poly_features.get_feature_names_out(), columns=['Coefficient'])

### Visualising the test set results (higher resolution)

In [None]:
# Smooth polynomial curve: evaluate the model on a dense grid (step 0.1) over
# the observed feature range. Intended for a single-feature X.
# reshape(-1, 1) replaces the reference to the undefined name `X_grid`, and
# np.min/np.max return scalars even when X is 2-D.
X_grid_train = np.arange(np.min(X), np.max(X), 0.1).reshape(-1, 1)
plt.scatter(X_test, y_test, color='red')
# Use the transformer fitted in the training cell (`poly_features`; the name
# `poly_reg` is not defined in this notebook) and only transform, never refit.
plt.plot(X_grid_train, regressor.predict(poly_features.transform(X_grid_train)), color='blue')
plt.title('y vs x (Test set)')
plt.ylabel('y')
plt.xlabel('x')

## Support Vector Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html

Needs feature scaling

Non-linear

Continuous

### Fitting

In [None]:
from sklearn.svm import SVR
# The kernel string below is a placeholder and must be replaced by a real
# kernel name before running.
# NOTE(review): if y_train was reshaped to a column by the label-scaling cell,
# fitting may warn about a column-vector y - confirm whether it should be flattened.
regressor = SVR(kernel='kernal_name')
regressor.fit(X=X_train, y=y_train)

## Decision Tree Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

No need for feature scaling

Non-linear

Non-continuous

### Fitting

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fit on the training split only, like every other model cell. Fitting on the
# full X, y would let the tree see the test rows and make any evaluation on
# X_test / y_test meaningless.
regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

## Random Forest Regressor

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

No need for feature scaling

Non-linear

Non-continuous

### Fitting

In [None]:
from sklearn.ensemble import RandomForestRegressor
# `n_of_trees` is a placeholder for the number of trees in the forest.
regressor = RandomForestRegressor(n_estimators=n_of_trees)
regressor.fit(X=X_train, y=y_train)

# Classification models

## Logistic Regression Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

Needs feature scaling

Linear

Continuous

### Fitting

In [None]:
from sklearn.linear_model import LogisticRegression
# Default-parameter logistic regression fitted on the training split; bound to
# `classifier`, the name the classification plots use.
classifier = LogisticRegression()
classifier.fit(X=X_train, y=y_train)

## KNN Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
    
Needs feature scaling

Non-linear

Continuous

### Fitting

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# `n_of_neighbors` and the metric string are placeholders to be replaced by
# the neighbour count and a real distance metric name.
knn_settings = {'n_neighbors': n_of_neighbors, 'metric': 'distance_metric'}
classifier = KNeighborsClassifier(**knn_settings)
classifier.fit(X=X_train, y=y_train)

## SVM Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    
Needs feature scaling

Non-linear (unless using the linear kernel)

Continuous

In [None]:
from sklearn.svm import SVC
# The kernel string is a placeholder and must be replaced by a real kernel
# name before running.
classifier = SVC(kernel='kernel_name')
classifier.fit(X=X_train, y=y_train)