# Data Preprocessing

## Importing the libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [None]:
# Read the CSV and split it by position: every column except the last
# becomes the feature matrix X, the last column becomes the target vector y.
df = pd.read_csv('Data.csv')
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

## Missing data

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaN entries with the mean of their column; fit_transform learns the
# column means and applies them in a single call.
# NOTE(review): the [:, :] slices select every column. Mean imputation presumably
# only applies to numeric columns -- narrow both slices if X contains others.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, :] = imputer.fit_transform(X[:, :])

## Encoding categorical data

### Encoding the independent variable

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the selected column and pass all other columns through unchanged.
# sparse_threshold=0 forces a dense result: otherwise ColumnTransformer may return
# a SciPy sparse matrix, which np.array() would wrap in a 0-d object array
# instead of converting it to a 2-D array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [column_index_to_encode])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

### Encoding the dependent variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct labels present in y, then replace each label
# with its integer code.
le = LabelEncoder().fit(y)
y = le.transform(y)

## Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. A fixed random_state makes the shuffle
# reproducible, so re-running the notebook yields the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to the test rows. The test set must be scaled
# with the training statistics (transform, not fit_transform) so both sets
# share one feature scale and no test information leaks into the fit.
ss = StandardScaler()
X_train[:, columns_to_scale] = ss.fit_transform(X_train[:, columns_to_scale])
X_test[:, columns_to_scale] = ss.transform(X_test[:, columns_to_scale])

# Machine Learning models

## Simple Linear Regression

### Training

In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear model and fit it on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Predicting

In [None]:
# Predict the target for every row of the test split.
y_pred = regressor.predict(X=X_test)

### Visualising the training set results

In [None]:
# Training observations in red, the model's predictions for them in blue.
fig, ax = plt.subplots()
ax.scatter(X_train, y_train, color='red')
ax.plot(X_train, regressor.predict(X_train), color='blue')
ax.set_title('x vs y (Training set)')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()

### Visualising the test set results

In [None]:
# Test observations in red. The blue line is drawn from the training inputs
# and the model's predictions for them, i.e. the same fitted line as in the
# training plot.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='red')
ax.plot(X_train, regressor.predict(X_train), color='blue')
ax.set_title('x vs y (Test set)')
ax.set_xlabel('x')
ax.set_ylabel('y')
plt.show()

### Predicting y based on x

In [None]:
# Predict for a single observation. The input is wrapped as [[...]] so that
# it is 2-D: one row, one feature.
prediction = regressor.predict([[x_value]])
print(prediction)

### Retrieving the linear equation

In [None]:
# Print the fitted coefficient(s) first, then the intercept, one per line.
print(regressor.coef_, regressor.intercept_, sep='\n')

## Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear model and fit it on the training split
# (all feature columns of X_train are used).
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Predicting

In [None]:
# Predict the target for every row of the test split.
y_pred = regressor.predict(X=X_test)

### Printing predicted values and actual values side-by-side

In [None]:
# Show predictions (left column) next to the actual test targets (right
# column), printed with two digits of precision.
np.set_printoptions(precision=2)
pred_col = y_pred.reshape(len(y_pred), 1)
true_col = y_test.reshape(len(y_test), 1)
print(np.concatenate((pred_col, true_col), axis=1))

### Predicting y based on x

In [None]:
# Predict for a single observation with two features. The input is wrapped
# as [[...]] so that it is 2-D: one row, two columns.
prediction = regressor.predict([[x1_value, x2_value]])
print(prediction)

### Retrieving the linear equation

In [None]:
# Print the fitted coefficients first, then the intercept, one per line.
print(regressor.coef_, regressor.intercept_, sep='\n')

## Polynomial Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into polynomial terms up to the requested
# degree, then fit an ordinary linear model on the expanded features.
poly_features = PolynomialFeatures(degree=number_of_polynomials).fit(X_train)
X_train_poly = poly_features.transform(X_train)
regressor = LinearRegression()
regressor.fit(X=X_train_poly, y=y_train)

### Visualising the training set results

In [None]:
# Training observations in red, fitted polynomial curve in blue.
# plt.plot joins points in the order they are given, and the rows of X_train
# are in shuffled order after train_test_split, so sort by x first; otherwise
# the curve is drawn as a tangled zig-zag.
# Assumes X_train has a single feature column (as the scatter call also does).
order = np.argsort(X_train[:, 0])
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train[order, 0], regressor.predict(X_train_poly)[order], color='blue')
plt.title('x vs y (Training set)')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

### Visualising the test set results

In [None]:
# Test observations in red, the polynomial curve fitted on the training data
# in blue.
# plt.plot joins points in the order they are given, and the rows of X_train
# are in shuffled order after train_test_split, so sort by x first; otherwise
# the curve is drawn as a tangled zig-zag.
# Assumes X_train has a single feature column (as the scatter call also does).
order = np.argsort(X_train[:, 0])
plt.scatter(X_test, y_test, color='red')
plt.plot(X_train[order, 0], regressor.predict(X_train_poly)[order], color='blue')
# This cell plots the test set, so the title must say so.
plt.title('x vs y (Test set)')
plt.xlabel('x')
plt.ylabel('y')
plt.show()

### Predicting y based on x

In [None]:
# Predict for a single observation. Use transform (not fit_transform) so the
# polynomial expansion fitted on the training data is reused as-is instead of
# being refitted on the value being predicted.
print(regressor.predict(poly_features.transform([[x_value]])))