In [None]:
# importing essential libs
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import datasets, linear_model
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the diabetes dataset that ships with scikit-learn
diabetes = datasets.load_diabetes()
# Labels for the ten feature columns
columns = ['age', 'sex', 'bmi', 'map', 'tc', 'ldl', 'hdl', 'tch', 'ltg', 'glu']
# Wrap the feature matrix in a labelled pandas DataFrame
df = pd.DataFrame(data=diabetes.data, columns=columns)
# Target (dependent) variable
y = diabetes.target

In [None]:
# Preview the first rows of the feature DataFrame.
df.head()

In [None]:
# Peek at the first five target values.
y[:5]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)
# Shapes of the training pair, then of the test pair
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
# Fit a linear regression on the training split and predict the held-out rows
lm = linear_model.LinearRegression()
model = lm.fit(X=X_train, y=y_train)
predictions = lm.predict(X=X_test)

In [None]:
# Score the fitted model on the test split.
model.score(X_test, y_test)

In [None]:
## Scatter of predicted against true target values for the test split
plt.scatter(y_test, predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')

In [None]:
# Root-mean-squared error on the test and training splits.
# mean_squared_error is documented as (y_true, y_pred), so pass the true values first.
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, lm.predict(X_train))))

# Feature Scaling

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
  
# Load the regression dataset from 'regression.csv' and preview its first rows
data = pd.read_csv('regression.csv') 
data.head()

In [None]:
# Use the 'Value' column as the regression target.
y = data['Value']

In [None]:
# Keep only the feature columns: drop the listed columns, including the target 'Value'
data = data.drop(
    columns=[
        'Name', 'Nationality', 'Club', 'Wage', 'Position',
        'Weight', 'Height', 'Contract Valid Until', 'Value',
    ]
)

In [None]:
# Preview the frame after the column drop.
data.head()

In [None]:
# Hold out 20% of the rows for testing. The seed matches the other splits in this
# notebook so that re-running the cell reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=42)

#### MinMaxScaler

In [None]:
# Create a MinMaxScaler and fit it on the training features only
norm = MinMaxScaler().fit(X_train)

In [None]:
# Replace the training features with their scaled values, keeping the column labels
X_train = pd.DataFrame(norm.transform(X_train), columns=X_train.columns)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Apply the already-fitted scaler to the test features, keeping the column labels
X_test = pd.DataFrame(norm.transform(X_test), columns=X_test.columns)

In [None]:
# Display the scaled test features.
X_test

### Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Create a StandardScaler and fit it on the current training features
norm = StandardScaler().fit(X_train)

In [None]:
# Overwrite the training features with the scaler's output, keeping the column labels
X_train = pd.DataFrame(norm.transform(X_train), columns=X_train.columns)

In [None]:
# Histogram of the scaled 'Age' column
plt.hist(X_train['Age'])
plt.show()

In [None]:
# Display the test features after the fitted scaler is applied (result is not stored)
pd.DataFrame(norm.transform(X_test), columns=X_test.columns)

# tasks

###### Try applying these scalers to any dataset and check the regression results

In [None]:
# Reload the diabetes data so the task starts from unscaled features
diabetes = datasets.load_diabetes()
columns = ['age', 'sex', 'bmi', 'map', 'tc', 'ldl', 'hdl', 'tch', 'ltg', 'glu']
# Labelled feature frame and the target vector
df = pd.DataFrame(data=diabetes.data, columns=columns)
y = diabetes.target

In [None]:
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create a MinMaxScaler and fit it on the training features only
norm = MinMaxScaler().fit(X_train)

In [None]:
# Swap the training features for their scaled values and display the result
X_train = pd.DataFrame(norm.transform(X_train), columns=X_train.columns)
X_train

In [None]:
# Scale the test features with the fitted scaler, keeping the column labels
X_test = pd.DataFrame(norm.transform(X_test), columns=X_test.columns)
# Fit the linear model on the scaled training data, then predict the scaled test rows
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
# Score the model fitted on the scaled features against the test split.
model.score(X_test, y_test)

In [None]:
# Root-mean-squared error on the test and training splits.
# mean_squared_error is documented as (y_true, y_pred), so pass the true values first.
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, lm.predict(X_train))))

In [None]:
# Re-split the unscaled frame (same seed, same 80/20 ratio) before trying the next scaler
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create a StandardScaler and fit it on the training features only
norm = StandardScaler().fit(X_train)

In [None]:
# Swap the training features for their scaled values and display the result
X_train = pd.DataFrame(norm.transform(X_train), columns=X_train.columns)
X_train

In [None]:
# Scale the test features with the fitted scaler, keeping the column labels
X_test = pd.DataFrame(norm.transform(X_test), columns=X_test.columns)
# Fit the linear model on the scaled training data, then predict the scaled test rows
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
# Score the model fitted on the scaled features against the test split.
model.score(X_test, y_test)

In [None]:
# Root-mean-squared error on the test and training splits.
# mean_squared_error is documented as (y_true, y_pred), so pass the true values first.
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, lm.predict(X_train))))

# One Hot Encoding

In [None]:
# Reload 'regression.csv' so the dropped columns are available again, then preview it
data = pd.read_csv('regression.csv') 
data.head()

In [None]:
# One-hot encode the 'Position' column (result is displayed, not stored).
pd.get_dummies(data[['Position']])

# tasks

###### Try applying these scalers to any column(s) and check the regression results

In [None]:
# Start the task from a fresh copy of 'regression.csv' and preview it
data = pd.read_csv('regression.csv') 
data.head() 

In [None]:
# Separate the regression target from the feature columns
y = data['Value']
data = data.drop(
    columns=[
        'Name', 'Nationality', 'Club', 'Wage', 'Position',
        'Contract Valid Until', 'Height', 'Weight', 'Value',
    ]
)

In [None]:
# Preview the remaining feature columns.
data.head()

In [None]:
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create a MinMaxScaler and fit it on the training features only.
norm = MinMaxScaler().fit(X_train)

In [None]:
# Swap the training features for their scaled values and display the result
X_train = pd.DataFrame(norm.transform(X_train), columns=X_train.columns)
X_train

In [None]:
# Scale the test features with the fitted scaler, keeping the column labels
X_test = pd.DataFrame(norm.transform(X_test), columns=X_test.columns)
# Fit the linear model on the scaled training data, then predict the scaled test rows
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
# Score the model fitted on the scaled features against the test split.
model.score(X_test, y_test)

In [None]:
# Root-mean-squared error on the test and training splits.
# mean_squared_error is documented as (y_true, y_pred), so pass the true values first.
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, lm.predict(X_train))))

In [None]:
# Reload the CSV, pull out the target, and keep only the feature columns
data = pd.read_csv('regression.csv')
y = data['Value']
data = data.drop(
    columns=[
        'Name', 'Nationality', 'Club',
        'Wage', 'Position', 'Contract Valid Until',
        'Height', 'Weight', 'Value',
    ]
)

In [None]:
# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create a StandardScaler and fit it on the training features only.
norm = StandardScaler().fit(X_train)

In [None]:
# Replace the training features with their scaled values, keeping the column labels
X_train = pd.DataFrame(norm.transform(X_train), columns=X_train.columns)

In [None]:
# Scale the test features with the fitted scaler, keeping the column labels
X_test = pd.DataFrame(norm.transform(X_test), columns=X_test.columns)
# Fit the linear model on the scaled training data, then predict the scaled test rows
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
# Score the model fitted on the scaled features against the test split.
model.score(X_test, y_test)

In [None]:
# Root-mean-squared error on the test and training splits.
# mean_squared_error is documented as (y_true, y_pred), so pass the true values first.
print('Test RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
print('Train RMSE:', np.sqrt(mean_squared_error(y_train, lm.predict(X_train))))

## Outliers


In [None]:
import numpy as np
import matplotlib.pyplot as plt


# Draw 50,000 standard-normal samples from a seeded generator (so the outlier demo
# is reproducible), then scale by 20 and shift by 20.
rng = np.random.default_rng(42)
data = rng.standard_normal(50000) * 20 + 20

# Function to detect outliers in one-dimensional datasets.
def find_anomalies(data, n_std=3):
    """Return the values lying more than ``n_std`` standard deviations from the mean.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of numeric values.
    n_std : float, optional
        Half-width of the accepted band, expressed in standard deviations
        (``np.std`` of ``data``). Defaults to 3.

    Returns
    -------
    anomalies : list
        Values strictly above ``upper_limit`` or strictly below ``lower_limit``,
        in their original order.
    lower_limit : float
        ``mean - n_std * std``.
    upper_limit : float
        ``mean + n_std * std``.
    """
    values = np.asarray(data)

    # Band of accepted values: mean +/- n_std standard deviations
    data_mean = np.mean(values)
    anomaly_cut_off = np.std(values) * n_std
    lower_limit = data_mean - anomaly_cut_off
    upper_limit = data_mean + anomaly_cut_off

    # A boolean mask replaces the element-by-element Python loop
    outside_band = (values > upper_limit) | (values < lower_limit)
    anomalies = list(values[outside_band])
    return anomalies, lower_limit, upper_limit

# Collect the flagged values together with the lower and upper cut-offs.
anomalies, lower, upper = find_anomalies(data)

In [None]:
# Overlay the flagged values (red) on the full distribution and mark both cut-offs
plt.hist(data, alpha=0.5, bins=30)
plt.hist(anomalies, color="r", bins=30)
# axvline spans the full axes height, so the lines no longer rely on a
# hard-coded bar height of 6000
plt.axvline(lower, linestyle="dashed", color="g")
plt.axvline(upper, linestyle="dashed", color="g")

# Call show(); a bare `plt.show` only references the function and never runs it
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of the same data, as a second view of its spread.
sns.boxplot(data=data)