
# Linear Regression with sklearn


Usage of the Linear Regression ML method. It obtains the train/test accuracy and plots the fitted linear regression.

It uses ASTER satellite imagery data of a forested area in Japan, mapped into different forest types based on their spectral characteristics at visible-to-near infrared wavelengths:
https://archive.ics.uci.edu/ml/datasets/Forest+type+mapping# 

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

## 1. Import Japanese Forest Dataset

In [None]:
# Load the Forest Types dataset into a DataFrame.
df = pd.read_csv('../forest.csv')

# Targets: the 'class' column, flattened from (n_samples, 1) to (n_samples,).
y_df = df[['class']]
y = y_df.to_numpy().ravel()

# Data matrix: every column of the DataFrame as one NumPy array
# (note that this still includes the 'class' column itself).
X_tot = df.to_numpy()

# Report the container type and shape of both arrays.
for label, arr in (("X_tot:", X_tot), ("y:", y)):
    print(type(arr), label, arr.shape)


## 2. Obtain train/test accuracy and coefficients
Fits a single-feature linear regression of the target (`class`) on each column of the Forest Dataset in turn and computes its train/test accuracy (R² score). The column with the best test accuracy is then used to obtain the coefficients.

In [None]:
# Fit one single-feature linear regression per column of X_tot and select the
# column whose R^2 score on the held-out rows is the highest.

# R^2 on the training rows and on the held-out rows, one entry per column.
training_accuracy = []
test_accuracy = []
# (test score, column index) pairs that pass the selection filter below.
candidates = []

# The last 20 rows are held out for testing. The target split does not depend
# on the column being evaluated, so it is done once, outside the loop.
y_train = y[:-20]
y_test = y[-20:]

for i in range(X_tot.shape[1]):
    # Select column i and keep it 2-D, shape (n_samples, 1), for the estimator.
    X = X_tot[:, np.newaxis, i]
    X_train = X[:-20]
    X_test = X[-20:]

    # Ordinary least squares fit on the training rows only.
    regr = linear_model.LinearRegression()
    regr.fit(X_train, y_train)

    training_accuracy.append(regr.score(X_train, y_train))
    score = regr.score(X_test, y_test)
    test_accuracy.append(score)

    # Keep only strong fits. A score of exactly 1.0 is skipped -- presumably
    # to exclude the target column itself, which X_tot (built from every
    # column of df) still contains.
    if score > 0.7 and score != 1.0:
        print('['+str(i)+']'+'Score: ', score)
        candidates.append((score, i))

# Fail with an explicit message instead of max()'s "empty sequence" error.
if not candidates:
    raise ValueError(
        "No column reached a test score above 0.7 (perfect 1.0 fits excluded); "
        "cannot select a best column."
    )

# Tuples compare by score first, then by column index, so ties on the score
# resolve to the highest column index.
best_score, best_target = max(candidates)
print("Max accuracy obtained:", best_score, ", with column #:", best_target)

In [None]:
# Refit the regression on the selected column and report its metrics.

# Selected column only, kept 2-D with shape (n_samples, 1).
X = X_tot[:, np.newaxis, best_target]

# Hold out the last 20 rows for testing; train on everything before them.
X_train, X_test = X[:-20], X[-20:]
y_train, y_test = y[:-20], y[-20:]

# Fit on the training rows, then predict the held-out rows.
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Fitted coefficient(s) of the model.
print('Coefficients: \n', regr.coef_)

# Mean squared error between held-out targets and predictions.
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % mse)

# R^2 score from r2_score; 1 is a perfect prediction.
r2 = r2_score(y_test, y_pred)
print('Variance score: %.2f' % r2)

In [None]:
# Plot the held-out samples together with the fitted regression line.
plt.xlabel("Feature")
plt.ylabel("Target")

# Black dots: held-out targets against the selected feature.
plt.scatter(X_test, y_test, color='black')
# Blue line: model predictions for the same held-out feature values.
plt.plot(X_test, y_pred, linewidth=3, color='blue')

# Empty tick sequences remove the ticks and their labels from both axes.
plt.xticks(())
plt.yticks(())

plt.show()

## 3. Create plotting function for Cross-validation & Training Score

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot mean training and cross-validation scores against training-set size.

    The scores come from scikit-learn's ``learning_curve``. For each curve the
    mean over the cross-validation splits is drawn as a line with markers,
    surrounded by a shaded band of one standard deviation on either side.

    Parameters
    ----------
    estimator : object
        Estimator forwarded to ``learning_curve``.
    title : str
        Title of the figure.
    X, y : array-like
        Data and targets forwarded to ``learning_curve``.
    ylim : tuple, optional
        ``(ymin, ymax)`` limits of the y axis; left automatic when ``None``.
    cv : optional
        Cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : int, optional
        Number of parallel jobs forwarded to ``learning_curve``.
    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, with the new figure as current figure.
    """
    # Imported locally so the function does not depend on the surrounding
    # namespace already providing `learning_curve`.
    from sklearn.model_selection import learning_curve

    # Compute the scores before touching matplotlib, so that a failure here
    # does not leave an empty figure behind.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # (mean, std, colour, legend label) per curve; the scores have one row per
    # training size and one column per split, hence the reduction over axis 1.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), color, label)
        for scores, color, label in (
            (train_scores, "r", "Training score"),
            (test_scores, "g", "Cross-validation score"),
        )
    ]

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.grid()

    # Shaded +/- one standard deviation bands first, mean lines afterwards.
    for mean, std, color, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std,
                         alpha=0.1, color=color)
    for mean, _, color, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")
    return plt

## 4. Obtain Cross-validation & Training Score 

In [None]:
# Learning curve (training and cross-validation score) of a linear regression
# on X and y as currently in scope, labelled with the best column found.
estimator = linear_model.LinearRegression()

# 100 random splits give smoother mean train/test curves; each split sets
# aside 20% of the data as the validation set. The seed is fixed.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)

title = "Learning Curves (Linear Regression), column=" + str(best_target)

plot_learning_curve(
    estimator=estimator,
    title=title,
    X=X,
    y=y,
    ylim=(0.7, 1.01),
    cv=cv,
    n_jobs=4,
)

For more information about Linear Regression, visit the following webpage: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html