In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Reproducible toy dataset: n jittered x-values spread over [0, 10] and a
# noisy sine-plus-linear-trend target. The seed is fixed so every run (and the
# train/test split below) sees the same points.
np.random.seed(0)
n = 15
x = np.random.randn(n) / 5 + np.linspace(0, 10, n)
y = x / 6 + np.sin(x) + np.random.randn(n) / 10

# Default split proportions; random_state pins which points land in each set.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
)

# You can use this function to help you visualize the dataset by
# plotting a scatterplot of the data points
# in the training and test sets.
def part1_scatter():
    import matplotlib.pyplot as plt
    %matplotlib notebook
    plt.figure()
    plt.scatter(X_train, y_train, label='training data')
    plt.scatter(X_test, y_test, label='test data')
    plt.legend(loc=4);
    
    
# NOTE: The call below draws the scatter plot of the training and test data.
# It is currently active; be sure to **comment it out again before submitting
# this assignment to the autograder**.
part1_scatter()

<IPython.core.display.Javascript object>

### Question 1

Write a function that fits a polynomial LinearRegression model on the *training data* `X_train` for degrees 1, 3, 6, and 9. (Use PolynomialFeatures in sklearn.preprocessing to create the polynomial features and then fit a linear regression model) For each model, find 100 predicted values over the interval x = 0 to 10 (e.g. `np.linspace(0,10,100)`) and store this in a numpy array. The first row of this array should correspond to the output from the model trained on degree 1, the second row degree 3, the third row degree 6, and the fourth row degree 9.

<br>
*This function should return a numpy array with shape* `(4, 100)`

In [68]:
def answer_one():
    """Predict on a 100-point grid with polynomial fits of degree 1, 3, 6, 9.

    Each model is a LinearRegression fitted on polynomial features of the
    *entire* training set (module-level ``X_train``/``y_train``). The
    training data is not split a second time, so no training points are
    discarded and the module-level test set is left untouched.

    Returns:
        numpy.ndarray of shape (4, 100). Row ``k`` holds the predictions of
        the model for the k-th degree in (1, 3, 6, 9), evaluated at
        ``np.linspace(0, 10, 100)``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    # scikit-learn expects 2-D inputs of shape (n_samples, n_features).
    train_inputs = X_train.reshape(-1, 1)
    grid = np.linspace(0, 10, 100).reshape(-1, 1)

    predictions = []
    for degree in (1, 3, 6, 9):
        poly = PolynomialFeatures(degree=degree)
        linreg = LinearRegression().fit(poly.fit_transform(train_inputs), y_train)
        # Reuse the transformer fitted on the training data for the grid.
        predictions.append(linreg.predict(poly.transform(grid)))

    return np.array(predictions)
#answer_one().shape

(4, 100)

In [69]:
# feel free to use the function plot_one() to replicate the figure 
# from the prompt once you have completed question one
def plot_one(degree_predictions):
    import matplotlib.pyplot as plt
    %matplotlib notebook
    plt.figure(figsize=(10,5))
    plt.plot(X_train, y_train, 'o', label='training data', markersize=10)
    plt.plot(X_test, y_test, 'o', label='test data', markersize=10)
    for i,degree in enumerate([1,3,6,9]):
        plt.plot(np.linspace(0,10,100), degree_predictions[i], alpha=0.8, lw=2, label='degree={}'.format(degree))
    plt.ylim(-1,2.5)
    plt.legend(loc=4)

# Draw the four fitted curves returned by answer_one() over the data points.
plot_one(answer_one())

<IPython.core.display.Javascript object>

### Question 2

Write a function that fits a polynomial LinearRegression model on the training data `X_train` for degrees 0 through 9. For each model compute the $R^2$ (coefficient of determination) regression score on the training data as well as the test data, and return both of these arrays in a tuple.

*This function should return one tuple of numpy arrays `(r2_train, r2_test)`. Both arrays should have shape `(10,)`*

In [95]:
def answer_two():
    """Compute train and test R^2 for polynomial fits of degree 0 through 9.

    Each model is a LinearRegression fitted on polynomial features of the
    full training set (module-level ``X_train``/``y_train``) and scored on
    both that training set and the held-out module-level test set
    (``X_test``/``y_test``). The training data is not split again, so the
    test score measures performance on data the model has not seen.

    Returns:
        tuple ``(r2_train, r2_test)`` of numpy arrays, each of shape (10,),
        where index ``d`` holds the score of the degree-``d`` model.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    # scikit-learn expects 2-D inputs of shape (n_samples, n_features).
    train_inputs = X_train.reshape(-1, 1)
    test_inputs = X_test.reshape(-1, 1)

    r2_train = []
    r2_test = []
    for degree in range(10):
        poly = PolynomialFeatures(degree=degree)
        train_poly = poly.fit_transform(train_inputs)
        linreg = LinearRegression().fit(train_poly, y_train)

        # LinearRegression.score returns the coefficient of determination R^2.
        r2_train.append(linreg.score(train_poly, y_train))
        # Reuse the transformer fitted on the training data for the test set.
        r2_test.append(linreg.score(poly.transform(test_inputs), y_test))

    return np.array(r2_train), np.array(r2_test)

# Sanity check: the training-score array is expected to have shape (10,).
answer_two()[0].shape

(10,)

### Question 3

Based on the $R^2$ scores from question 2 (degree levels 0 through 9), what degree level corresponds to a model that is underfitting? What degree level corresponds to a model that is overfitting? What choice of degree level would provide a model with good generalization performance on this dataset? 

Hint: Try plotting the $R^2$ scores from question 2 to visualize the relationship between degree level and $R^2$. Remember to comment out the import matplotlib line before submission.

*This function should return one tuple with the degree values in this order: `(Underfitting, Overfitting, Good_Generalization)`. There might be multiple correct solutions, however, you only need to return one possible solution, for example, (1,2,3).* 

In [None]:
def answer_three():
    """Pick underfitting, overfitting and well-generalizing degrees.

    The choice is derived from the per-degree R^2 scores returned by
    ``answer_two()`` (index ``d`` = polynomial degree ``d``):

    * underfitting: the degree with the lowest training R^2, i.e. the model
      that cannot even describe the data it was fitted on;
    * overfitting: the degree with the largest gap between training and test
      R^2, i.e. the model that fits the training data far better than unseen
      data;
    * good generalization: the degree with the highest test R^2.

    Returns:
        tuple ``(Underfitting, Overfitting, Good_Generalization)`` of ints.
    """
    r2_train, r2_test = answer_two()

    underfitting = int(np.argmin(r2_train))
    overfitting = int(np.argmax(r2_train - r2_test))
    good_generalization = int(np.argmax(r2_test))

    return (underfitting, overfitting, good_generalization)

In [None]:
My script for Q1:

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import PolynomialFeatures

# Scratch draft for Q1. NOTE(review): the indentation was lost when this was
# pasted -- presumably the statements after the `for` header are the loop body
# and the final `return` belongs to an enclosing function that is not shown.
inputs = np.linspace(0,10,100)

# One row of 100 predictions per degree.
predictions = np.ndarray((4,100))

for i, deg in enumerate([1,3,6,9]):

poly = PolynomialFeatures(degree = deg)

x_poly = poly.fit_transform(x.reshape(-1,1))

# NOTE(review): this splits the full x/y again with no random_state and rebinds
# X_train/X_test/y_train/y_test, so the model is not fitted on the split made
# at the top of the notebook.
X_train,X_test,y_train,y_test = train_test_split(x_poly,y)

linreg = LinearRegression().fit(X_train,y_train)

inputs_poly = poly.fit_transform(inputs.reshape(-1,1))

predictions[i] = linreg.predict(inputs_poly)

return predictions

Q2:

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics.regression import r2_score

# Scratch draft for Q2. NOTE(review): the indentation was lost when this was
# pasted -- presumably the statements after the `for` header are the loop body
# and the final `return` belongs to an enclosing function that is not shown.
# Column 0 receives the training score and column 1 the test score.
r2_scores = np.ndarray((10,2))

# NOTE(review): range(1,10,1) yields the nine degrees 1-9, so only rows 0-8 of
# the (10, 2) array are assigned; row 9 is never written and degree 0 is never
# fitted, although Q2 asks for degrees 0 through 9.
for i, deg in enumerate(range(1,10,1)):

poly = PolynomialFeatures(degree = deg)

x_poly = poly.fit_transform(x.reshape(-1,1))

# NOTE(review): re-splits the full x/y with no random_state and rebinds
# X_train/X_test/y_train/y_test on every iteration.
X_train,X_test,y_train,y_test = train_test_split(x_poly,y)

linreg = LinearRegression().fit(X_train,y_train)

r2_scores[i][0] = linreg.score(X_train,y_train)

r2_scores[i][1] = linreg.score(X_test,y_test)

return r2_scores





Q1
X_train_poly = Create_Poly_Features(X_train)
lin_reg = Train_Linear_Regressor(X_train_poly)
X_predict = Create_Linear_Space()
predict...
Store values in list
Convert List to ndarray





# Same seeded data generation and train/test split as the first cell of this
# notebook, repeated here as the starting point for the walkthrough below.
np.random.seed(0)

n = 15

x = np.linspace(0,10,n) + np.random.randn(n)/5

y = np.sin(x)+x/6 + np.random.randn(n)/10

X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0)

Then:

poly1 = Polynomial Features For Degree 1

poly1 needs to be fit_transformed using X_train.reshape(-1,1)

then we need to fit the linear regression model using (the result of last step, y_train)

Now it's time to generate the inputs to predict on (100 evenly spaced points from 0 to 10, not the held-out test set):

# 100 evenly spaced x-values on [0, 10].
var1=np.linspace(0,10,100)

# Add a second axis so the grid becomes a (100, 1) column.
var2=var1[:,np.newaxis]

# Expand the grid into polynomial features with the poly1 transformer
# described in the steps above.
var3=poly1.fit_transform(var2)

Finally we can predict: model.predict(var3)
