# Basic Regression

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

### Data


In [None]:
# Scatter plot of AAWDTF against Population through the pandas plotting accessor
df.plot.scatter(x='Population', y='AAWDTF', s=100, marker='.', color="cornflowerblue")
plt.show()


In [None]:
# Overview of the dataset: draw every column as a line, with the legend suppressed
df.plot.line(legend=False)
plt.tight_layout()
plt.show()


### Preprocessing

In [None]:
# Count the missing entries in each column and display the totals
missing_values = df.isnull().sum()
print(missing_values)


In [None]:
# Possibly use interpolation for missing values

# Create a function we'll use to interpolate and plot
def interpolate_and_plot(prices, interpolation):
    """Interpolate the missing values of ``prices`` and plot the result.

    The complete interpolated data is drawn in translucent black, and the
    values that were filled in by the interpolation are drawn on top in red.

    Parameters
    ----------
    prices
        pandas object to interpolate; it must support ``isna``,
        ``interpolate`` and ``plot`` (presumably a DataFrame or Series --
        confirm against the caller).
    interpolation
        Interpolation method, forwarded as ``method`` to
        ``prices.interpolate``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Boolean mask of the entries that are missing in the input.
    # The input argument is used here rather than a global frame, so the
    # function works on whatever data the caller passes in.
    missing_values = prices.isna()

    # Fill the gaps with the requested interpolation method
    prices_interp = prices.interpolate(method=interpolation)

    # Draw the full interpolated data in translucent black
    fig, ax = plt.subplots(figsize=(10, 5))
    prices_interp.plot(color='k', alpha=.6, ax=ax, legend=False)

    # Overlay only the originally-missing (now interpolated) values in red
    prices_interp[missing_values].plot(ax=ax, color='r', lw=3, legend=False)
    plt.show()

In [None]:
# NOTE(review): placeholder selection -- both column names are empty strings and
# need to be replaced with the real feature columns of `df` before this cell is
# usable. X is assigned again in the following cell.
X = df[['', '']]


In [None]:
# Feature (Population) and target (AAWDTF) as column arrays of shape (n_samples, 1).
# Uses the same frame and column names as the scatter plot of the data.
X = df['Population'].values.reshape(-1, 1)
y = df['AAWDTF'].values.reshape(-1, 1)


In [None]:
# Hold out 20% of the samples as a test set; the shuffle is seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)


In [None]:
# Create an (unfitted) LinearRegression estimator with its default settings
reg = linear_model.LinearRegression()

In [None]:
# Fit the linear model to the training data, then report the shape of the
# training features and the fitted parameters

reg.fit(X_train, y_train)
print(X_train.shape)
# Read the coefficients from `reg`, the estimator that was just fitted
print(reg.coef_)
print(reg.intercept_)


In [None]:
# Predict the target for the held-out test features with the fitted model
y_predict = reg.predict(X_test)


In [None]:
# Mean squared error between the test targets and the predictions
mse = mean_squared_error(y_test, y_predict)
print('Mean squared error: %.2f' % mse)


In [None]:
# Coefficient of determination (R^2) of the predictions on the test set
r2 = r2_score(y_test, y_predict)
print('Coefficient of determination: %.2f' % r2)


### Plotting residuals for the linear and polynomial fits

In [None]:
# Side-by-side residual plots (observed minus predicted, against the observed
# test values) for the linear fit and a degree-5 polynomial fit
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: residuals of the linear model's test-set predictions
ax[0].scatter(y_test, y_test - y_predict, marker='o')
ax[0].set_title('Residual plot for linear fit\n', fontsize=15)

# Fit the polynomial pipeline on the training split only, so the residuals
# below are measured on samples the model has not been trained on
degree = 5
model = make_pipeline(PolynomialFeatures(degree), linear_model.LinearRegression())
model.fit(X_train, y_train)
y_poly = model.predict(X_test)

# Right panel: residuals of the polynomial model's test-set predictions
ax[1].scatter(y_test, y_test - y_poly, c="purple")
ax[1].set_title('Residual plot for polynomial fit\n', fontsize=15)

plt.tight_layout()
plt.show()


## Ridge Regression

In [None]:
# Unshuffled split: 80% of the samples for training, the remainder for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, shuffle=False, random_state=1
)

# Train a default Ridge regressor, predict the test set and report its R^2 score
model = Ridge().fit(X_train, y_train)
predictions = model.predict(X_test)
score = r2_score(y_test, predictions)
print(score)


In [None]:
# Overlay the Ridge predictions (red) on the "true" test values (black)
fig, ax = plt.subplots(figsize=(15, 5))
for series, colour, width in ((y_test, 'k', 3), (predictions, 'r', 2)):
    ax.plot(series, color=colour, lw=width)
plt.show()
