In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoLars
from sklearn.cluster import KMeans
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import TweedieRegressor
from math import sqrt
from scipy.stats import pearsonr, spearmanr
from scipy import stats

from env import get_connection
import prepare


# Silence every Python warning so the notebook output stays clean for the demo
# (Jupyter shows warnings as pink boxes under a cell).
# Note: this is a blanket filter, so deprecation warnings from the imported
# libraries are hidden as well.
import warnings
warnings.filterwarnings("ignore")

In [None]:

# Fit an ordinary least squares (OLS) regression of wine quality on density
# using the scaled training data, then compare its residuals with the
# baseline's residuals.

ols_model = LinearRegression()

ols_model.fit(train_scaled[['density']], train_scaled[['quality']])

# Store the model's predictions as 'yhat'.
# The target was passed to fit() as a single-column DataFrame, so predict()
# returns an (n, 1) array; ravel() flattens it to one value per row before it
# becomes a column.
# NOTE(review): assumes predictions_df has the same rows, in the same order,
# as train_scaled -- confirm where predictions_df is built.
predictions_df['yhat'] = ols_model.predict(train_scaled[['density']]).ravel()


# Baseline residual: baseline prediction minus actual quality
# (positive means the baseline over-predicts).

predictions_df['baseline_res'] = predictions_df['baseline_preds'] - predictions_df['quality']


# Scatter plot of the baseline residuals against density

plt.scatter(x = predictions_df['density'], y = predictions_df['baseline_res'])

plt.title('Baseline Residuals vs. Density')
plt.xlabel('Density')
plt.ylabel('Baseline Residual')
plt.show()


# Model residual: OLS prediction (yhat) minus actual quality

predictions_df['yhat_res'] = predictions_df['yhat'] - predictions_df['quality']


# Scatter plot of the OLS model residuals against density

plt.scatter(x = predictions_df['density'], y = predictions_df['yhat_res'])

plt.title('OLS Model Residuals vs. Density')
plt.xlabel('Density')
plt.ylabel('Predictions Residual')
plt.show()



In [None]:

# Root-mean-squared error (RMSE) of the baseline and of the OLS model,
# each measured against the actual wine quality.
# mean_squared_error takes (y_true, y_pred); the square root is taken
# explicitly in both cases so the two numbers are computed the same way.

# Baseline RMSE: actual quality vs. the baseline predictions.
# The variable name is kept unchanged in case other cells reference it.
dens_qual_rmse = sqrt(mean_squared_error(predictions_df['quality'], predictions_df['baseline_preds']))
print(f'The RMSE of the baseline predictions against wine quality is {round(dens_qual_rmse, 4)}.')

# OLS model RMSE: actual quality vs. the model's predictions (yhat).
# sqrt(...) is used instead of `squared = False`, which newer scikit-learn
# releases deprecate and then remove.

OLS_rmse = sqrt(mean_squared_error(predictions_df['quality'], predictions_df['yhat']))

print(f'The RMSE for the OLS Linear Regression model was {round(OLS_rmse, 4)}.')


In [None]:
# Elbow analysis: fit KMeans on the 'sugar_dens' feature for 1 through 6
# clusters and record the inertia of each fitted model.

seed = 23
cluster_counts = list(range(1, 7))
inertia = []

for k in cluster_counts:

    # fit a k-cluster model on the single feature, with a fixed seed
    kmeans = KMeans(n_clusters = k, random_state = seed).fit(train_scaled[['sugar_dens']])

    # keep this model's inertia
    inertia += [kmeans.inertia_]

# pair each cluster count with its inertia in a frame for plotting

inertia_sd_df = pd.DataFrame({'n_clusters' : cluster_counts,
                               'inertia' : inertia})

# line plot of inertia vs. cluster count, used to look for the elbow

sns.relplot(data = inertia_sd_df, x = 'n_clusters', y = 'inertia', kind = 'line')
plt.grid()
plt.show()