In [18]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [None]:
# Load the 'diamonds' dataset through seaborn and print its column/dtype summary.
diamonds = sns.load_dataset('diamonds')
diamonds.info()

In [None]:
# Display the loaded frame for a first visual inspection.
diamonds

In [None]:
# Count the distinct values in each column.
diamonds.nunique()

In [None]:
# Frequency of each value in the `cut` column.
diamonds.cut.value_counts()

In [None]:
def uncategorize(col):
    """Convert a categorical column back to the dtype of its categories.

    Parameters
    ----------
    col : pandas.Series
        Column to convert. Columns whose dtype is not 'category' are
        returned unchanged (the same object).

    Returns
    -------
    pandas.Series
        The column cast to the dtype of ``col.cat.categories``, or to the
        title-cased dtype name (e.g. ``Int64``) when the plain cast fails.
    """
    if col.dtype.name != 'category':
        return col

    categories_dtype = col.cat.categories.dtype
    try:
        return col.astype(categories_dtype)
    except (TypeError, ValueError):
        # A NumPy integer column cannot hold missing values, so the direct
        # cast fails. The nullable extension dtype has the same name in
        # title case (int64 -> Int64) and accepts pd.NA (pandas >= 1.0).
        return col.astype(categories_dtype.name.title())
    
# DataFrame.apply returns a new frame and leaves the original untouched, so the
# result has to be assigned back for the dtype conversion to take effect.
diamonds = diamonds.apply(uncategorize, axis=0)
diamonds.info()

In [None]:
# One-hot encode the three descriptive columns and summarise the result.
categories = ['cut', 'color', 'clarity']
dummies = pd.get_dummies(diamonds.loc[:, categories])
dummies.info()

In [None]:
# Swap the original descriptive columns for their one-hot encoded counterparts.
diamondDummies = diamonds.drop(columns=categories).join(dummies)
diamondDummies.info()

In [None]:
# Display the frame that now carries the one-hot encoded columns.
diamondDummies

In [None]:
# Re-print the column summary of the source frame for comparison.
diamonds.info()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous measurement columns; `price` is left unscaled.
numCols = ['carat', 'depth', 'table', 'x', 'y', 'z']
scaler = StandardScaler()
diamondDummies[numCols] = scaler.fit_transform(diamondDummies[numCols])
diamondDummies.head()

In [None]:
# Correlation of every column with `price`, strongest positive first.
(
    diamondDummies
    .corr()[['price']]
    .sort_values(by='price', ascending=False)
)

In [None]:
# Question 1

# Single-column heatmap of each feature's correlation with `price`,
# drawn on the axes created here.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    data=diamondDummies.corr()[['price']].sort_values(by='price', ascending=False),
    annot=True,
    cmap='viridis',
    fmt='.2f',
    ax=ax,
)

In [None]:
# Hold out 30% of the rows, fit price ~ carat, and report R^2 on the held-out rows.
x_train, x_test, y_train, y_test = train_test_split(
    diamonds[['carat']],
    diamonds[['price']],
    test_size=0.30,
    random_state=20,
)
linearModel = LinearRegression().fit(x_train, y_train)
linearModel.score(x_test, y_test)

In [None]:
# `x_testy_test` was an undefined name (NameError). Presumably the intent was
# to combine the held-out features and targets into one frame; they share the
# same index, so a plain join lines them up row by row.
test = x_test.join(y_test)

In [None]:
# Predict prices for the held-out carat values and show the raw prediction array.
y_predicted = linearModel.predict(x_test)
y_predicted

In [None]:
# Wrap the predictions in a one-column frame so they can be joined with the test data.
predicted = pd.DataFrame(y_predicted, columns=['predicted_price'])
predicted

In [None]:
# Put predictions next to the held-out carat/price values. The test frames keep
# their shuffled original index, so it is reset to match `predicted` row by row.
combined = predicted.join(
    [part.reset_index(drop=True) for part in (x_test, y_test)]
)
combined

In [None]:
# Reshape to long form: one row per (carat, price_type) so that actual and
# predicted prices can be drawn as separate hues.
melted = combined.melt(
    id_vars=['carat'],
    value_vars=['price', 'predicted_price'],
    var_name='price_type',
    value_name='price_value',
)
melted

In [None]:
# Line plot of actual vs. predicted price against carat; errorbar=None turns off the error band.
sns.relplot(data=melted, kind='line', x='carat', y='price_value', hue='price_type', errorbar=None)

In [None]:
# `ci` has been deprecated since seaborn 0.12 in favour of `errorbar`;
# errorbar=None draws the same lines without an interval band.
sns.relplot(data=melted, kind='line', x='carat', y='price_value', hue='price_type', errorbar=None)

In [None]:
# Load the 'mpg' dataset through seaborn and print its column/dtype summary.
mpg = sns.load_dataset('mpg')
mpg.info()

In [None]:
# The frame contains string columns; since pandas 2.0 `corr()` raises on them
# unless it is told to use the numeric columns only.
mpg.corr(numeric_only=True)

In [None]:
# Correlation of each numeric column with `mpg`, strongest positive first.
# numeric_only=True is needed because the frame has string columns, which make
# `corr()` raise on pandas >= 2.0.
mpg.corr(numeric_only=True)[['mpg']].sort_values(by='mpg', ascending=False)

In [None]:
# Hold out 20% of the rows, fit mpg ~ weight, and report R^2 on the held-out rows.
x_train, x_test, y_train, y_test = train_test_split(
    mpg[['weight']],
    mpg[['mpg']],
    test_size=0.20,
    random_state=20,
)
model = LinearRegression().fit(x_train, y_train)
model.score(x_test, y_test)

In [None]:
# R^2 on the training rows, for comparison with the held-out score.
model.score(x_train, y_train)

In [36]:
# Predict mpg for the held-out weights.
y_predicted = model.predict(x_test)

In [None]:
# Wrap the predictions in a one-column frame so they can be joined with the test data.
predicted = pd.DataFrame(y_predicted, columns=['predicted_mpg'])
predicted

In [None]:
# Put predictions next to the held-out weight/mpg values. The test frames keep
# their shuffled original index, so it is reset to match `predicted` row by row.
combined = predicted.join(
    [part.reset_index(drop=True) for part in (x_test, y_test)]
)
combined

In [None]:
# Residual = actual minus predicted; positive values mean the model under-predicted.
combined['residuals'] = combined['mpg'] - combined['predicted_mpg']
combined.head()

In [None]:
# Kernel density estimate of the residuals.
sns.displot(data=combined, x='residuals', kind='kde')


In [None]:
# Same residual density, with dashed vertical reference lines at 0 and +/-1.
sns.displot(data=combined, x='residuals', kind='kde')
for position, colour in ((0, 'green'), (-1, 'red'), (1, 'blue')):
    plt.axvline(x=position, color=colour, linestyle='--')
plt.show()

In [None]:
# Reshape to long form: one row per (weight, mpg_type) so that actual and
# predicted mpg can be drawn as separate hues.
melted = combined.melt(
    id_vars=['weight'],
    value_vars=['mpg', 'predicted_mpg'],
    var_name='mpg_type',
    value_name='mpg_value',
)
melted

In [None]:
# Plot actual and predicted mpg against weight, coloured by mpg_type.
sns.relplot(melted, x='weight', y='mpg_value', hue='mpg_type')