# Regression drill

This is a simple regression drill. Try to write the minimal code to finish the pipeline.

1. Load and store the data from the existing `diabetes` dataset from `scikit-learn`.

In [112]:

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets

# Fetch the bundled diabetes dataset as a pandas-backed Bunch.
diabetes = sklearn.datasets.load_diabetes(as_frame=True)

# `frame` is a single DataFrame holding the feature columns plus `target`.
df = diabetes.frame

# Last expression of the cell: rendered as the cell's output.
df


Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930,220.0


2. Explore your data by viewing and calculating some basic statistical details like percentiles and mean. Use the pandas library!

In [113]:



# Per-column summary computed with pandas: count, mean, std, min, the
# 25% / 50% (median) / 75% percentiles and max. This covers the mean, the
# median and the five evenly spaced quantiles in one table, column by column,
# instead of pooling the feature columns and `target` into a single number.
df.describe()



Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17,152.133484
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,77.093005
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672,25.0
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903,87.0
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698,140.5
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705,211.5
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118,346.0


3. Create some scatter plots or any other kind of plots that help you understand the data.

In [None]:
# Take the feature names from the dataset bundle rather than from column
# positions, so the list stays correct even if `df` gains extra columns.
features = list(diabetes.feature_names)

# Two panels per row; the number of rows follows from the feature count.
n_cols = 2
n_rows = -(-len(features) // n_cols)  # ceiling division
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(15, 4 * n_rows), squeeze=False
)

# Flatten the axes array for easy iteration
axes = axes.flatten()

# One scatter plot per feature: feature value against the target
for ax, feature in zip(axes, features):
    ax.scatter(df[feature], df["target"], color='blue', marker='o')
    ax.set_xlabel(feature)
    ax.set_ylabel('Disease Progression')
    ax.set_title(f'{feature} vs Disease Progression')

# Hide any panels left over when the feature count does not fill the grid
for ax in axes[len(features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
# Two panels per row; the number of rows follows from the feature count so
# the grid always has room for every entry in `features`.
n_cols = 2
n_rows = -(-len(features) // n_cols)  # ceiling division
fig, axes = plt.subplots(
    nrows=n_rows, ncols=n_cols, figsize=(15, 4 * n_rows), squeeze=False
)

# Flatten the axes array for easy iteration
axes = axes.flatten()

# One box plot per feature, showing the spread of that feature's values
for ax, feature in zip(axes, features):
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_xlabel(feature)
    ax.set_title(f'Distribution of {feature}')

# Hide any panels left over when the feature count does not fill the grid
for ax in axes[len(features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Pair plot of the ten leading columns together with the target column;
# the diagonal panels show kernel density estimates.
sns.pairplot(df[[*df.columns[:10], "target"]], diag_kind="kde")
plt.show()


In [None]:
# For demonstration, bin `age` into 20 equal-width intervals. The bins are kept
# in a local Series instead of being written back into `df`: adding a column to
# the shared frame would change what positional selections such as
# `df.iloc[:, :-1]` or `df.columns[:-1]` return in other cells.
age_group = pd.cut(df['age'], bins=20)

# Average target per age bin; observed=False keeps empty bins on the axis.
mean_target_by_age = df.groupby(age_group, observed=False)['target'].mean()

# Bar plot of the average disease progression for each age group
fig, ax = plt.subplots()
mean_target_by_age.plot(kind='bar', color='blue', ax=ax)
ax.set_xlabel('Age Group')
ax.set_ylabel('Average Disease Progression')
ax.set_title('Average Disease Progression by Age Group')
plt.show()

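4. Create a model and evaluate the quality of its predictions. For a regression model this is measured with a metric such as the coefficient of determination $R^2$ (what `score` returns) or the mean squared error, not with classification accuracy.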

In [122]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Select the predictors by name. A positional slice such as `df.iloc[:, :-1]`
# silently pulls `target` into X as soon as any column is appended to `df`,
# which would leak the answer into the model.
X = df[diabetes.feature_names].to_numpy()
y = df["target"].to_numpy()

# Sanity check: exactly one target value per feature row.
assert X.shape[0] == y.shape[0]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

regressor = LinearRegression().fit(X_train, y_train)

# R^2 on the training split (the data the model was fitted on).
regressor.score(X_train, y_train)


0.4526027629719197

In [123]:
# Predicted target values for the held-out rows.
regressor.predict(X=X_test)


array([139.5475584 , 179.51720835, 134.03875572, 291.41702925,
       123.78965872,  92.1723465 , 258.23238899, 181.33732057,
        90.22411311, 108.63375858,  94.13865744, 168.43486358,
        53.5047888 , 206.63081659, 100.12925869, 130.66657085,
       219.53071499, 250.7803234 , 196.3688346 , 218.57511815,
       207.35050182,  88.48340941,  70.43285917, 188.95914235,
       154.8868162 , 159.36170122, 188.31263363, 180.39094033,
        47.99046561, 108.97453871, 174.77897633,  86.36406656,
       132.95761215, 184.53819483, 173.83220911, 190.35858492,
       124.4156176 , 119.65110656, 147.95168682,  59.05405241,
        71.62331856, 107.68284704, 165.45365458, 155.00975931,
       171.04799096,  61.45761356,  71.66672581, 114.96732206,
        51.57975523, 167.57599528, 152.52291955,  62.95568515,
       103.49741722, 109.20751489, 175.64118426, 154.60296242,
        94.41704366, 210.74209145, 120.2566205 ,  77.61585399,
       187.93203995, 206.49337474, 140.63167076, 105.59

In [124]:
# R^2 of the fitted model on the held-out test split.
regressor.score(X=X_test, y=y_test)

0.4526027629719197

5. Play with other datasets such as `boston` from scikit-learn (note: `load_boston` was removed in scikit-learn 1.2, so on recent versions use e.g. `fetch_california_housing` instead), or find datasets online such as [this one](https://www.kaggle.com/dmvreddy91/usahousing) or [this one](https://www.kaggle.com/hellbuoy/car-price-prediction). Be careful: not all datasets are appropriate for regression, so make sure to specify that in your search.