In [1]:
# lets grab some imports to do our work:
import numpy as np
import pandas as pd
import statsmodels.api as sm
# data source
from pydataset import data
# visualizations:
import matplotlib.pyplot as plt
import seaborn as sns

# modeling:
# data splitting:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler

# statistical testing
from scipy import stats

In [2]:
swiss = data('swiss')

In [3]:
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


2. Be sure your data is prepared (no missing values, numeric datatypes) and split into samples.

In [12]:
swiss.info()

<class 'pandas.core.frame.DataFrame'>
Index: 47 entries, Courtelary to Rive Gauche
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Fertility         47 non-null     float64
 1   Agriculture       47 non-null     float64
 2   Examination       47 non-null     int64  
 3   Education         47 non-null     int64  
 4   Catholic          47 non-null     float64
 5   Infant.Mortality  47 non-null     float64
dtypes: float64(4), int64(2)
memory usage: 3.6+ KB


In [14]:
# Checking for missing values
swiss.isnull().sum()

Fertility           0
Agriculture         0
Examination         0
Education           0
Catholic            0
Infant.Mortality    0
dtype: int64

In [15]:
# Define your X (features) and y (target variable)
X = swiss.drop('Fertility', axis=1)
y = swiss['Fertility']

In [16]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
# Add a constant to the features
X_train_const = sm.add_constant(X_train)
X_test_const = sm.add_constant(X_test)

In [18]:
# Fit the regression model on the training data
model = sm.OLS(y_train, X_train_const).fit()

In [21]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Fertility   R-squared:                       0.780
Model:                            OLS   Adj. R-squared:                  0.745
Method:                 Least Squares   F-statistic:                     22.00
Date:                Fri, 08 Sep 2023   Prob (F-statistic):           2.31e-09
Time:                        09:26:24   Log-Likelihood:                -119.98
No. Observations:                  37   AIC:                             252.0
Df Residuals:                      31   BIC:                             261.6
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               64.4971     11.070  

In [22]:
# Make predictions on the test set
y_pred = model.predict(X_test_const)

In [23]:
# Evaluate the model (you can use metrics like RMSE, MAE)
from sklearn.metrics import mean_squared_error, mean_absolute_error
print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")

RMSE: 11.00728968014887
MAE: 9.38058585930822


3. Work through all of the steps outlined in the lesson, from setting the baseline to selected a model and evaluating the final model on your test data.



- Setting the Baseline

In [24]:
# Compute the mean of y_train as the baseline prediction
baseline_pred = y_train.mean()


In [25]:
# Compute the baseline RMSE and MAE
baseline_rmse = mean_squared_error(y_train, [baseline_pred]*len(y_train), squared=False)
baseline_mae = mean_absolute_error(y_train, [baseline_pred]*len(y_train))


In [26]:
print(f"Baseline RMSE: {baseline_rmse}")
print(f"Baseline MAE: {baseline_mae}")

Baseline RMSE: 13.214614394721046
Baseline MAE: 10.49963476990504


- Running Initial Model and Feature Engineering


In [27]:
# Fit the regression model on the training data
model = sm.OLS(y_train, X_train_const).fit()

In [28]:
# Check the model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              Fertility   R-squared:                       0.780
Model:                            OLS   Adj. R-squared:                  0.745
Method:                 Least Squares   F-statistic:                     22.00
Date:                Fri, 08 Sep 2023   Prob (F-statistic):           2.31e-09
Time:                        09:28:32   Log-Likelihood:                -119.98
No. Observations:                  37   AIC:                             252.0
Df Residuals:                      31   BIC:                             261.6
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const               64.4971     11.070  

- Model Selection and Tuning
- Evaluating Model on Test Data

In [29]:
# Make predictions on the test set
y_pred = model.predict(X_test_const)


In [30]:
# Evaluate the model using RMSE and MAE
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)


In [31]:
print(f"Final RMSE: {rmse}")
print(f"Final MAE: {mae}")

Final RMSE: 11.00728968014887
Final MAE: 9.38058585930822


In [32]:
# Comparing to Baseline

print(f"Improvement over baseline in RMSE: {baseline_rmse - rmse}")
print(f"Improvement over baseline in MAE: {baseline_mae - mae}")


Improvement over baseline in RMSE: 2.2073247145721755
Improvement over baseline in MAE: 1.1190489105968204


Summary of Key Findings

- Baseline Model: Began by stating the RMSE and MAE of your baseline model. 
- Best Model:  OLS regression model that used all features.

