### Question 2: Are women more likely to complete secondary education in some countries than others? In the coming years, what percentage of women overall and by country, do we expect to enroll in secondary education? What factors indicate whether or not a women completes secondary education?

### Import packages and data

In [84]:
#import packages

# general
import numpy as np
import pandas as pd
import time

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# visualization
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [85]:
#load in the dataset
df = pd.read_csv('transformed_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Year,A woman can be head of household in the same way as a man (1=yes; 0=no),A woman can choose where to live in the same way as a man (1=yes; 0=no),A woman can get a job in the same way as a man (1=yes; 0=no),A woman can obtain a judgment of divorce in the same way as a man (1=yes; 0=no),A woman can open a bank account in the same way as a man (1=yes; 0=no),A woman can register a business in the same way as a man (1=yes; 0=no),A woman can sign a contract in the same way as a man (1=yes; 0=no),A woman can travel outside her home in the same way as a man (1=yes; 0=no),...,Country Name_Ukraine,Country Name_United Arab Emirates,Country Name_United Kingdom,Country Name_United States,Country Name_Uruguay,Country Name_Uzbekistan,"Country Name_Venezuela, RB",Country Name_Viet Nam,Country Name_West Bank and Gaza,Country Name_Zimbabwe
0,0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,-0.7,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,3,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,4,0.6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0


### Prepare the data for the models

In [86]:
#create X and y dataframes
X = df.drop(columns='School enrollment, secondary, female (% gross)')
y = df[['School enrollment, secondary, female (% gross)']]

In [87]:
#split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)
y_test

Unnamed: 0,"School enrollment, secondary, female (% gross)"
396,0.004843
227,0.115674
673,0.000000
702,0.000000
643,-1.464595
...,...
35,0.000000
870,-1.331799
503,0.008743
472,-0.075033


#### Feature engineering

In [43]:
#initialize sfs, use linear regression as the baseline model for this question
sfs = SequentialFeatureSelector(estimator = LinearRegression(),
                                n_features_to_select = "auto",
                                direction = 'forward',
                                scoring = 'neg_mean_squared_error',
                                cv = 10)

#fit the data to sfs
sfs = sfs.fit(X_train, y_train)

#retrieve the and print the names of the selected features
feature_names = np.array(df.columns.difference(['School enrollment, secondary, female (% gross)']))
selected_feature_names = feature_names[sfs.get_support()].tolist()
print("Selected features:", selected_feature_names)

# transform X_train and X_test to include only the selected features
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

# display the shape of transformed X_train_selected and X_test_selected
print("Transformed X_train shape:", X_train_selected.shape)
print("Transformed X_test shape:", X_test_selected.shape)

Selected features: ['A woman can be head of household in the same way as a man (1=yes; 0=no)', 'A woman has the same rights to remarry as a man (1=yes; 0=no)', 'Age dependency ratio (% of working-age population)', 'Country Name_Algeria', 'Country Name_Angola', 'Country Name_Australia', 'Country Name_Belarus', 'Country Name_Belgium', 'Country Name_Belize', 'Country Name_Benin', 'Country Name_Bulgaria', 'Country Name_Burkina Faso', 'Country Name_Cabo Verde', 'Country Name_Cambodia', 'Country Name_Chad', 'Country Name_Colombia', 'Country Name_Denmark', 'Country Name_Ecuador', 'Country Name_El Salvador', 'Country Name_Eswatini', 'Country Name_Ethiopia', 'Country Name_France', 'Country Name_Gambia, The', 'Country Name_Georgia', 'Country Name_Grenada', 'Country Name_Guatemala', 'Country Name_Hong Kong SAR, China', 'Country Name_Hungary', 'Country Name_India', 'Country Name_Ireland', 'Country Name_Jordan', 'Country Name_Kenya', 'Country Name_Kiribati', 'Country Name_Lao PDR', 'Country Name_La

### Initial Baseline Model for Linear Regression:

In [88]:
#create a linear regression model using the selected features

#initialize the model
lr_model = LinearRegression()

#fit the model
lr_model.fit(X_train_selected, y_train)

#making predictions on the training and test sets
y_pred_train = lr_model.predict(X_train_selected)
y_pred_test = lr_model.predict(X_test_selected)

#evaluate the model using MSE and R squared
mse_train = mean_squared_error(y_train,y_pred_train)
mse_test = mean_squared_error(y_test,y_pred_test)
r_sq_train = lr_model.score(X_train_selected, y_train)
r_sq_test = lr_model.score(X_test_selected, y_test)

#print the MSE and R_squared
print('MSE (Train): ', round(mse_train, 3))
print('MSE (Test): ', round(mse_test, 3))
print('R-Squared (Train): ', round(r_sq_train, 3))
print('R-Squared (Test): ', round(r_sq_test, 3))

MSE (Train):  0.217
MSE (Test):  0.435
R-Squared (Train):  0.893
R-Squared (Test):  0.76


> The initial model does fairly okay with an r-squared of 0.876 for the training data and 0.752 for the test data. Because the test r-squared is quite a bit lower than the train, there could be some concern of overfitting. Additionally, the MSE of the test data is much worse than it is for the training data. For a simple linear regression, this model does not perform too bad, but it will be interesting to see how more advanced models perform in comparison.

### Function for the Models 

In [89]:
def question2(model_type, X_train, y_train, X_test, y_test):

    # Step 1: Initialize a model object
    if model_type == 'Linear Regression':
      model = LinearRegression()
    elif model_type == 'Ridge':
      model = Ridge()
    elif model_type == 'Lasso':
      model = Lasso()
    elif model_type == 'Random Forest':
      model = RandomForestRegressor()
    elif model_type == 'Gradient Boosting':
      model = GradientBoostingRegressor()
    elif model_type == 'ElasticNet':
      model = ElasticNet()
    else:
      print('Please specify the model type')

    # Step 2: Train the model
    model.fit(X_train, y_train)

    # Step 3: Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Step 4: Evaluate the model performance

    ## MSE
    #evaluate the model using MSE and R squared
    mse_train = mean_squared_error(y_train,y_pred_train)
    mse_test = mean_squared_error(y_test,y_pred_test)
    r_sq_train = model.score(X_train, y_train)
    r_sq_test = model.score(X_test, y_test)

    #print the MSE and R_squared
    print('MSE (Train): ', round(mse_train, 3))
    print('MSE (Test): ', round(mse_test, 3))
    print('R-Squared (Train): ', round(r_sq_train, 3))
    print('R-Squared (Test): ', round(r_sq_test, 3))

    return mse_train, mse_test, r_sq_train, r_sq_test

### Model #1: Linear Regression

In [90]:
#running the model with the selected features
print('Linear Regression:')
mse_train_lr, mse_test_lr, r_sq_train_lr, r_sq_test_lr = question2('Linear Regression', X_train_selected, y_train, X_test_selected, y_test)

Linear Regression:
MSE (Train):  0.217
MSE (Test):  0.435
R-Squared (Train):  0.893
R-Squared (Test):  0.76


### Model #2: Regularized Regression Models

In [91]:
#running the ridge model with the selected features
print('Ridge Model:')
mse_train_ridge, mse_test_ridge, r_sq_train_ridge, r_sq_test_ridge = question2('Ridge', X_train_selected, y_train, X_test_selected, y_test)
print('\n')

#running the lasso model with the selected features
print('Lasso Model:')
mse_train_lasso, mse_test_lasso, r_sq_train_lasso, r_sq_test_lasso = question2('Lasso', X_train_selected, y_train, X_test_selected, y_test)


Ridge Model:
MSE (Train):  0.236
MSE (Test):  0.454
R-Squared (Train):  0.884
R-Squared (Test):  0.749


Lasso Model:
MSE (Train):  1.789
MSE (Test):  1.679
R-Squared (Train):  0.122
R-Squared (Test):  0.072


### Model #3: Ensemble Methods

In [92]:
#reshape the target variable
y_train_ensemble = y_train.values.ravel()
y_test_ensemble = y_test.values.ravel()

#running the random forest model with the selected features
print('Random Forest Regressor:')
mse_train_rf, mse_test_rf, r_sq_train_rf, r_sq_test_rf = question2('Random Forest', X_train_selected, y_train_ensemble, X_test_selected, y_test_ensemble)
print('\n')

#running the gradient boosting model with the selected features
print('Gradient Boosting Regressor:')
mse_train_gb, mse_test_gb, r_sq_train_gb, r_sq_test_gb = question2('Gradient Boosting', X_train_selected, y_train_ensemble, X_test_selected, y_test_ensemble)


Random Forest Regressor:
MSE (Train):  0.029
MSE (Test):  0.306
R-Squared (Train):  0.986
R-Squared (Test):  0.831


Gradient Boosting Regressor:
MSE (Train):  0.13
MSE (Test):  0.377
R-Squared (Train):  0.936
R-Squared (Test):  0.792


### Model #4: ElasticNet

In [93]:
#running the elasticnet model with the selected features
print('ElasticNet Model:')
mse_train_en, mse_test_en, r_sq_train_en, r_sq_test_en = question2('ElasticNet', X_train_selected, y_train, X_test_selected, y_test)

ElasticNet Model:
MSE (Train):  1.332
MSE (Test):  1.348
R-Squared (Train):  0.346
R-Squared (Test):  0.255


### Create a DF from the models

In [94]:
# Create a dictionary to hold evauluation metrics from the models
data = {

    'Model': [
        'Linear Regression',
        'Ridge',
        'Lasso',
        'Random Forest',
        'Gradient Boosting',
        'ElasticNet'
        ],


    'R Squared Train': [
        r_sq_train_lr,
        r_sq_train_ridge,
        r_sq_train_lasso,
        r_sq_train_rf,
        r_sq_train_gb,
        r_sq_train_en
        ],

    'R Squared Test': [
        r_sq_test_lr,
        r_sq_test_ridge,
        r_sq_test_lasso,
        r_sq_test_rf,
        r_sq_test_gb,
        r_sq_test_en
        ],

    'MSE Train': [
        mse_train_lr,
        mse_train_ridge,
        mse_train_lasso,
        mse_train_rf,
        mse_train_gb,
        mse_train_en
        ],

    'MSE Test': [
        mse_test_lr,
        mse_test_ridge,
        mse_test_lasso,
        mse_test_rf,
        mse_test_gb,
        mse_test_en
    ]
    }

# Create and display a dataframe
results_df = pd.DataFrame(data)
results_df

Unnamed: 0,Model,R Squared Train,R Squared Test,MSE Train,MSE Test
0,Linear Regression,0.893322,0.759778,0.217366,0.434821
1,Ridge,0.884281,0.749199,0.235788,0.453969
2,Lasso,0.122136,0.072213,1.788726,1.679366
3,Random Forest,0.98573,0.830846,0.029077,0.306181
4,Gradient Boosting,0.936215,0.791773,0.129968,0.376907
5,ElasticNet,0.346111,0.255076,1.332356,1.348369


> The linear regression model performs with a r-squared train of 0.893 and a r-squared test of 0.760. The ridge model also performs pretty well, and the random forest performs pretty well on training but not on test data. We will move forward with hypertuning the linear regression, the ridge, the random forest, and the elastic.

## Hyperparameter tuning

### Model #1: Linear Regression

This model does not have any useful parameters to tune, so we will not use hypertuning for it.

### Model #2: Ridge

#### Testing out different alphas

In [95]:
# Generate a random list of regularization parameters (alphas)
# Smaller alphas --> weaker regularization, higher alphas --> stronger regularization
alphas = np.logspace(-2, 8, 100)

# Create a list where you will store the information
model_ridge_coefficients = []
mse_train_ridge_list = []
mse_test_ridge_list = []

# Try each set of alphas in your Ridge model
for a in alphas:
    # Step 1: Pick the model type by initializing a model object and set the alpha from the list as a model parameter
    model_ridge = Ridge()
    model_ridge.set_params(alpha=a)

    # Step 2: Train the model by passing some data
    model_ridge.fit(X_train_selected, y_train)

    # Step 3: Get predictions
    y_pred_train_ridge_experiment = model_ridge.predict(X_train_selected)
    y_pred_test_ridge_experiment = model_ridge.predict(X_test_selected)

    # Step 4: Evaluate the model performance
    mse_train_ridge = mean_squared_error(y_train, y_pred_train_ridge_experiment)
    mse_test_ridge = mean_squared_error(y_test, y_pred_test_ridge_experiment)

    # Append all results to the lists so we could look at them and also plot them
    model_ridge_coefficients.append(model_ridge.coef_)
    mse_train_ridge_list.append(mse_train_ridge)
    mse_test_ridge_list.append(mse_test_ridge)


#Find and print the best alpha

# Find the lowest value of MSE test value
min_mse_test_ridge = np.min(mse_test_ridge_list)
print("Min MSE Test: ", min_mse_test_ridge)

# Find the index of the lowest MSE test value (use np.argmin)
min_mse_test_ridge_index = np.argmin(mse_test_ridge_list)
print("Index of Min MSE test: ", min_mse_test_ridge_index)

# Call alphas with the index of the lowest MSE test value
best_alpha = alphas[min_mse_test_ridge_index]
print("Best alpha: ", best_alpha)

Min MSE Test:  0.4344895504029588
Index of Min MSE test:  8
Best alpha:  0.06428073117284319


#### Use the best alpha in the model

In [96]:
# Step 1: Pick the model type by initializing a model object and set the alpha to the best alpha
model_ridge_best_alpha = Ridge()
model_ridge_best_alpha.set_params(alpha=best_alpha)

# Step 2: Train the model by passing some data. Again, make sure to pass pre-processed X_train and X_test
model_ridge_best_alpha.fit(X_train_selected, y_train)

# Step 3: Get predictions
y_pred_train_ridge_best_alpha = model_ridge_best_alpha.predict(X_train_selected)
y_pred_test_ridge_best_alpha = model_ridge_best_alpha.predict(X_test_selected)

# Step 4: Evaluate the model performance
r_squared_train_ridge_best_alpha = model_ridge_best_alpha.score(X_train_selected, y_train)
r_squared_test_ridge_best_alpha = model_ridge_best_alpha.score(X_test_selected, y_test)
mse_train_ridge_best_alpha = mean_squared_error(y_train, y_pred_train_ridge_best_alpha)
mse_test_ridge_best_alpha = mean_squared_error(y_test, y_pred_test_ridge_best_alpha)

print("Best Ridge Train R^2: ", round(r_squared_train_ridge_best_alpha, 3))
print("Best Ridge Test R^2: ", round(r_squared_test_ridge_best_alpha, 3))
print("Best Ridge Train MSE: ", round(mse_train_ridge_best_alpha, 3))
print("Best Ridge Test MSE: ", round(mse_test_ridge_best_alpha, 3))

Best Ridge Train R^2:  0.893
Best Ridge Test R^2:  0.76
Best Ridge Train MSE:  0.218
Best Ridge Test MSE:  0.434


### GridsearchCV Function

In [97]:
def grid_search_cv_regressor(model, X_train, y_train, X_test, y_test, parameters):
    # Step 1: Initialize a GridSearchCV object by passing your model, parameters and cv=10
    model_grid_search = GridSearchCV(model, parameters, cv=5)
    
    # Step 2: Train the GridSearchCV (it will build multiply models and return the best)
    model_grid_search.fit(X_train_selected, y_train)

    ## Step 2.1: Print the best parameters
    print("Best Parameters:", model_grid_search.best_params_)

    ## Step 2.2: Save the best model into a separate variable to be used later
    model_finetuned = model_grid_search.best_estimator_

    # Steps 3-4: Evaluate the model performance

    ## r-squared
    r_squared_train_finetuned = model_finetuned.score(X_train_selected, y_train)
    r_squared_test_finetuned = model_finetuned.score(X_test_selected, y_test)
    
    print('R-squared (Train):', round(r_squared_train_finetuned, 3))
    print('R-squared (Test):', round(r_squared_test_finetuned, 3))
    
    return model_finetuned, r_squared_train_finetuned, r_squared_test_finetuned

### References

1. Lab 3 Solutions
2. Lab 6 Solutions
3. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
4. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
5. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
6. Lab 8 Solutions