### Question 2: Are women more likely to complete secondary education in some countries than others? In the coming years, what percentage of women overall and by country, do we expect to enroll in secondary education? What factors indicate whether or not a women completes secondary education?

### Import packages and data

In [12]:
#import packages

# general
import numpy as np
import pandas as pd
import time

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report

# visualization
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
#load in the dataset
df = pd.read_csv('transformed_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Year,A woman can be head of household in the same way as a man (1=yes; 0=no),A woman can choose where to live in the same way as a man (1=yes; 0=no),A woman can get a job in the same way as a man (1=yes; 0=no),A woman can obtain a judgment of divorce in the same way as a man (1=yes; 0=no),A woman can open a bank account in the same way as a man (1=yes; 0=no),A woman can register a business in the same way as a man (1=yes; 0=no),A woman can sign a contract in the same way as a man (1=yes; 0=no),A woman can travel outside her home in the same way as a man (1=yes; 0=no),...,Country Name_Ukraine,Country Name_United Arab Emirates,Country Name_United Kingdom,Country Name_United States,Country Name_Uruguay,Country Name_Uzbekistan,"Country Name_Venezuela, RB",Country Name_Viet Nam,Country Name_West Bank and Gaza,Country Name_Zimbabwe
0,0,-1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,-0.7,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
2,2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,3,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,4,0.6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0


### Prepare the data for the models

In [56]:
#create X and y dataframes
X = df.drop(columns='School enrollment, secondary, female (% gross)')
y = df[['School enrollment, secondary, female (% gross)']]

In [57]:
#split data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_test

Unnamed: 0,"School enrollment, secondary, female (% gross)"
881,-0.527802
383,0.204738
112,-1.159866
181,0.250578
342,0.493991
...,...
330,3.236547
529,-0.380202
413,-0.107341
173,-0.265689


#### Feature engineering

In [43]:
#initialize sfs, use linear regression as the baseline model for this question
sfs = SequentialFeatureSelector(estimator = LinearRegression(),
                                n_features_to_select = "auto",
                                direction = 'forward',
                                scoring = 'neg_mean_squared_error',
                                cv = 10)

#fit the data to sfs
sfs = sfs.fit(X_train, y_train)

#retrieve the and print the names of the selected features
feature_names = np.array(df.columns.difference(['School enrollment, secondary, female (% gross)']))
selected_feature_names = feature_names[sfs.get_support()].tolist()
print("Selected features:", selected_feature_names)

# transform X_train and X_test to include only the selected features
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)

# display the shape of transformed X_train_selected and X_test_selected
print("Transformed X_train shape:", X_train_selected.shape)
print("Transformed X_test shape:", X_test_selected.shape)

Selected features: ['A woman can be head of household in the same way as a man (1=yes; 0=no)', 'A woman has the same rights to remarry as a man (1=yes; 0=no)', 'Age dependency ratio (% of working-age population)', 'Country Name_Algeria', 'Country Name_Angola', 'Country Name_Australia', 'Country Name_Belarus', 'Country Name_Belgium', 'Country Name_Belize', 'Country Name_Benin', 'Country Name_Bulgaria', 'Country Name_Burkina Faso', 'Country Name_Cabo Verde', 'Country Name_Cambodia', 'Country Name_Chad', 'Country Name_Colombia', 'Country Name_Denmark', 'Country Name_Ecuador', 'Country Name_El Salvador', 'Country Name_Eswatini', 'Country Name_Ethiopia', 'Country Name_France', 'Country Name_Gambia, The', 'Country Name_Georgia', 'Country Name_Grenada', 'Country Name_Guatemala', 'Country Name_Hong Kong SAR, China', 'Country Name_Hungary', 'Country Name_India', 'Country Name_Ireland', 'Country Name_Jordan', 'Country Name_Kenya', 'Country Name_Kiribati', 'Country Name_Lao PDR', 'Country Name_La

### Initial Baseline Model for Linear Regression:

In [6]:
#create a linear regression model using the selected features

#initialize the model
lr_model = LinearRegression()

#fit the model
lr_model.fit(X_train_selected, y_train)

#making predictions on the training and test sets
y_pred_train = lr_model.predict(X_train_selected)
y_pred_test = lr_model.predict(X_test_selected)

#evaluate the model using MSE and R squared
mse_train = mean_squared_error(y_train,y_pred_train)
mse_test = mean_squared_error(y_test,y_pred_test)
r_sq_train = lr_model.score(X_train_selected, y_train)
r_sq_test = lr_model.score(X_test_selected, y_test)

#print the MSE and R_squared
print('MSE (Train): ', round(mse_train, 3))
print('MSE (Test): ', round(mse_test, 3))
print('R-Squared (Train): ', round(r_sq_train, 3))
print('R-Squared (Test): ', round(r_sq_test, 3))

MSE (Train):  0.217
MSE (Test):  0.435
R-Squared (Train):  0.893
R-Squared (Test):  0.76


> The initial model does fairly okay with an r-squared of 0.876 for the training data and 0.752 for the test data. Because the test r-squared is quite a bit lower than the train, there could be some concern of overfitting. Additionally, the MSE of the test data is much worse than it is for the training data. For a simple linear regression, this model does not perform too bad, but it will be interesting to see how more advanced models perform in comparison.

### Function for the Models 

In [44]:
def question2(model_type, X_train, y_train, X_test, y_test):

    # Step 1: Initialize a model object
    if model_type == 'Linear Regression':
      model = LinearRegression()
    elif model_type == 'Ridge':
      model = Ridge()
    elif model_type == 'Lasso':
      model = Lasso()
    elif model_type == 'Random Forest':
      model = RandomForestRegressor()
    elif model_type == 'Gradient Boosting':
      model = GradientBoostingRegressor()
    elif model_type == 'ElasticNet':
      model = ElasticNet()
    else:
      print('Please specify the model type')

    # Step 2: Train the model
    model.fit(X_train, y_train)

    # Step 3: Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Step 4: Evaluate the model performance

    ## MSE
    #evaluate the model using MSE and R squared
    mse_train = mean_squared_error(y_train,y_pred_train)
    mse_test = mean_squared_error(y_test,y_pred_test)
    r_sq_train = model.score(X_train, y_train)
    r_sq_test = model.score(X_test, y_test)

    #print the MSE and R_squared
    print('MSE (Train): ', round(mse_train, 3))
    print('MSE (Test): ', round(mse_test, 3))
    print('R-Squared (Train): ', round(r_sq_train, 3))
    print('R-Squared (Test): ', round(r_sq_test, 3))

    return mse_train, mse_test, r_sq_train, r_sq_test

### Model #1: Linear Regression

In [53]:
#running the model with the selected features
print('Linear Regression:')
mse_train_lr, mse_test_lr, r_sq_train_lr, r_sq_test_lr = question2('Linear Regression', X_train_selected, y_train, X_test_selected, y_test)

Linear Regression:
MSE (Train):  0.217
MSE (Test):  0.435
R-Squared (Train):  0.893
R-Squared (Test):  0.76


### Model #2: Regularized Regression Models

In [54]:
#running the ridge model with the selected features
print('Ridge Model:')
mse_train_ridge, mse_test_ridge, r_sq_train_ridge, r_sq_test_ridge = question2('Ridge', X_train_selected, y_train, X_test_selected, y_test)
print('\n')

#running the lasso model with the selected features
print('Lasso Model:')
mse_train_lasso, mse_test_lasso, r_sq_train_lasso, r_sq_test_lasso = question2('Lasso', X_train_selected, y_train, X_test_selected, y_test)


Ridge Model:
MSE (Train):  0.236
MSE (Test):  0.454
R-Squared (Train):  0.884
R-Squared (Test):  0.749


Lasso Model:
MSE (Train):  1.789
MSE (Test):  1.679
R-Squared (Train):  0.122
R-Squared (Test):  0.072


### Model #3: Ensemble Methods

In [58]:
#reshape the target variable
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

#running the random forest model with the selected features
print('Random Forest Regressor:')
mse_train_rf, mse_test_rf, r_sq_train_rf, r_sq_test_rf = question2('Random Forest', X_train_selected, y_train, X_test_selected, y_test)
print('\n')

#running the gradient boosting model with the selected features
print('Gradient Boosting Regressor:')
mse_train_gb, mse_test_gb, r_sq_train_gb, r_sq_test_gb = question2('Gradient Boosting', X_train_selected, y_train, X_test_selected, y_test)


Random Forest Regressor:
MSE (Train):  0.355
MSE (Test):  1.881
R-Squared (Train):  0.838
R-Squared (Test):  -0.533


Gradient Boosting Regressor:
MSE (Train):  1.187
MSE (Test):  1.6
R-Squared (Train):  0.457
R-Squared (Test):  -0.304


### Model #4: ElasticNet

In [59]:
#running the elasticnet model with the selected features
print('ElasticNet Model:')
mse_train_en, mse_test_en, r_sq_train_en, r_sq_test_en = question2('ElasticNet', X_train_selected, y_train, X_test_selected, y_test)

ElasticNet Model:
MSE (Train):  2.181
MSE (Test):  1.243
R-Squared (Train):  0.001
R-Squared (Test):  -0.013


### Create a DF from the models

In [61]:
# Create a dictionary to hold evauluation metrics from the models
data = {

    'Model': [
        'Linear Regression',
        'Ridge',
        'Lasso',
        'Random Forest',
        'Gradient Boosting',
        'ElasticNet'
        ],


    'R Squared Train': [
        r_sq_train_lr,
        r_sq_train_ridge,
        r_sq_train_lasso,
        r_sq_train_rf,
        r_sq_train_gb,
        r_sq_train_en
        ],

    'R Squared Test': [
        r_sq_test_lr,
        r_sq_test_ridge,
        r_sq_test_lasso,
        r_sq_test_rf,
        r_sq_test_gb,
        r_sq_test_en
        ],

    'MSE Train': [
        mse_train_lr,
        mse_train_ridge,
        mse_train_lasso,
        mse_train_rf,
        mse_train_gb,
        mse_train_en
        ],

    'MSE Test': [
        mse_test_lr,
        mse_test_ridge,
        mse_test_lasso,
        mse_test_rf,
        mse_test_gb,
        mse_test_en
    ]
    }

# Create and display a dataframe
results_df = pd.DataFrame(data)
results_df

Unnamed: 0,Model,R Squared Train,R Squared Test,MSE Train,MSE Test
0,Linear Regression,0.893322,0.759778,0.217366,0.434821
1,Ridge,0.884281,0.749199,0.235788,0.453969
2,Lasso,0.122136,0.072213,1.788726,1.679366
3,Random Forest,0.83762,-0.53253,0.354678,1.880867
4,Gradient Boosting,0.456781,-0.303592,1.186522,1.599893
5,ElasticNet,0.001468,-0.013018,2.181037,1.243273


### References

1. Lab 3 Solutions
2. Lab 6 Solutions
3. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
4. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
5. https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html