Use Lasso to Reduce Dimension

In [40]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler

In [41]:
df = pd.read_csv('../../data/combined_data/merged_052423.csv')

In [42]:
print(df.dtypes)
print("Shape of the Dataset: {}".format(df.shape))

country                        object
sect                            int64
t                               int64
source                         object
outp                          float64
outpd                         float64
outptt                        float64
gvco                          float64
gvcobp                        float64
gvcofp                        float64
gvcomix                       float64
gvcobp%                       float64
gvcofp%                       float64
gvcomix%                      float64
gvcobp_diff                   float64
gvcofp_diff                   float64
gvcomix_diff                  float64
Unnamed: 0                      int64
onset2COWCS                   float64
d2incidenceU                  float64
d3_6incidenceU                float64
onsetUCS                      float64
coup                          float64
periregular                   float64
milexp_pergdpSIPRI            float64
decade                        float64
ecgrowth    

Lasso Feature selection for outcome var Defense Share

In [43]:
def clean_data(df, feature_columns, target_columns):
    df_clean = df[feature_columns + target_columns]
    df_clean = df_clean.dropna()
    return df_clean

def get_features_targets(df_clean, feature_columns, target_name):
    x = df_clean[feature_columns].values
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    y = df_clean[target_name].values
    return x, y

def split_data(x, y, test_size=0.20, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state, stratify=y)
    return X_train, X_test, y_train, y_test

def perform_grid_search(x, y, params, cv_splits=5, random_state=42):
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=random_state)
    lasso = Lasso()
    lasso_cv = GridSearchCV(lasso, param_grid=params, cv=kf)
    lasso_cv.fit(x, y)
    return lasso_cv.best_params_

def get_column_names(df, feature_columns):
    return df[feature_columns].columns

def fit_model(X_train, y_train, best_alpha):
    lasso = Lasso(alpha=best_alpha)
    lasso.fit(X_train, y_train)
    return lasso

feature_columns = [ 'decade','religion_fractionalization',
                     'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff',
                   'logmountain', 'ethnic_fractionalization','logpopdens_diff',
                   'leg_british']
target_columns = ['milexp_pergdpSIPRI']
#  'oilreserves_full', 'oilreserves_public','language_fractionalization', 'opec''logpop_M_diff''logpopdens_diff',

df_clean = clean_data(df, feature_columns, target_columns)

target_name = 'milexp_pergdpSIPRI' 

x, y_bp = get_features_targets(df_clean, feature_columns, target_name)

X_train, X_test, y_train, y_test = split_data(x, y_bp)

params = {"alpha": np.arange(0.00001, 10, 0.1)}

best_params = perform_grid_search(x, y_bp, params)
print("Best Params:", best_params)

names = get_column_names(df, feature_columns)

lasso_model = fit_model(X_train, y_train, best_params['alpha'])




Best Params: {'alpha': 1e-05}


In [44]:
from sklearn.preprocessing import MinMaxScaler

def print_feature_importance(names, coefficients):
    for name, coef in zip(names, coefficients):
        print(f"{name}: {coef}")

for target_name in target_columns:
    print(f"\nAnalyzing target: {target_name}")

    x, y = get_features_targets(df_clean, feature_columns, target_name)

    X_train, X_test, y_train, y_test = split_data(x, y)

    best_params = perform_grid_search(x, y, params)
    print("Best Params:", best_params)

    lasso_model = fit_model(X_train, y_train, best_params['alpha'])

    print("Feature importances:")
    print_feature_importance(names, lasso_model.coef_)



Analyzing target: milexp_pergdpSIPRI
Best Params: {'alpha': 1e-05}
Feature importances:
decade: -0.3226722775700083
religion_fractionalization: -0.5059015693467603
logoutreg_diff: -0.055707067449951735
ecgrowth_demeaned: -0.9008740209437696
democracy_diff: -0.026083757113776338
logmountain: 0.30185122351326765
ethnic_fractionalization: -1.288577205819826
logpopdens_diff: 27.042445355941087
leg_british: 0.6859027444546643


Lasso Feature selection for outcome var Oil Reserves

In [45]:
def clean_data(df, feature_columns, target_columns):
    df_clean = df[feature_columns + target_columns]
    df_clean = df_clean.dropna()
    return df_clean

def get_features_targets(df_clean, feature_columns, target_name):
    x = df_clean[feature_columns].values
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)
    y = df_clean[target_name].values
    return x, y

def split_data(x, y, test_size=0.20, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state, stratify=y)
    return X_train, X_test, y_train, y_test

def perform_grid_search(x, y, params, cv_splits=5, random_state=42):
    kf = KFold(n_splits=cv_splits, shuffle=True, random_state=random_state)
    lasso = Lasso()
    lasso_cv = GridSearchCV(lasso, param_grid=params, cv=kf)
    lasso_cv.fit(x, y)
    return lasso_cv.best_params_

def get_column_names(df, feature_columns):
    return df[feature_columns].columns

def fit_model(X_train, y_train, best_alpha):
    lasso = Lasso(alpha=best_alpha)
    lasso.fit(X_train, y_train)
    return lasso


feature_columns = [ 'decade','religion_fractionalization',
                     'logoutreg_diff', 'ecgrowth_demeaned', 'democracy_diff',
                   'logmountain', 'ethnic_fractionalization','logpopdens_diff',
                   'leg_british']
target_columns = ['oilreserves']
#  'oilreserves_full', 'oilreserves_public','language_fractionalization', 'opec''logpop_M_diff'

df_clean = clean_data(df, feature_columns, target_columns)

target_name = 'oilreserves' 

x, y_bp = get_features_targets(df_clean, feature_columns, target_name)

X_train, X_test, y_train, y_test = split_data(x, y_bp)

params = {"alpha": np.arange(0.00001, 10, 0.1)}

best_params = perform_grid_search(x, y_bp, params)
print("Best Params:", best_params)

names = get_column_names(df, feature_columns)

lasso_model = fit_model(X_train, y_train, best_params['alpha'])




Best Params: {'alpha': 1e-05}


In [46]:
from sklearn.preprocessing import MinMaxScaler

def print_feature_importance(names, coefficients):
    for name, coef in zip(names, coefficients):
        print(f"{name}: {coef}")

for target_name in target_columns:
    print(f"\nAnalyzing target: {target_name}")

    x, y = get_features_targets(df_clean, feature_columns, target_name)

    X_train, X_test, y_train, y_test = split_data(x, y)

    best_params = perform_grid_search(x, y, params)
    print("Best Params:", best_params)

    lasso_model = fit_model(X_train, y_train, best_params['alpha'])

    print("Feature importances:")
    print_feature_importance(names, lasso_model.coef_)
    


Analyzing target: oilreserves
Best Params: {'alpha': 1e-05}
Feature importances:
decade: -0.12037126744379846
religion_fractionalization: -15.705552441590484
logoutreg_diff: -1.0667419192869902
ecgrowth_demeaned: -18.368003890582003
democracy_diff: 3.490869592763109
logmountain: 5.1674361902655574
ethnic_fractionalization: -10.721026747310745
logpopdens_diff: 107.65225538261951
leg_british: 7.1822004454507935
