In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso,LinearRegression,LassoCV,LassoLars
from sklearn.metrics import mean_squared_error

# Create simulated data

### Parameters

In [None]:
# True coefficient vector as a (10, 1) column: six zeros followed by 1, 0.01, 10 and 100.
true_par = np.array([0, 0, 0, 0, 0, 0, 1, 0.01, 10, 100]).reshape(-1, 1)
p = 10                       # number of features (matches the length of true_par)

# Training sample sizes are range(n_train_min, n_train_max, step): 10, 20, ..., 2000.
step = 10
n_train_min = 10
n_train_max = 2000 + step    # exclusive upper bound, so n = 2000 is still included
n_test = 150
iteration_times = 30         # number of simulated datasets per sample size
gamma = [1, 5, 10]           # exponents r used when computing the weights |ols_coef|**(-r)

### Training data

In [None]:
# Simulate `iteration_times` training sets for every sample size and store each
# one as datasets/adaptive_lasso/n=<size>/iteration=<k>.csv.
np.random.seed(0)  # fixed seed so a fresh "Restart & Run All" regenerates identical data
std_scalar = StandardScaler()
for i in range(n_train_min, n_train_max, step):
    out_dir = "datasets/adaptive_lasso/n=" + str(i)
    # exist_ok=True replaces the separate isdir() check and is safe on re-runs.
    os.makedirs(out_dir, exist_ok=True)

    for iteration in range(iteration_times):
        # Standard-normal features, standardized column by column.
        X = np.random.randn(i, p)
        X_standardized = std_scalar.fit_transform(X)
        # Gaussian noise with mean 0 and standard deviation 0.75.
        e = np.random.normal(0, 0.75, (i, 1))
        y = X_standardized @ true_par + e
        # Centre the response; the later cells fit every model with fit_intercept=False.
        y_demean = y - y.mean()

        df = pd.DataFrame(X_standardized)
        df["error"] = e
        # NOTE: despite its name, the "prediction" column holds the demeaned response y.
        # The name is kept because the later cells read this column.
        df["prediction"] = y_demean
        df.to_csv(out_dir + "/iteration=" + str(iteration + 1) + ".csv")

# Perform OLS and compute the weights used in adaptive lasso

### Perform OLS

In [None]:
# Fit OLS (no intercept) on every simulated dataset and store the coefficients
# as an extra row labelled "ols_coef" at the bottom of the same CSV file.
for i in range(n_train_min, n_train_max, step):
    for iteration in range(iteration_times):
        csv_path = "datasets/adaptive_lasso/n=" + str(i) + "/iteration=" + str(iteration + 1) + ".csv"
        # Keep only the first i rows, i.e. the observations. The file is rewritten in
        # place, so on a re-run it already contains rows appended earlier ("ols_coef",
        # weights); without this slice those rows would leak into X and put NaN into y.
        data = pd.read_csv(csv_path, index_col=0).iloc[:i]
        X = data.iloc[:, :p].values
        y = data.loc[:, "prediction"].values

        ols_reg = LinearRegression(fit_intercept=False)
        ols_reg.fit(X, y)
        # One row, one column per feature; "error" and "prediction" get NaN after concat.
        coef_row = pd.DataFrame([ols_reg.coef_], index=["ols_coef"], columns=data.columns[:p])
        df = pd.concat([data, coef_row])
        df.to_csv(csv_path)

### Compute weights

In [None]:
# For every dataset, turn the stored OLS coefficients into weights
# |ols_coef| ** (-r), one row per exponent r in `gamma`, and append those rows
# (labelled "weights(r=<r>)") to the same CSV file.
for i in range(n_train_min, n_train_max, step):
    for iteration in range(iteration_times):
        csv_path = "datasets/adaptive_lasso/n=" + str(i) + "/iteration=" + str(iteration + 1) + ".csv"
        df = pd.read_csv(csv_path, index_col=0)

        weight_labels = ["weights(r=" + str(r) + ")" for r in gamma]
        # Drop weight rows left by an earlier run so re-running the cell does not
        # stack duplicate rows at the bottom of the file.
        df = df.drop(index=weight_labels, errors="ignore")

        ols_coef = df.loc["ols_coef", :].values
        # Build all weight rows at once. DataFrame.append was removed in pandas 2.0,
        # and growing a frame row by row is unnecessary here.
        # The "error"/"prediction" entries of ols_coef are NaN, so their weights are NaN too.
        weight_df = pd.DataFrame(
            [abs(ols_coef) ** (-r) for r in gamma],
            index=weight_labels,
            columns=df.columns,
        )
        df = pd.concat([df, weight_df])
        df.to_csv(csv_path)

# Adjust features (X)

In [None]:
# For every dataset and every exponent r in `gamma`, divide each feature column
# by its weight and save the result under
# datasets/adaptive_lasso/n=<size>/adjusted/iteration=<k>_r=<r>.csv.
for i in range(n_train_min, n_train_max, step):
    adjusted_dir = "datasets/adaptive_lasso/n=" + str(i) + "/adjusted"
    # Created once per sample size; exist_ok makes re-runs safe.
    os.makedirs(adjusted_dir, exist_ok=True)

    for iteration in range(iteration_times):
        df = pd.read_csv("datasets/adaptive_lasso/n=" + str(i) + "/iteration=" + str(iteration + 1) + ".csv", index_col=0)
        X = df.iloc[:i, :p].values
        # Row i is the "ols_coef" row; the rows after it hold one weight vector per
        # entry of `gamma`, in the same order.
        weights = df.iloc[(i + 1):, :p].values

        for r_idx, r in enumerate(gamma):
            # Broadcasting divides column j of X by weights[r_idx, j].
            adjusted_df = pd.DataFrame(X / weights[r_idx], columns=[str(col) for col in range(p)])
            adjusted_df.to_csv(adjusted_dir + "/iteration=" + str(iteration + 1) + "_r=" + str(r) + ".csv")

# Perform Lasso

In [None]:
# Fit a plain lasso on every dataset and record the mean squared error between
# the estimated and the true coefficients.
# Result: mse_df_lasso has one row per sample size ("n=<size>") and one column per iteration.
mse_by_n = {}
for i in range(n_train_min, n_train_max, step):
    mse_ls = []
    for iteration in range(iteration_times):
        df = pd.read_csv("datasets/adaptive_lasso/n=" + str(i) + "/iteration=" + str(iteration + 1) + ".csv", index_col=0)
        # Only the first i rows are observations; the rows below hold OLS coefficients and weights.
        X = df.iloc[:i, :p].values
        # "prediction" is the column that stores the demeaned response.
        y = df["prediction"].iloc[:i].values

        lasso_reg = Lasso(alpha=0.5, max_iter=5000, fit_intercept=False)
        lasso_reg.fit(X, y)
        lasso_coef = lasso_reg.coef_
        # Flatten true_par so both arguments are 1-D vectors of length p.
        mse = mean_squared_error(true_par.ravel(), lasso_coef)
        mse_ls.append(mse)
    mse_by_n["n=" + str(i)] = mse_ls

# Build the frame once instead of appending row by row
# (DataFrame.append was removed in pandas 2.0).
mse_df_lasso = pd.DataFrame.from_dict(mse_by_n, orient="index")

# Perform adaptive lasso

In [None]:
# Adaptive lasso: fit a lasso on the weight-adjusted features, rescale the
# coefficients by the same weights, and record the mean squared error against
# the true coefficients.
# Result: one frame per exponent in `gamma`, each with one row per sample size
# ("n=<size>") and one column per iteration.
alasso_mse = {r: {} for r in gamma}
for i in range(n_train_min, n_train_max, step):
    mse_per_gamma = {r: [] for r in gamma}
    for iteration in range(iteration_times):
        # The response and the weights do not depend on r, so read them once per dataset.
        df_original = pd.read_csv("datasets/adaptive_lasso/n=" + str(i) + "/iteration=" + str(iteration + 1) + ".csv", index_col=0)
        y = df_original.iloc[:i, -1].values
        # Rows after the "ols_coef" row (row i) hold one weight vector per entry of `gamma`.
        all_weights = df_original.iloc[(i + 1):, :p].values

        for r_, r in enumerate(gamma):
            df_adjusted = pd.read_csv("datasets/adaptive_lasso/n=" + str(i) + "/adjusted/iteration=" + str(iteration + 1) + "_r=" + str(r) + ".csv", index_col=0)
            X = df_adjusted.iloc[:i, :p].values
            weights = all_weights[r_, :]

            lasso_reg = Lasso(alpha=0.5, max_iter=5000, fit_intercept=False)
            lasso_reg.fit(X, y)
            # The features were divided by the weights, so the coefficients are
            # divided by the same weights to express them on the original scale.
            alasso_coef = lasso_reg.coef_ / weights

            mse = mean_squared_error(true_par.ravel(), alasso_coef)
            mse_per_gamma[r].append(mse)
    for r in gamma:
        alasso_mse[r]["n=" + str(i)] = mse_per_gamma[r]

# Build each frame once instead of appending row by row (DataFrame.append was
# removed in pandas 2.0). The three names assume `gamma` has exactly three entries.
mse_df_alasso1, mse_df_alasso5, mse_df_alasso10 = [
    pd.DataFrame.from_dict(alasso_mse[r], orient="index") for r in gamma
]

# Plot Figure_2 

In [None]:
# Average each method's MSE over the iterations (row-wise mean) and put the four
# resulting series side by side, one column per method and one row per sample size.
mse_frames = [mse_df_lasso, mse_df_alasso1, mse_df_alasso5, mse_df_alasso10]
method_names = ["lasso", "alasso(r=1)", "alasso(r=5)", "alasso(r=10)"]
row_means = [frame.mean(axis=1) for frame in mse_frames]
mean_mse_table = pd.concat(row_means, axis=1, keys=method_names)
mean_mse_table

In [None]:
# Plot the mean coefficient MSE of every method against the training sample size.
fig, ax = plt.subplots(figsize=(10, 8))
# The smallest sample size (n_train_min) is left out of the figure. The x values are
# derived from the parameters so they stay aligned with the iloc[1:] slice below.
sample_sizes = range(n_train_min + step, n_train_max, step)
for name in mean_mse_table.columns:
    ax.plot(sample_sizes, mean_mse_table[name].iloc[1:], label=name)
ax.set_xlabel("Sample Size", size=15)
ax.set_ylabel("mean MSE", size=15)
ax.legend(bbox_to_anchor=(1, 1), loc="upper right", borderaxespad=1)
fig.savefig("datasets/adaptive_lasso/mean_coef_mse.png")
plt.show()