In [10]:
import pandas as pd
import numpy as np
import sklearn
import collections


from sklearn.linear_model import Lasso, LassoCV, Ridge, RidgeCV
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Read the train and test sets from files; validation folds are split from the train set later.

In [3]:
# Both splits are tab-separated files and are read with identical options.
X_train, X_test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('input/train.tsv', 'input/test.tsv')
)

In [4]:
# Separate the label from the model inputs. `user_id` is removed together with
# the label so that it is not fed to the model as a feature.
NON_FEATURE_COLUMNS = ['target', 'user_id']

# `drop` returns a new frame here instead of mutating the loaded one in place.
y_train = X_train['target'].copy()
X_train = X_train.drop(columns=NON_FEATURE_COLUMNS)

y_test = X_test['target'].copy()
X_test = X_test.drop(columns=NON_FEATURE_COLUMNS)

In [7]:
# Number of cross-validation folds; the summed per-fold MSE is later divided
# by this value to obtain the mean.
number_of_folds = 10

# Fixed seed: with shuffle=True and no random_state the fold assignment (and
# therefore the selected alpha) changes on every run.
RANDOM_SEED = 42

kfold = StratifiedKFold(
    n_splits=number_of_folds,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [15]:
# Maps each candidate alpha to its validation MSE summed over all folds
# (Lasso_finetuning adds one fold's MSE per call).
lasso_dict = collections.defaultdict(float)

for train_index, test_index in kfold.split(X_train, y_train):
    # split() yields positional indices, so features AND target are both
    # selected by position; label-based `y_train[...]` only happens to work
    # while the index is the default 0..n-1 range.
    trainX, valX = X_train.iloc[train_index], X_train.iloc[test_index]
    trainY, valY = y_train.iloc[train_index], y_train.iloc[test_index]
    lasso_dict = Lasso_finetuning(trainX, trainY, valX, valY, lasso_dict)

In [25]:
# Turn the per-alpha MSE sums into means over the folds.
mean_mse_by_alpha = {
    candidate: total_mse / number_of_folds
    for candidate, total_mse in lasso_dict.items()
}

# Re-bind lasso_dict to a plain dict ordered from lowest to highest mean MSE;
# its first key is therefore the best alpha.
lasso_dict = dict(sorted(mean_mse_by_alpha.items(), key=lambda pair: pair[1]))
alpha = next(iter(lasso_dict))

In [26]:
# Show the selected alpha (the first key of lasso_dict after sorting by mean MSE).
print(alpha)

0.0001


Fit Lasso with the best alpha (0.0001) and keep the features whose coefficient is non-zero.

In [30]:
# Refit Lasso on the full training set with the selected alpha.
lasso = Lasso(alpha=alpha, max_iter=10000)
lasso.fit(X_train, y_train)

# One coefficient per input column, labelled with the column name.
ser = pd.Series(lasso.coef_, index=X_train.columns)

# Keep the names of the columns whose coefficient is not zero.
features = ser[ser.abs() != 0].index.tolist()

Our new **X_train** and **X_test** after Lasso feature selection:

In [32]:
# Restrict both splits to the columns selected by Lasso, in the same order.
X_train = X_train.loc[:, features]
X_test = X_test.loc[:, features]

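**lasso_dict** contains all pairs of "mse" : "features with zero correlation". (Note: in the code shown in this notebook, **lasso_dict** maps each alpha to its mean validation MSE, so the output below appears to come from a different version of the cells.)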

In [75]:
# Display the current contents of lasso_dict.
lasso_dict

{0.1114311972973531: [],
 0.11256462212376686: ['graph_dim_4', 'graph_dim_95', 'graph_dim_117'],
 0.11157049853712421: ['graph_dim_54'],
 0.11019620817951874: ['graph_dim_54', 'graph_dim_68'],
 0.10827877599015043: ['graph_dim_4', 'graph_dim_75'],
 0.11527118462141611: ['graph_dim_4',
  'graph_dim_9',
  'graph_dim_68',
  'graph_dim_75',
  'graph_dim_117'],
 0.11325168913993625: ['graph_dim_9', 'graph_dim_75', 'graph_dim_117'],
 0.11005738005577191: ['graph_dim_4', 'graph_dim_68', 'graph_dim_92'],
 0.11313180816450306: ['graph_dim_75',
  'graph_dim_92',
  'graph_dim_95',
  'graph_dim_117'],
 0.11353996158652488: ['graph_dim_9']}

**Lasso_finetuning**: for each candidate alpha, fits a Lasso model on the training fold and adds its MSE on the validation fold to **lasso_dict**; returns the updated dictionary.

In [14]:
def Lasso_finetuning(X_train, Y_train, X_test, Y_test, lasso_dict, alphas=None):
    """Add one fold's validation MSE for every candidate alpha to ``lasso_dict``.

    For each alpha a fresh ``Lasso(max_iter=10000, alpha=alpha)`` model is
    fitted on ``(X_train, Y_train)`` and scored with mean squared error on
    ``(X_test, Y_test)``. The score is added to ``lasso_dict[alpha]``, so
    calling this once per cross-validation fold with the same dictionary
    leaves the per-alpha MSE summed over the folds.

    Parameters
    ----------
    X_train, Y_train
        Features and target the model is fitted on.
    X_test, Y_test
        Held-out features and target the model is scored on.
    lasso_dict
        Mutable mapping ``alpha -> accumulated MSE``. Missing alphas start
        from 0.0, so both a plain ``dict`` and a ``defaultdict`` work.
    alphas
        Optional iterable of alpha values to evaluate. When omitted, the
        grid ``np.arange(0.0001, 0.1, 0.001)`` is used.

    Returns
    -------
    The same ``lasso_dict`` object, updated in place.
    """
    if alphas is None:
        alphas = np.arange(0.0001, 0.1, 0.001)

    for a_param in alphas:
        lasso = Lasso(max_iter=10000, alpha=a_param)
        lasso.fit(X_train, Y_train)
        mse = mean_squared_error(Y_test, lasso.predict(X_test))
        # .get() with a default keeps the accumulation working for mappings
        # that do not create missing keys on their own.
        lasso_dict[a_param] = lasso_dict.get(a_param, 0.0) + mse

    return lasso_dict