In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline, Pipeline

## Load data and create dataframes

In [2]:
root_df = pd.read_csv('../input/train.csv')
X = root_df.drop(['id', 'target'], axis=1)
y = root_df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Data is imbalanced!

So lets use the 'roc_auc' metric to account for this


In [3]:
y.value_counts()

1.0    160
0.0     90
Name: target, dtype: int64

## Build a pipeline / classifier

In [4]:
pipe = Pipeline(steps=[
    ('pre', None),
    ('clf', LogisticRegression(solver='liblinear')),
    ]
)

params = {  
    'pre': [
        None, StandardScaler(), MinMaxScaler(),
    ],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [000.1, 00.1, 0.1, 1, 10], 
    'clf__class_weight': [None, 'balanced']
}
clf = GridSearchCV(pipe, param_grid=params, scoring='roc_auc', cv=8, n_jobs=-1)

## Train and score classifier

In [5]:
clf.fit(X_train, y_train)

GridSearchCV(cv=8, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pre', None), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'pre': [None, StandardScaler(copy=True, with_mean=True, with_std=True), MinMaxScaler(copy=True, feature_range=(0, 1))], 'clf__penalty': ['l1', 'l2'], 'clf__C': [0.1, 0.1, 0.1, 1, 10], 'clf__class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [6]:
clf.best_estimator_

Pipeline(memory=None,
     steps=[('pre', None), ('clf', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [7]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.7572062084257206

## Export Data

In [8]:
test_df = pd.read_csv('../input/test.csv')
X = test_df.drop(['id'], axis=1)

In [9]:
predictions = clf.predict_proba(X)[:,1]
submission = {
    "id": test_df['id'],
    "target": predictions
}
submission = pd.DataFrame(submission)

In [10]:
submission.to_csv('submission.csv', index=False)