In [1]:
import os
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#for classification_report
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import precision_recall_fscore_support
from tabulate import tabulate

In [2]:
def classification_report(y_true, y_pred, labels=None, 
                          target_names=['0', '1'],
                          sample_weight=None, digits=4, tablfmt='pipe'):
    
    floatfmt = '.{:}f'.format(digits)
    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)

    if target_names is not None and len(labels) != len(target_names):
        print(
            "labels size, {0}, does not match size of target_names, {1}"
            .format(len(labels), len(target_names))
        )

    last_line_heading = 'avg / total'

    if target_names is None:
        target_names = [u'%s' % l for l in labels]

    headers = ["precision", "recall", "f1-score", "support"]

    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  labels=labels,
                                                  average=None,
                                                  sample_weight=sample_weight)

    rows = zip(target_names, p, r, f1, s)
    tbl_rows = []
    for row in rows:
        tbl_rows.append(row)

    # compute averages
    last_row = (last_line_heading,
                np.average(p, weights=s),
                np.average(r, weights=s),
                np.average(f1, weights=s),
                np.sum(s))
    tbl_rows.append(last_row)
    return tabulate(tbl_rows, headers=headers,
                    tablefmt=tablfmt, floatfmt=floatfmt)

In [3]:
# Числовые признаки
num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

# Категориальные признаки
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

no_cat = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent',
    'Churn'
]

feature_cols = num_cols + cat_cols
col_count = len(feature_cols)
unique_cat = []
target_col = 'Churn'
error_col = 'TotalSpent'

In [4]:
def get_unique_cat(df, cat_cols):
    
    global unique_cat
    unique_cat = [] #debag
    
    for col in cat_cols:
        unique_cat += [f'{col}_{item}' for item, cat in enumerate(list(df[col].unique()))]
    
    return unique_cat
# len(list(df[col].unique()))

In [5]:
def df_constructor(main_df, encode_arr, cols):
    global num_cols
    
    df_encoded = pd.DataFrame(encode_arr, columns=cols)
    
    main_df = main_df[no_cat].reset_index(drop=True)
    full_df = main_df.join(df_encoded)
    return full_df

In [6]:
# DATA IMPORT

data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
# похоже в этом столбце некорректный формат данных
data[error_col] =  pd.to_numeric(data[error_col], errors='coerce')

In [7]:
# DATASET SPLIT

y = data[target_col]

features_list = list(data)
features_list.remove(target_col)

X = data[features_list]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.9,
                                                    random_state=42)

# X_submis = X_submis.to_numpy()

In [8]:
# DATA ENCODING

C_l = [100, 10, 1, 0.1, 0.01, 0.001]

param_grid = {
    'logreg__class_weight':[None, 'balanced'],
    'logreg__C':C_l,
    'preprocessor__num__imputer__strategy':['mean','median']
#     'logreg__C': C_l,
#     'solver':['lbfgs', 'sag', 'newton-cg','liblinear'],
}

num_features = num_cols

# preprocessing pipline for numeric data
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('numerical', StandardScaler())])

categorical_features = unique_cat

# preprocessing pipline for categorical data
categorical_transformer = Pipeline(steps=[
    ('encoding', LabelEncoder()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

# использует трансвормеры на отдельных колонках
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', categorical_transformer, categorical_features)])


pipe =  Pipeline(steps=[('preprocessor', preprocessor),
                   ("logreg", LogisticRegression())
                   ])

# pipe.fit(X_train, y_train)

search = GridSearchCV(pipe, 
                      param_grid,
                      scoring='roc_auc',
                      cv=10,
                      refit=True)

search.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('numerical',
                                                                                          StandardScaler())]),
                                                                         ['ClientPeriod',
                                                                          'MonthlySpending',
                                                                          'TotalSpent']),
                                                                        ('cat',
                                                   

In [9]:
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'logreg', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__num', 'preprocessor__cat', 'preprocessor__num__memory', 'preprocessor__num__steps', 'preprocessor__num__verbose', 'preprocessor__num__imputer', 'preprocessor__num__numerical', 'preprocessor__num__imputer__add_indicator', 'preprocessor__num__imputer__copy', 'preprocessor__num__imputer__fill_value', 'preprocessor__num__imputer__missing_values', 'preprocessor__num__imputer__strategy', 'preprocessor__num__imputer__verbose', 'preprocessor__num__numerical__copy', 'preprocessor__num__numerical__with_mean', 'preprocessor__num__numerical__with_std', 'preprocessor__cat__memory', 'preprocessor__cat__steps', 'preprocessor__cat__verbose', 'preprocessor__cat__encoding', 'preprocessor__cat__onehot', 'pre

In [10]:
print("Tuned Hyperparameters :", search.best_params_)
print("Accuracy :",search.best_score_)

Tuned Hyperparameters : {'logreg__C': 100, 'logreg__class_weight': 'balanced', 'preprocessor__num__imputer__strategy': 'median'}
Accuracy : 0.8095276373232503


In [11]:
print("model score: %.3f" % search.score(X_test, y_test))

model score: 0.747


In [12]:
# test set download
X_submis = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/test.csv')
X_submis[error_col] =  pd.to_numeric(X_submis[error_col], errors='coerce')
median = X_submis[error_col].sum()//len(X_submis[error_col])
X_submis = X_submis.fillna(median)

submission = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/submission.csv')

In [13]:
preds_train = search.predict(X_submis)

In [14]:
new_df = pd.DataFrame(preds_train)
print(preds_train.shape)

(1761,)


In [15]:
submission[target_col] = new_df
os.chdir('/kaggle/working/')
submission.to_csv('submission_.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
