In [1]:
import os
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#for classification_report
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import precision_recall_fscore_support
from tabulate import tabulate

In [2]:
def classification_report(y_true, y_pred, labels=None, 
                          target_names=['0', '1'],
                          sample_weight=None, digits=4, tablfmt='pipe'):
    
    floatfmt = '.{:}f'.format(digits)
    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)

    if target_names is not None and len(labels) != len(target_names):
        print(
            "labels size, {0}, does not match size of target_names, {1}"
            .format(len(labels), len(target_names))
        )

    last_line_heading = 'avg / total'

    if target_names is None:
        target_names = [u'%s' % l for l in labels]

    headers = ["precision", "recall", "f1-score", "support"]

    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  labels=labels,
                                                  average=None,
                                                  sample_weight=sample_weight)

    rows = zip(target_names, p, r, f1, s)
    tbl_rows = []
    for row in rows:
        tbl_rows.append(row)

    # compute averages
    last_row = (last_line_heading,
                np.average(p, weights=s),
                np.average(r, weights=s),
                np.average(f1, weights=s),
                np.sum(s))
    tbl_rows.append(last_row)
    return tabulate(tbl_rows, headers=headers,
                    tablefmt=tablfmt, floatfmt=floatfmt)

In [3]:
# Числовые признаки
num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

# Категориальные признаки
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

no_cat = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent',
    'Churn'
]

feature_cols = num_cols + cat_cols
col_count = len(feature_cols)
unique_cat = []
target_col = 'Churn'
error_col = 'TotalSpent'

In [4]:
# GET_DUMMIES VERSION

# train set download
data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
data = data.dropna()

# похоже в этом столбце некорректный формат данных
data[error_col] =  pd.to_numeric(data[error_col], errors='coerce')
data = data.dropna()

# numeric data settings
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

# categorical data settings
data[cat_cols] = data[cat_cols].apply(LabelEncoder().fit_transform)
data = pd.get_dummies(data, columns=cat_cols)

# test set download
X_submis = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/test.csv')
X_submis[error_col] =  pd.to_numeric(X_submis[error_col], errors='coerce')
median = X_submis[error_col].sum()//len(X_submis[error_col])
X_submis = X_submis.fillna(median)

X_submis[cat_cols] = X_submis[cat_cols].apply(LabelEncoder().fit_transform)
X_submis = pd.get_dummies(X_submis, columns=cat_cols)
X_submis[num_cols] = scaler.fit_transform(X_submis[num_cols])

submission = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/submission.csv')

In [5]:
y = data[target_col]

features_list = list(data)
features_list.remove(target_col)

X = data[features_list]
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, 
                                                    train_size=0.9,
                                                    random_state=42)

X_submis = X_submis.to_numpy()

In [6]:
def get_unique_cat(df, cat_cols):
    
    global unique_cat
    unique_cat = [] #debag
    
    for col in cat_cols:
        unique_cat += [f'{col}_{item}' for item, cat in enumerate(list(df[col].unique()))]
    
    return unique_cat

In [7]:
def df_constructor(main_df, encode_arr, cols):
    global num_cols
    
    df_encoded = pd.DataFrame(encode_arr, columns=cols)
    
    main_df = main_df[no_cat].reset_index(drop=True)
    full_df = main_df.join(df_encoded)
    return full_df

In [8]:
C_l = [0.001, 0.01, 0.1, 1, 10, 30, 50, 100, 100]

param_grid = {'C': C_l,
             'solver':['lbfgs', 'sag', 'newton-cg','liblinear'],}

search = GridSearchCV(LogisticRegression(max_iter=5000), 
                      param_grid,
                      cv=10,
                      refit=True)

search.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=LogisticRegression(max_iter=5000),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 30, 50, 100, 100],
                         'solver': ['lbfgs', 'sag', 'newton-cg', 'liblinear']})

In [9]:
print("Tuned Hyperparameters :", search.best_params_)
print("Accuracy :",search.best_score_)

Tuned Hyperparameters : {'C': 30, 'solver': 'liblinear'}
Accuracy : 0.8073786364645791


In [10]:
model = LogisticRegression(C=100,solver='liblinear', max_iter=5000).fit(X_train, y_train)

In [11]:
preds_test = model.predict(X_test)
print(classification_report(y_test, preds_test)) # lol/ print и вывод консольный имеет разный способ вывода. забавно

|             |   precision |   recall |   f1-score |   support |
|:------------|------------:|---------:|-----------:|----------:|
| 0           |      0.8392 |   0.8875 |     0.8627 |  400.0000 |
| 1           |      0.5714 |   0.4688 |     0.5150 |  128.0000 |
| avg / total |      0.7743 |   0.7860 |     0.7784 |  528.0000 |


In [12]:
preds_submis = model.predict(X_submis)

In [13]:
new_df = pd.DataFrame(preds_submis)
print(preds_submis.shape)

(1761,)


In [14]:
submission[target_col] = new_df
os.chdir('/kaggle/working/')
submission.to_csv('submission_.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [15]:
# обдолбанные мысли
#модет сёрч не надо запускать для трейнинга?
#может не нужно готовить данные для сёрча, если он прогоняет их серез свои эстиматоры
#зачем тогда регрессия в сёрче
#