In [1]:
import os
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#for classification_report
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import precision_recall_fscore_support
from tabulate import tabulate

In [2]:
def classification_report(y_true, y_pred, labels=None, 
                          target_names=['0', '1'],
                          sample_weight=None, digits=4, tablfmt='pipe'):
    
    floatfmt = '.{:}f'.format(digits)
    if labels is None:
        labels = unique_labels(y_true, y_pred)
    else:
        labels = np.asarray(labels)

    if target_names is not None and len(labels) != len(target_names):
        print(
            "labels size, {0}, does not match size of target_names, {1}"
            .format(len(labels), len(target_names))
        )

    last_line_heading = 'avg / total'

    if target_names is None:
        target_names = [u'%s' % l for l in labels]

    headers = ["precision", "recall", "f1-score", "support"]

    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  labels=labels,
                                                  average=None,
                                                  sample_weight=sample_weight)

    rows = zip(target_names, p, r, f1, s)
    tbl_rows = []
    for row in rows:
        tbl_rows.append(row)

    # compute averages
    last_row = (last_line_heading,
                np.average(p, weights=s),
                np.average(r, weights=s),
                np.average(f1, weights=s),
                np.sum(s))
    tbl_rows.append(last_row)
    return tabulate(tbl_rows, headers=headers,
                    tablefmt=tablfmt, floatfmt=floatfmt)

In [3]:
# Числовые признаки
num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

# Категориальные признаки
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

no_cat = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent',
    'Churn'
]

feature_cols = num_cols + cat_cols
col_count = len(feature_cols)
unique_cat = []
target_col = 'Churn'
error_col = 'TotalSpent'

In [4]:
# GET_DUMMIES VERSION

# train set download
data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
data = data.dropna()

# похоже в этом столбце некорректный формат данных
data[error_col] =  pd.to_numeric(data[error_col], errors='coerce')
data = data.dropna()

# # numeric data settings
# scaler = StandardScaler()
# data[num_cols] = scaler.fit_transform(data[num_cols])

# # categorical data settings
# data[cat_cols] = data[cat_cols].apply(LabelEncoder().fit_transform)
# data = pd.get_dummies(data, columns=cat_cols)

# # test set download
# X_submis = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/test.csv')
# X_submis[error_col] =  pd.to_numeric(X_submis[error_col], errors='coerce')
# median = X_submis[error_col].sum()//len(X_submis[error_col])
# X_submis = X_submis.fillna(median)

# X_submis[cat_cols] = X_submis[cat_cols].apply(LabelEncoder().fit_transform)
# X_submis = pd.get_dummies(X_submis, columns=cat_cols)
# X_submis[num_cols] = scaler.fit_transform(X_submis[num_cols])

# submission = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/submission.csv')

In [5]:
y = data[target_col]

features_list = list(data)
features_list.remove(target_col)

X = data[features_list]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.9,
                                                    random_state=42)

# X_submis = X_submis.to_numpy()

In [6]:
# # DEBAG CELL

# # train set download
# data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
# data = data.dropna()

In [7]:
# data

In [8]:
# # DEBAG CELL

# data[cat_cols]

In [9]:
def get_unique_cat(df, cat_cols):
    
    global unique_cat
    unique_cat = [] #debag
    
    for col in cat_cols:
        unique_cat += [f'{col}_{item}' for item, cat in enumerate(list(df[col].unique()))]
    
    return unique_cat
# len(list(df[col].unique()))

In [10]:
def df_constructor(main_df, encode_arr, cols):
    global num_cols
    
    df_encoded = pd.DataFrame(encode_arr, columns=cols)
    
    main_df = main_df[no_cat].reset_index(drop=True)
    full_df = main_df.join(df_encoded)
    return full_df

In [11]:
# ONE_HOT_ENCODE + INDEX UPDATE VERSION

# # train set download
# data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')

# # похоже в этом столбце некорректный формат данных
# data[error_col] =  pd.to_numeric(data[error_col], errors='coerce')
# data = data.dropna()

# # numeric data settings
# scaler = StandardScaler()
# data[num_cols] = scaler.fit_transform(data[num_cols])

# # categorical data settings
# unique_cat_arr = get_unique_cat(data[cat_cols], cat_cols)
# encoder = OneHotEncoder(sparse=False, handle_unknown = 'ignore')
# data[cat_cols] = data[cat_cols].apply(LabelEncoder().fit_transform)
# data = data.dropna()

# encoded_arr = encoder.fit_transform(data[cat_cols])
# coded_data = df_constructor(data, encoded_arr, unique_cat_arr)

# # test set download
# X_submis = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/test.csv')
# X_submis[error_col] =  pd.to_numeric(X_submis[error_col], errors='coerce')
# median = X_submis[error_col].sum()//len(X_submis[error_col])
# X_submis = X_submis.fillna(median)

# X_submis[cat_cols] = X_submis[cat_cols].apply(LabelEncoder().fit_transform)
# X_submis = pd.get_dummies(X_submis, columns=cat_cols)
# X_submis[num_cols] = scaler.fit_transform(X_submis[num_cols])

# submission = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/submission.csv')

In [12]:
# DEBAG CELL

# coded_data

In [13]:
# y = coded_data[target_col]

# features_list = list(coded_data)
# features_list.remove(target_col)

# X = coded_data[features_list]
# X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, 
#                                                     train_size=0.8,
#                                                     random_state=42)

# X_submis = X_submis.to_numpy()

In [14]:
# # IT WORK BUT NOT HANDSOM

# C_l = [0.001, 0.01, 0.1, 1, 10, 30, 50, 100, 100]

# param_grid = {'C': C_l,
#              'solver':['lbfgs', 'sag', 'newton-cg','liblinear'],}

# search = GridSearchCV(LogisticRegression(max_iter=5000), 
#                       param_grid,
#                       cv=10,
#                       refit=True)

# # pipe = make_pipeline(
# #     StandardScaler(),
# #     OneHotEncoder(handle_unknown='ignore'),
# #     search
# # )
# search.fit(X_train, y_train)

In [15]:
# print("Tuned Hyperparameters :", search.best_params_)
# print("Accuracy :",search.best_score_)

In [16]:
# model = LogisticRegression(C=100,solver='liblinear', max_iter=5000).fit(X_train, y_train)

In [17]:
# preds_test = model.predict(X_test)
# print(classification_report(y_test, preds_test)) # lol/ print и вывод консольный имеет разный способ вывода. забавно

In [18]:
# preds_submis = model.predict(X_submis)

In [19]:
# new_df = pd.DataFrame(preds_submis)
# print(preds_submis.shape)

In [20]:
# submission[target_col] = new_df
# os.chdir('/kaggle/working/')
# submission.to_csv('submission_.csv', index=False)
# print("Your submission was successfully saved!")

In [21]:
# print(type(np.array([100., 10., 1., 0.1, 0.01, 0.001])))

In [22]:
# обдолбанные мысли
#модет сёрч не надо запускать для трейнинга?
#может не нужно готовить данные для сёрча, если он прогоняет их серез свои эстиматоры
#зачем тогда регрессия в сёрче
#

In [23]:
# X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)


In [24]:
# DEBAG CELL

C_l = (100, 10, 1, 0.1, 0.01, 0.001)

# param_grid = {'C': C_l}

param_grid = {
    'logreg__class_weight':[None, 'balanced'],
    'logreg__C':C_l,
}
# print(type(param_grid['logreg__C']))

num_features = num_cols

# preprocessing pipline for numeric data
# нормализация численных данных
num_transformer = Pipeline(steps=[
    ('numerical', StandardScaler())])

# кодировная каьегор жанных
categorical_features = unique_cat

# preprocessing pipline for categorical data
categorical_transformer = Pipeline(steps=[
    # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoding', LabelEncoder()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

# использует трансвормеры на отдельных колонках
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', categorical_transformer, categorical_features)])


pipe =  Pipeline(steps=[('preprocessor', preprocessor),
                   ("logreg", LogisticRegression())
                   ])

pipe.fit(X_train, y_train)

# X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)

# search = GridSearchCV(pipe, 
#                       param_grid,
#                       scoring='roc_auc',
#                       cv=10,
#                       refit=True)

# # search.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('numerical',
                                                                   StandardScaler())]),
                                                  ['ClientPeriod',
                                                   'MonthlySpending',
                                                   'TotalSpent']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoding',
                                                                   LabelEncoder()),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  [])])),
                ('logreg', LogisticRegression())])

In [25]:
print("model score: %.3f" % pipe.score(X_test, y_test))

model score: 0.771


In [26]:
# test set download
X_submis = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/test.csv')
X_submis[error_col] =  pd.to_numeric(X_submis[error_col], errors='coerce')
median = X_submis[error_col].sum()//len(X_submis[error_col])
X_submis = X_submis.fillna(median)

submission = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/submission.csv')

In [27]:
preds_train = pipe.predict(X_submis)

In [28]:
new_df = pd.DataFrame(preds_train)
print(preds_train.shape)

(1761,)


In [29]:
submission[target_col] = new_df
os.chdir('/kaggle/working/')
submission.to_csv('submission_.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [30]:
# pipe.get_params().keys()

In [31]:
# preds_train = pipe.predict(X_test)