In [1]:
import os
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OneHotEncoder

In [2]:
# Numeric features
num_cols = ['ClientPeriod', 'MonthlySpending', 'TotalSpent']

# Categorical features
cat_cols = [
    'Sex', 'IsSeniorCitizen', 'HasPartner', 'HasChild',
    'HasPhoneService', 'HasMultiplePhoneNumbers',
    'HasInternetService', 'HasOnlineSecurityService',
    'HasOnlineBackup', 'HasDeviceProtection', 'HasTechSupportAccess',
    'HasOnlineTV', 'HasMovieSubscription',
    'HasContractPhone', 'IsBillingPaperless', 'PaymentMethod',
]

# All model inputs: numeric columns first, then categorical ones.
feature_cols = [*num_cols, *cat_cols]
col_count = len(feature_cols)

# Name of the label column.
target_col = 'Churn'
# Column that is converted with pd.to_numeric(errors='coerce') when the data is loaded.
error_col = 'TotalSpent'

In [3]:
# Load the training set and drop rows that already contain missing values.
data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
data = data.dropna()

# This column appears to be stored in an incorrect format: coerce it to
# numbers (unparseable entries become NaN) and drop the rows that failed.
data[error_col] = pd.to_numeric(data[error_col], errors='coerce')
data = data.dropna()

# Medians of the unscaled training features. They are reused below so that
# gaps in the submission set are imputed from training statistics only.
num_medians = data[num_cols].median()

# Standardise the numeric features; the scaler is fitted on training data only.
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

# One-hot encode the categorical features straight from their raw values, so
# the dummy column names carry the category and can be matched across frames.
data = pd.get_dummies(data, columns=cat_cols)

# Load the submission set and repeat the same preparation.
X_submis = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/test.csv')
X_submis[error_col] = pd.to_numeric(X_submis[error_col], errors='coerce')
X_submis[num_cols] = X_submis[num_cols].fillna(num_medians)

# Apply the scaler fitted on the training data. Re-fitting it here would put
# the submission features on a different scale from the one the model saw.
X_submis[num_cols] = scaler.transform(X_submis[num_cols])

# Encode, then force exactly the training feature layout: a category missing
# from the submission set becomes an all-zero column, a category unseen in
# training is dropped, and the column order matches the training frame.
X_submis = pd.get_dummies(X_submis, columns=cat_cols)
X_submis = X_submis.reindex(columns=data.columns.drop(target_col), fill_value=0)

# Template of the submission file.
submission = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/submission.csv')

In [4]:
# Label vector.
y = data[target_col]

# Every other column of the prepared frame is used as a model input.
features_list = list(data.columns)
features_list.remove(target_col)
X = data[features_list]

# Hold out 20% of the labelled rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    train_size=0.8,
    random_state=42,
)

# Convert the submission features to a plain array as well.
# Gotcha: this rebinding makes the cell non-idempotent; running it a second
# time fails because an ndarray has no to_numpy() method.
X_submis = X_submis.to_numpy()

In [5]:
# Wider list of candidate C values; it is not referenced by param_grid below.
C_l = [100, 10, 1, 0.1, 0.01, 0.001]

# Values of C actually explored by the grid search.
param_grid = dict(C=[100, 10, 1, 0.1, 0.01])

# 10-fold cross-validated search; refit=True retrains the best model on all
# data passed to fit().
search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=10,
    refit=True,
)

# Standardise every input column, then run the search on the scaled matrix.
pipe = make_pipeline(StandardScaler(), search)
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gridsearchcv',
                 GridSearchCV(cv=10, estimator=LogisticRegression(),
                              param_grid={'C': [100, 10, 1, 0.1, 0.01]}))])

In [6]:
# Predict on the held-out split produced by train_test_split.
# Despite its name, preds_train holds predictions for X_test, not for X_train.
preds_train = pipe.predict(X_test)

In [7]:
# Report the selected grid point and its mean cross-validated score.
for label, value in (
    ("Tuned Hyperparameters :", search.best_params_),
    ("Accuracy :", search.best_score_),
):
    print(label, value)

Tuned Hyperparameters : {'C': 100}
Accuracy : 0.8044027422859138


In [8]:
# Predict for the submission feature matrix with the fitted pipeline.
# NOTE(review): predict() is used here; roc_auc_score is imported but never
# called. If the competition is scored by ROC-AUC, predict_proba(...)[:, 1]
# may be what is wanted - confirm against the competition rules.
preds_test = pipe.predict(X_submis)

In [9]:
# Show how many predictions were produced, then wrap them in a DataFrame.
print(preds_test.shape)
new_df = pd.DataFrame(data=preds_test)

(1761,)


In [10]:
# submission['#']

In [11]:
# Store the predictions in the submission frame under the target column name.
# NOTE(review): new_df is a DataFrame, so this assignment presumably aligns on
# the row index rather than by position - confirm both frames share an index.
submission[target_col] = new_df
# Switch to the Kaggle working directory so the relative file name below is
# written there; this changes the process-wide current directory.
os.chdir('/kaggle/working/')
submission.to_csv('submission_.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
