In [3]:
import os
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import OneHotEncoder

In [4]:
# Числовые признаки
num_cols = [
    'ClientPeriod',
    'MonthlySpending',
    'TotalSpent'
]

# Категориальные признаки
cat_cols = [
    'Sex',
    'IsSeniorCitizen',
    'HasPartner',
    'HasChild',
    'HasPhoneService',
    'HasMultiplePhoneNumbers',
    'HasInternetService',
    'HasOnlineSecurityService',
    'HasOnlineBackup',
    'HasDeviceProtection',
    'HasTechSupportAccess',
    'HasOnlineTV',
    'HasMovieSubscription',
    'HasContractPhone',
    'IsBillingPaperless',
    'PaymentMethod'
]

feature_cols = num_cols + cat_cols
col_count = len(feature_cols)
target_col = 'Churn'
error_col = 'TotalSpent'

In [26]:
# train set download
data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
data = data.dropna()

# похоже в этом столбце некорректный формат данных
data[error_col] =  pd.to_numeric(data[error_col], errors='coerce')
data = data.dropna()

# numeric data settings
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

# categorical data settings
data[cat_cols] = data[cat_cols].apply(LabelEncoder().fit_transform)
data = pd.get_dummies(data, columns=cat_cols)

# test set download
X_submis = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/test.csv')
X_submis[error_col] =  pd.to_numeric(X_submis[error_col], errors='coerce')
median = X_submis[error_col].sum()//len(X_submis[error_col])
X_submis = X_submis.fillna(median)

X_submis[cat_cols] = X_submis[cat_cols].apply(LabelEncoder().fit_transform)
X_submis = pd.get_dummies(X_submis, columns=cat_cols)
X_submis[num_cols] = scaler.fit_transform(X_submis[num_cols])

submission = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/submission.csv')

In [27]:
data

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Churn,Sex_0,Sex_1,IsSeniorCitizen_0,IsSeniorCitizen_1,HasPartner_0,HasPartner_1,...,HasMovieSubscription_2,HasContractPhone_0,HasContractPhone_1,HasContractPhone_2,IsBillingPaperless_0,IsBillingPaperless_1,PaymentMethod_0,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3
0,0.919099,-1.506436,-0.557582,0,0,1,1,0,0,1,...,0,0,1,0,1,0,0,0,0,1
1,1.612060,-1.295997,-0.184763,0,0,1,1,0,0,1,...,0,0,0,1,1,0,0,1,0,0
2,-1.282072,0.362658,-0.976504,1,0,1,1,0,1,0,...,0,1,0,0,0,1,0,0,1,0
3,-0.018437,0.475334,0.122800,0,1,0,0,1,0,1,...,0,1,0,0,1,0,0,0,0,1
4,1.122911,1.666716,1.968909,0,1,0,1,0,0,1,...,1,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,-1.200547,-1.145210,-0.973749,0,0,1,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
5278,0.715287,-0.679594,-0.045372,0,0,1,1,0,0,1,...,0,0,1,0,0,1,1,0,0,0
5279,-1.282072,-0.328310,-0.985693,0,0,1,1,0,1,0,...,0,1,0,0,1,0,0,0,0,1
5280,-0.140724,0.365972,-0.033560,0,1,0,1,0,1,0,...,0,1,0,0,0,1,0,1,0,0


In [63]:
# DEBAG CELL

# train set download
data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
data = data.dropna()

In [64]:
# DEBAG CELL

data[cat_cols]

Unnamed: 0,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod
0,Male,0,Yes,Yes,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check
1,Male,0,Yes,No,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic)
2,Male,0,No,No,Yes,No,Fiber optic,No,No,No,Yes,No,No,Month-to-month,Yes,Electronic check
3,Female,1,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Mailed check
4,Female,0,Yes,Yes,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,Male,0,No,No,No,No phone service,DSL,No,No,No,Yes,No,No,Month-to-month,No,Electronic check
5278,Male,0,Yes,No,No,No phone service,DSL,Yes,No,No,Yes,Yes,No,One year,Yes,Bank transfer (automatic)
5279,Male,0,No,No,Yes,No,DSL,No,No,Yes,Yes,No,No,Month-to-month,No,Mailed check
5280,Female,0,No,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Credit card (automatic)


In [42]:
def get_unique_cat(df, cat_cols):
    
    unique_cat = []
    for col in cat_cols:
        unique_cat += [f'{col}_{item}' for item, cat in enumerate(list(df[col].unique()))]
    
    return unique_cat
# len(list(df[col].unique()))

In [56]:
def df_constructor(main_df, encode_arr, cols):
    global num_cols
    df_encoded = pd.DataFrame(encode_arr, columns=cols)
    
    full_df = main_df[num_cols].join(df_encoded)
    return full_df

In [65]:
# DEBAG CELL

# train set download
data = pd.read_csv('/kaggle/input/advanced-dls-spring-2021/train.csv')
data = data.dropna()

# похоже в этом столбце некорректный формат данных
data[error_col] =  pd.to_numeric(data[error_col], errors='coerce')
data = data.dropna()

# numeric data settings
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

# categorical data settings
unique_cat_arr = get_unique_cat(data[cat_cols], cat_cols)
encoder = OneHotEncoder(sparse=False, handle_unknown = 'ignore')
data[cat_cols] = data[cat_cols].apply(LabelEncoder().fit_transform)
data


Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
0,0.919099,-1.506436,-0.557582,1,0,1,1,1,0,2,1,1,1,1,1,1,1,0,3,0
1,1.612060,-1.295997,-0.184763,1,0,1,0,1,2,2,1,1,1,1,1,1,2,0,1,0
2,-1.282072,0.362658,-0.976504,1,0,0,0,1,0,1,0,0,0,2,0,0,0,1,2,1
3,-0.018437,0.475334,0.122800,0,1,1,0,1,2,1,0,0,2,0,0,0,0,0,3,0
4,1.122911,1.666716,1.968909,0,0,1,1,1,2,1,2,2,2,2,2,2,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,-1.200547,-1.145210,-0.973749,1,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,0
5278,0.715287,-0.679594,-0.045372,1,0,1,0,0,1,0,2,0,0,2,2,0,1,1,0,0
5279,-1.282072,-0.328310,-0.985693,1,0,0,0,1,0,0,0,0,2,2,0,0,0,0,3,0
5280,-0.140724,0.365972,-0.033560,0,0,0,0,1,2,1,0,0,0,0,0,0,0,1,1,0


In [68]:
encoded_arr = encoder.fit_transform(data[cat_cols])
encoded_arr.shape

(43,)

In [58]:
# DEBAG CELL

temp = df_constructor(data, encoded_arr, unique_cat_arr)

In [59]:
temp.columns

Index(['ClientPeriod', 'MonthlySpending', 'TotalSpent', 'Sex_0', 'Sex_1',
       'IsSeniorCitizen_0', 'IsSeniorCitizen_1', 'HasPartner_0',
       'HasPartner_1', 'HasChild_0', 'HasChild_1', 'HasPhoneService_0',
       'HasPhoneService_1', 'HasMultiplePhoneNumbers_0',
       'HasMultiplePhoneNumbers_1', 'HasMultiplePhoneNumbers_2',
       'HasInternetService_0', 'HasInternetService_1', 'HasInternetService_2',
       'HasOnlineSecurityService_0', 'HasOnlineSecurityService_1',
       'HasOnlineSecurityService_2', 'HasOnlineBackup_0', 'HasOnlineBackup_1',
       'HasOnlineBackup_2', 'HasDeviceProtection_0', 'HasDeviceProtection_1',
       'HasDeviceProtection_2', 'HasTechSupportAccess_0',
       'HasTechSupportAccess_1', 'HasTechSupportAccess_2', 'HasOnlineTV_0',
       'HasOnlineTV_1', 'HasOnlineTV_2', 'HasMovieSubscription_0',
       'HasMovieSubscription_1', 'HasMovieSubscription_2',
       'HasContractPhone_0', 'HasContractPhone_1', 'HasContractPhone_2',
       'IsBillingPaperless_0', 

In [60]:
temp

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex_0,Sex_1,IsSeniorCitizen_0,IsSeniorCitizen_1,HasPartner_0,HasPartner_1,HasChild_0,...,HasMovieSubscription_2,HasContractPhone_0,HasContractPhone_1,HasContractPhone_2,IsBillingPaperless_0,IsBillingPaperless_1,PaymentMethod_0,PaymentMethod_1,PaymentMethod_2,PaymentMethod_3
0,0.919099,-1.506436,-0.557582,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.612060,-1.295997,-0.184763,0.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,-1.282072,0.362658,-0.976504,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.018437,0.475334,0.122800,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1.122911,1.666716,1.968909,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,-1.200547,-1.145210,-0.973749,,,,,,,,...,,,,,,,,,,
5278,0.715287,-0.679594,-0.045372,,,,,,,,...,,,,,,,,,,
5279,-1.282072,-0.328310,-0.985693,,,,,,,,...,,,,,,,,,,
5280,-0.140724,0.365972,-0.033560,,,,,,,,...,,,,,,,,,,


In [25]:
# DEBAG CELL

data

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,Sex,IsSeniorCitizen,HasPartner,HasChild,HasPhoneService,HasMultiplePhoneNumbers,HasInternetService,HasOnlineSecurityService,HasOnlineBackup,HasDeviceProtection,HasTechSupportAccess,HasOnlineTV,HasMovieSubscription,HasContractPhone,IsBillingPaperless,PaymentMethod,Churn
0,0.919099,-1.506436,-0.557582,1,0,1,1,1,0,2,1,1,1,1,1,1,1,0,3,0
1,1.612060,-1.295997,-0.184763,1,0,1,0,1,2,2,1,1,1,1,1,1,2,0,1,0
2,-1.282072,0.362658,-0.976504,1,0,0,0,1,0,1,0,0,0,2,0,0,0,1,2,1
3,-0.018437,0.475334,0.122800,0,1,1,0,1,2,1,0,0,2,0,0,0,0,0,3,0
4,1.122911,1.666716,1.968909,0,0,1,1,1,2,1,2,2,2,2,2,2,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,-1.200547,-1.145210,-0.973749,1,0,0,0,0,1,0,0,0,0,2,0,0,0,0,2,0
5278,0.715287,-0.679594,-0.045372,1,0,1,0,0,1,0,2,0,0,2,2,0,1,1,0,0
5279,-1.282072,-0.328310,-0.985693,1,0,0,0,1,0,0,0,0,2,2,0,0,0,0,3,0
5280,-0.140724,0.365972,-0.033560,0,0,0,0,1,2,1,0,0,0,0,0,0,0,1,1,0


In [None]:
y = data[target_col]

features_list = list(data)
features_list.remove(target_col)

X = data[features_list]
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, 
                                                    train_size=0.8,
                                                    random_state=42)

X_submis = X_submis.to_numpy()

In [None]:
C_l = [100, 10, 1, 0.1, 0.01, 0.001]

param_grid = {'C': [100, 10, 1, 0.1, 0.01]}

search = GridSearchCV(LogisticRegression(), 
                      param_grid,
                      cv=10,
                      refit=True)

pipe = make_pipeline(
    # OneHotEncoder(handle_unknown='ignore'),
    StandardScaler(),
    search
)
pipe.fit(X_train, y_train)

In [None]:
preds_train = pipe.predict(X_test)

In [None]:
print("Tuned Hyperparameters :", search.best_params_)
print("Accuracy :",search.best_score_)

In [None]:
preds_test = pipe.predict(X_submis)

In [None]:
new_df = pd.DataFrame(preds_test)
print(preds_test.shape)

In [None]:
# submission['#']

In [None]:
submission[target_col] = new_df
os.chdir('/kaggle/working/')
submission.to_csv('submission_.csv', index=False)
print("Your submission was successfully saved!")