<a href="https://colab.research.google.com/github/Mahnazshamissa/Python/blob/main/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, f1_score
# NOTE(review): plot_precision_recall_curve was removed in scikit-learn 1.2 and is
# not used in this notebook; on newer releases this import line fails.
from sklearn.metrics import roc_auc_score, precision_recall_curve, average_precision_score, plot_precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer
from sklearn.utils.class_weight import compute_class_weight

In [2]:
# Location of the Telco customer-churn CSV.
# NOTE(review): absolute '/content/...' path — presumably the Colab upload
# directory; adjust when running elsewhere.
abo_path = '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the raw table; nothing is cleaned or converted at this stage.
p_df_raw = pd.read_csv(
    filepath_or_buffer=abo_path,
    encoding="UTF-8",
    sep=',',
)

In [3]:
# Peek at the last rows of the raw frame to sanity-check the load.
p_df_raw.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [4]:
# Count NaN cells per column. This only detects real NaN values; blank or
# whitespace-only strings in text columns are not counted.
p_df_raw.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

# **Transform the data: encode every column as a number and scale the values of each column**

In [5]:
# --- Target -----------------------------------------------------------------
# Encode the 'Churn' labels as integers. The result is kept as a one-column
# DataFrame (column name 0) so that later cells see the same shape as before.
df_y = pd.DataFrame(LabelEncoder().fit_transform(p_df_raw['Churn']))

# --- Features ---------------------------------------------------------------
# Drop the identifier and the target; everything else is a candidate feature.
df_X_raw = p_df_raw.drop(['customerID', 'Churn'], axis=1)

# TotalCharges is a continuous amount. If pandas read it as text (e.g. because
# the CSV holds blank strings — NOTE(review): confirm against the raw file),
# label-encoding it would replace the amounts with lexicographic category
# codes. Coerce it to numeric first; unparseable entries become NaN and are
# filled with 0.0 (presumably customers who have not been billed yet — TODO
# confirm). For a column that is already numeric this is a no-op.
df_X_raw['TotalCharges'] = pd.to_numeric(
    df_X_raw['TotalCharges'], errors='coerce'
).fillna(0.0)

# Remember the feature names; the scaling step rebuilds a DataFrame from a
# bare array and needs them.
columns = df_X_raw.columns

# Integer-encode every remaining non-numeric column. A fresh encoder is used
# per column, so the codes are only meaningful within their own column.
for f in df_X_raw.columns:
    if not pd.api.types.is_numeric_dtype(df_X_raw[f]):
        df_X_raw[f] = LabelEncoder().fit_transform(df_X_raw[f])

In [6]:
# Apply a power transform (PowerTransformer with its default settings) to
# every feature column, including the integer-encoded categorical ones.
# NOTE(review): the transformer is fitted on all rows before the train/test
# split performed in a later cell, so test rows influence the fitted
# parameters. Consider fitting on the training rows only.
scaler = PowerTransformer()
scaled_df = scaler.fit_transform(df_X_raw)

# fit_transform returns a bare array; restore the column labels and a clean
# 0..n-1 index.
df_X = pd.DataFrame(data=scaled_df, columns=columns).reset_index(drop=True)

In [7]:
# Distinct target classes, sorted (np.unique flattens the one-column frame).
class_labels = np.unique(df_y)

# 'balanced' weights are inversely proportional to class frequency.
# The arguments are passed by keyword because recent scikit-learn releases
# reject them positionally, and the target is flattened with np.ravel so a
# one-column DataFrame no longer triggers the column_or_1d conversion warning.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=np.ravel(df_y),
)

# Map each class label to its weight, e.g. {0: w0, 1: w1}.
class_weights = dict(zip(class_labels, class_weights))

  y = column_or_1d(y, warn=True)


In [8]:
# Show the label -> weight mapping computed in the previous cell.
# NOTE(review): these weights are not passed to LightGBM anywhere in this notebook.
class_weights

{0: 0.6806146115191342, 1: 1.8841626538255751}

In [9]:
# Verify that encoding and scaling did not introduce NaN values in any feature.
df_X.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64

In [10]:
# Hold out a test set (default test share of train_test_split).
# random_state pins the shuffle so a Restart & Run All reproduces the same
# split and therefore the same metrics; stratify keeps the churn / no-churn
# ratio identical in both parts, which matters because the classes are
# imbalanced (see the class weights above).
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    random_state=0,
    stratify=df_y,
)

In [11]:
# Columns that are categorical in the raw data: every feature except the
# numeric 'tenure', 'MonthlyCharges' and 'TotalCharges'.
# Fixes: 'Dependent' -> 'Dependents' (actual column name) and adds the
# previously missing 'OnlineBackup'.
categorical_feature = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# The list is intentionally NOT passed to the Dataset: at this point the
# features are power-transformed floats rather than integer category codes,
# so they are treated as ordinary numeric columns.
# NOTE(review): to use LightGBM's native categorical handling, skip the power
# transform for these columns and pass categorical_feature=categorical_feature.
train_data = lgb.Dataset(X_train, label=y_train)

# **LightGBM parameters**

In [12]:
# Training parameters shared by lgb.cv and lgb.train.
params = {
    # Binary classification. Without an explicit objective LightGBM falls back
    # to regression, and the predictions thresholded at 0.5 further down would
    # not be probabilities.
    'objective': 'binary',
    'learning_rate': 0.1,       # shrinkage rate
    'metric': 'auc',            # evaluation metric reported during CV
    'sub_feature': 0.50,        # alias of feature_fraction: share of columns sampled
    'bagging_fraction': 0.85,   # share of rows sampled when bagging is performed
    'bagging_freq': 40,         # perform bagging every 40 iterations
    'verbose': 0,
}

# **Training with cross-validation (CV)**

In [None]:
# 3-fold stratified cross-validation over 100 boosting rounds.
#
# Only arguments that differ from the defaults, or that document intent, are
# passed. The former fobj / verbose_eval / show_stdv keyword arguments were
# removed from lgb.cv in LightGBM 4.0 and raise a TypeError there; periodic
# logging is now requested through the log_evaluation callback (available
# from LightGBM 3.3), which prints the mean and standard deviation of the
# metric every 20 rounds.
eval_hist = lgb.cv(
    params=params,
    train_set=train_data,
    num_boost_round=100,
    nfold=3,
    stratified=True,   # keep the class ratio in every fold
    shuffle=True,
    seed=0,            # reproducible fold assignment
    callbacks=[lgb.log_evaluation(period=20)],
)

In [None]:
# Display results
print('Current parameters:\n', params)
print('\nBest num_boost_round:', len(eval_hist['auc-mean']))
print('Best CV score:', eval_hist['auc-mean'][-1])

In [None]:
# Fit the final booster on the full training set for 100 boosting rounds,
# using the same parameters as the cross-validation run.
bst = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=100,
)

# **Predicting on test data**

In [None]:
# Score the held-out rows with the trained booster (one value per test row).
clf_test_pred = bst.predict(data=X_test)

# **Convert predicted probabilities to binary classes**

In [None]:
# Hard class labels: 1 where the predicted score is strictly above 0.5, else 0.
clf_test_pred_class = (clf_test_pred > 0.5).astype(int)

In [None]:
# AUROC is a ranking metric and must be computed from the continuous scores.
# Feeding it the thresholded 0/1 labels (as before) evaluates a single
# operating point and understates the model's ranking quality.
auc_test = roc_auc_score(y_test, clf_test_pred)

# F1 is defined on hard labels, so it uses the thresholded predictions.
# Expressed as a percentage; rounding after scaling avoids float artefacts
# such as 57.142857000000006.
test_f1 = round(f1_score(y_test, clf_test_pred_class) * 100, 6)

print('\nAUROC:', auc_test, ", F1 score:", test_f1)

In [None]:
# Build the ROC curve from the continuous scores so that every distinct score
# contributes a threshold. With the thresholded 0/1 labels (as before) the
# curve degenerates to three points.
fpr, tpr, threshold = roc_curve(y_test, clf_test_pred)

In [None]:
# ROC curve of the test-set predictions against the chance diagonal.
# Drawn through the explicit Axes object (previously created but unused) and
# labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot([0, 1], [0, 1], 'k--', label='Chance')
ax.plot(fpr, tpr, label='LightGBM')
ax.set(
    xlabel='False positive rate',
    ylabel='True positive rate',
    title='ROC curve (test set)',
)
ax.legend(loc='lower right');

In [None]:
# Summary of the thresholded test-set predictions.
print('Accuracy')
print(accuracy_score(y_test, clf_test_pred_class))

# Rows are true classes, columns are predicted classes.
print('Confusion Matrix')
print(confusion_matrix(y_test, clf_test_pred_class))

# Per-class precision / recall / F1 (heading previously read 'Confusion Report').
print('Classification Report')
print(classification_report(y_test, clf_test_pred_class))

# **Repeat the above stats for test data**

# **Saving/loading the model**

In [None]:
# Persist the trained booster. The model object in this notebook is `bst`
# (there is no `clf`), and the file name must match the one read back by the
# loading cell ('Lgbm_F1.pkl', without a trailing underscore).
joblib.dump(bst, 'Lgbm_F1.pkl')

In [None]:
# Restore the persisted model from disk.
# NOTE(review): joblib.load unpickles the file — only load files you trust.
clf_pickle = joblib.load(filename='Lgbm_F1.pkl')