# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from matplotlib import ticker
import seaborn as sns
import warnings
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.metrics import brier_score_loss
from lightgbm import LGBMClassifier


# options
warnings.filterwarnings('ignore')

# Data

In [2]:
# All three competition files live in the same Kaggle input directory.
data_dir = '../input/devday22-competition-datascience'
train, test, submission = (
    pd.read_csv(f'{data_dir}/{file_stem}.csv')
    for file_stem in ('train', 'test', 'sample_submission')
)

# Exploratory Data Analysis (Basic EDA)

In [None]:
train.head()

Exploring data

In [None]:
# Show the value counts of each inspected column.
# A notebook cell renders only its last bare expression, so every result is
# passed to display() (provided by the IPython kernel) instead of being
# silently discarded.
# 'sequence1' was commented out in the original exploration:
#train['sequence1'].value_counts()
for column in ('sequence2', 'sequence3', 'mutation'):
    display(train[column].value_counts())
# NOTE(review): the two strings below look like sample sequence values - confirm.
#ATGCGCTTAATT
#GATCGTAGCTAA

The output of `.info()` shows that there are no null values.

In [None]:
train.info()

mutation is a binary target

In [None]:
train['mutation'].value_counts()

In [None]:
# A cell renders only its last bare expression, so the shape is displayed
# explicitly; describe() stays last and is rendered as the cell output.
display(train.shape)
train.describe()

In [None]:
test.head()

In [None]:
submission.head()


* Distribution of the continuous features in the training data.

In [None]:
# KDE plot of every numeric training feature, one panel per feature.
n_cols = 3
background_color = "#dcdada"

# Numeric (non-object) columns, excluding the identifier and the target.
features_num = [col for col in train.select_dtypes(exclude=object).columns if col not in ['ID', 'mutation']]

# Size the grid from the number of features (ceiling division) so that no
# feature is left without a panel, however many numeric columns there are.
n_rows = max(1, -(-len(features_num) // n_cols))

# The resolution is set on this figure only; changing plt.rcParams here would
# also apply 600 dpi to every figure created by later cells.
fig = plt.figure(figsize=(10, 10), facecolor='#f6f5f5', dpi=600)
gs = fig.add_gridspec(n_rows, n_cols)
gs.update(wspace=0.3, hspace=0.3)

# Create and style the panels, keeping them in a plain list.
axes = []
for row in range(n_rows):
    for col in range(n_cols):
        ax = fig.add_subplot(gs[row, col])
        ax.set_facecolor(background_color)
        for s in ["top", "right"]:
            ax.spines[s].set_visible(False)
        axes.append(ax)

# Draw one density curve per feature.
for ax, col in zip(axes, features_num):
    sns.kdeplot(ax=ax, x=train[col], zorder=2, alpha=1, linewidth=1, color='#FF355D')
    ax.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.set_ylabel('')
    ax.set_xlabel(col, fontsize=4, fontweight='bold')
    ax.tick_params(labelsize=4, width=0.5)
    ax.xaxis.offsetText.set_fontsize(4)
    ax.yaxis.offsetText.set_fontsize(4)

# Hide the trailing panels of the last row that received no feature.
for ax in axes[len(features_num):]:
    ax.set_visible(False)

plt.show()

# checking correlations

In [None]:
# Correlate the non-object columns only: object-typed columns cannot be
# correlated, and pandas >= 2.0 raises on them instead of dropping them silently.
corr_matrix = train.select_dtypes(exclude=object).corr()

f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, linewidths=.2, fmt='.1f', ax=ax)

Distribution of the target label in the training data.

In [None]:
# Count plot of the target classes.
plt.rcParams['figure.dpi'] = 100
background_color = "#dcdada"

# sns.catplot is figure-level and always creates its own figure, which left
# the styled figure created here blank. The axes-level countplot draws the
# same bars onto the figure we set up.
fig, ax = plt.subplots(figsize=(5, 5), facecolor='#f6f5f5')
ax.set_facecolor(background_color)
sns.countplot(x="mutation", data=train, ax=ax)
plt.show()

# Training

In [3]:
def encoder(x_train, x_test):
    """Label-encode a categorical column, fitting on train and applying to test.

    Parameters
    ----------
    x_train : array-like
        Training values; the encoder's classes are learned from these.
    x_test : array-like
        Values to encode with the classes learned from ``x_train``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_codes, test_codes)``. Values of ``x_test`` that never occur
        in ``x_train`` are encoded as ``-1`` instead of raising ``ValueError``
        as ``LabelEncoder.transform`` would.
    """
    le = LabelEncoder()
    train_codes = le.fit_transform(x_train)
    # classes_ is sorted and unique, and a label's code is its position in
    # classes_, so an index lookup reproduces le.transform for known labels
    # while returning -1 for labels that were not seen during fitting.
    test_codes = pd.Index(le.classes_).get_indexer(x_test)
    return train_codes, test_codes

In [4]:
# Model inputs: every column except the row identifier and the target.
features = [col for col in train.columns if col not in ['ID', 'mutation']]
# Object-typed inputs that need label encoding. Restricting the lookup to
# `features` guarantees that 'ID' and 'mutation' can never be listed here.
features_cat = train[features].select_dtypes(include=object).columns.tolist()
print(f'total features: {len(features)}')
print(features)

In [6]:
# Feature matrix and target vector.
# X is modified later (categorical encoding), so take an explicit copy: this
# keeps `train` untouched and avoids ambiguous view-vs-copy assignments.
X = train.loc[:, features].copy()
y = train.loc[:, 'mutation']
print(X.shape)
print(y.shape)

In [7]:
# encoding categorical features
# Assign with frame[col] = ... so the column is replaced and takes the integer
# dtype of the codes. `frame.loc[:, col] = codes` writes into the existing
# column on pandas >= 2.0, which leaves it object-typed and makes LightGBM
# reject the frame.
for col in features_cat:
    X[col], test[col] = encoder(X[col], test[col])

In [9]:
# X.head() is not the last expression of the cell, so it must be displayed
# explicitly; X.info() prints its own report.
display(X.head())
X.info()

In [10]:
# initializing classifier
# The commented-out lines are alternative models that were tried; their
# classes are not imported in the import cell of this notebook.
#clf = GaussianNB()
#clf=LogisticRegression()
#clf= XGBClassifier()
clf= LGBMClassifier()

# training on the full training set (all rows of X / y)
clf.fit(X, y)



# Submission and Prediction

* **predict_proba** is used instead of the typical **predict** method because the evaluation metric is the *area under the ROC curve*, which is computed from predicted scores rather than from hard class labels.

In [11]:
# FINAL PREDICTION: class probabilities for the test rows, keeping only the
# second probability column as the submitted score.
test_proba = clf.predict_proba(test[features])
pred = test_proba[:, 1]
pred


Checking the quality of the predicted probabilities on a hold-out split, using log loss and the Brier score:

In [12]:
# Hold-out evaluation: split the training data 75/25.
# A fixed random_state makes the split, and therefore the metrics compared
# across models further down, reproducible from run to run.
# NOTE: this refits `clf` in place on the 75% split, so the final test
# predictions must be computed before this cell is run.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)
clf.fit(train_X, train_y)


In [13]:
# for understanding purposes: predicted class probabilities for the hold-out rows
pred1 = clf.predict_proba(test_X)
pred1

In [14]:
# Hold-out metrics used to compare the different models:
# log loss on the full probability matrix, Brier score on the second column.
loss, brier = log_loss(test_y, pred1), brier_score_loss(test_y, pred1[:, 1])

# One value per line: log loss first, then Brier score.
print(loss, brier, sep='\n')
#gb_brier: 0.172
#gb_loss:1.335
#lr_brier: 0.130
#lr_loss: 0.419
#xgbc_brier: 0.112
#xgbc_loss: 0.3606
#rfc_loss: 0.380
#rfc_brier: 0.113
#lgbm_brier: 0.110
#lgbm_loss: 0.35
#after scaling
# 0.34996780340983297
# 0.10852253815587647


* When converting the DataFrame to CSV, **index=False** must be specified; otherwise the submission will fail.
* The following format is **absolutely necessary.**

In [15]:
# Put the predicted scores into the 'mutation' column of the sample
# submission (row order of `pred` is used as-is) and write the file without
# the DataFrame index.
submission = submission.assign(mutation=pred)
submission.to_csv('submission.csv', index=False)

In [16]:
submission.head()