<a href="https://colab.research.google.com/github/Hani1-2/DeepLearningAssignmnt/blob/master/alpha404_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Statement 

A mutation is a change in a small section of a genome's nucleotide sequence. Many mutations are point mutations, in which one nucleotide is replaced by another, while others involve the insertion or deletion of one or more nucleotides. Mutations are caused by errors in DNA replication or the adverse impacts of mutagens, such as chemicals and radiation, which react with DNA and alter the structures of individual nucleotides.
Your task is to predict the probability of a genome sequence undergoing mutation, given certain environmental conditions, and a genome sequence divided into 9 sub-sequences.

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from matplotlib import ticker
import seaborn as sns
import warnings

# options
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Loading the Dataset

In [None]:
# All three competition files live in the same input folder; keep the folder in
# one constant so the location only has to be changed in a single place.
DATA_DIR = '../input/devday22-competition-datascience'

train = pd.read_csv(f'{DATA_DIR}/train.csv')                    # training rows (includes the target)
test = pd.read_csv(f'{DATA_DIR}/test.csv')                      # rows scored for the submission
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')   # submission template

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# List the column names of the training frame.
train.columns

## Columns
* `ID` – a unique identifier for each sample.
* `sequence1` … `sequence9` – the nine sub-sequences of a larger genome sequence.
* `A` … `U` – environmental values: various measurements taken under certain conditions.
* `mutation` – binary target label.

In [None]:
# Report the size of both frames; shapes are unpacked first so the templates stay readable.
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape

print(f"""
Training Data
* {train_rows} Rows 
* {train_cols} Columns 
""")

print(f"""
Testing Data
* {test_rows} Rows 
* {test_cols} Columns 
""")

In [None]:
# Deleting ID Column as it is not going to be used in training.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
train = train.drop(columns=['ID'], errors='ignore')
test = test.drop(columns=['ID'], errors='ignore')

In [None]:
# Summary statistics of the training frame (pandas' default column selection).
train.describe()

In [None]:
# Same summary, restricted to the object-dtype (string) columns.
train.describe(include= "object")

In [None]:
# Numerical Features: every column whose dtype is not object.

num_features = [name for name, dtype in train.dtypes.items() if dtype != "O"]
print("Here is the Listed Numerical features {} ".format(num_features))

In [None]:
# Categorical Features: every column stored with object dtype.

cat_features = [name for name, dtype in train.dtypes.items() if dtype == "O"]
print("Here is the Listed Categorical features {} ".format(cat_features))

In [None]:
# How many columns fall into each group.
n_categorical = len(cat_features)
n_numerical = len(num_features)
print('There are {} categorical variables\n'.format(n_categorical))
print('There are {} numerical variables\n'.format(n_numerical))

In [None]:
# check for cardinality in categorical variables
# (dropna=False counts a missing value as its own level, like len(unique()) does)

for column in cat_features:
    n_levels = train[column].nunique(dropna=False)
    print("The Cardinality of column {} is : {} ".format(column, n_levels))

In [None]:
# check missing values in categorical variables:
# number of null entries per categorical column
train[cat_features].isnull().sum()

In [None]:
# One count plot per categorical column, laid out on a 3-column grid.
plt.figure(figsize=(15,10))
n_cols = 3
# Ceiling division so the grid always has room for every categorical column.
n_rows = max(1, -(-len(cat_features) // n_cols))
for c, i in enumerate(cat_features, start=1):
    # Select the panel *before* drawing and hand it to seaborn explicitly;
    # drawing first puts each chart on the previously active axes.
    ax = plt.subplot(n_rows, n_cols, c)
    # Pass the column as x= : newer seaborn treats the first positional
    # argument as `data`, not as the variable to count.
    sns.countplot(x=train[i], ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Level counts of sequence1 as a bar chart. The column is passed as x= because
# newer seaborn treats the first positional argument as `data`.
sns.countplot(x=train['sequence1'])
# Tabulated level counts of sequence4 (shown as the cell output).
train['sequence4'].value_counts()

In [None]:
# Explore outliers in numerical variables: one box plot per numeric column on a 3x8 grid.

plt.figure(figsize=(15,10))
for position, column in enumerate(num_features, start=1):
    plt.subplot(3, 8, position)
    # DataFrame.boxplot draws on the currently selected subplot and returns its axes.
    axes = train.boxplot(column=column)
    axes.set_title('')
    axes.set_ylabel(column)

In [None]:
# Cap outliers with the 1.5*IQR rule: values beyond the whiskers are pulled back
# to the nearest whisker.
# The target column is excluded — it is a label, not a measurement, and for a
# rare positive class (q1 == q3 == 0) capping would rewrite every 1 to 0.
features_with_outliers = [feature for feature in num_features if feature != 'mutation']
for feature in features_with_outliers:
    q1 = train[feature].quantile(0.25)
    q3 = train[feature].quantile(0.75)
    IQR = q3 - q1
    lower_limit = q1 - (IQR * 1.5)
    upper_limit = q3 + (IQR * 1.5)
    # clip() returns a new column, so integer columns with fractional limits
    # are handled without an incompatible in-place assignment.
    train[feature] = train[feature].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Heat Map of pairwise correlations between the numeric columns.

plt.figure(figsize=(16,12))
plt.title('Correlation Heatmap')
# Restrict to numeric columns explicitly: recent pandas no longer drops the
# string columns silently and raises when asked to correlate them.
numeric_corr = train.select_dtypes(include='number').corr()
ax = sns.heatmap(numeric_corr, square=True, annot=True, fmt='.2f', linecolor='white')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.set_yticklabels(ax.get_yticklabels(), rotation=30)
plt.show()

In [None]:
# Deleting E Column as it is not an actual feature.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.

train = train.drop(columns=['E'], errors='ignore')
test = test.drop(columns=['E'], errors='ignore')


In [None]:
# Preview the training frame again now that column E has been dropped.
train.head()

## Data Distribution

In [None]:
plt.rcParams['figure.dpi'] = 100
# Count of rows per target class.
# catplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made first — that only produced an extra empty figure.
fig = sns.catplot(x="mutation", data=train, kind="count")

In [None]:
# Object-dtype columns are the categorical features; every column except the
# target is a model feature. Column order follows train.columns in both lists.
object_columns = set(train.select_dtypes(include=object).columns)
features_cat = [col for col in train.columns if col in object_columns]
features = [col for col in train.columns if col != 'mutation']
print(f'total features: {len(features)}')
print(features)

In [None]:
# Features and Target Variable
X = train.loc[:, features]
y = train.loc[:, 'mutation']
for part in (X, y):
    print(part.shape)

In [None]:
def encoder(x_train, x_test):
    """Label-encode one categorical column consistently for train and test.

    Parameters
    ----------
    x_train, x_test : array-like of labels
        The same column taken from the training and the test data.

    Returns
    -------
    tuple
        ``(encoded_train, encoded_test)`` integer code arrays produced by one
        shared ``LabelEncoder``.

    Notes
    -----
    The encoder is fitted on the labels of both inputs, so a label that occurs
    only in ``x_test`` gets a code instead of raising ``ValueError``. When
    every test label also occurs in ``x_train`` the codes are the same as when
    fitting on ``x_train`` alone.
    """
    le = LabelEncoder()
    all_labels = np.concatenate([np.asarray(x_train), np.asarray(x_test)])
    le.fit(all_labels)
    return le.transform(x_train), le.transform(x_test)

In [None]:
# encoding categorical features
# Replace each column wholesale (frame[col] = ...) instead of frame.loc[:, col] = ...:
# on recent pandas the .loc form writes into the existing object-dtype column,
# so the integer codes would stay stored as `object` and the model would
# reject the frame later on.
for col in features_cat:
    X[col], test[col] = encoder(X[col], test[col])

In [None]:
# OverSampling: balance the classes by synthesising minority-class rows.
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic rows (and every metric computed afterwards) are
# reproducible across runs.
smote = SMOTE(random_state=0)
# NOTE(review): resampling happens before the train/validation split below, so
# synthetic rows interpolated from validation rows can end up in the training
# part and the validation scores will look optimistic. Splitting first and
# resampling only the training part would avoid that.
X, y = smote.fit_resample(X, y)

print(X.shape,y.shape)

## Splitting Data into Training and Validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Row counts of the two parts produced by the split.
n_train_rows = len(x_train)
n_holdout_rows = len(x_test)
print("Length of Training Data: {}".format(n_train_rows))
print("Length of Testing Data: {}".format(n_holdout_rows))

## Model Fitting

In [None]:
import xgboost as xgb


# Gradient-boosted trees: many shallow trees with a small learning rate.
model = xgb.XGBClassifier(
    learning_rate=0.01,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=6,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.005,
    objective='binary:logistic',
    n_jobs=4,             # scikit-learn style name for the former `nthread`
    scale_pos_weight=1,
    random_state=27,      # scikit-learn style name for the former `seed`
    # The metric is configured on the estimator: recent XGBoost releases no
    # longer accept `eval_metric` as an argument of fit().
    eval_metric='auc',
)

model.fit(x_train, y_train)

## Model Testing

In [None]:
# Predicted class labels for the held-out split (x_test comes from train_test_split above).
y_pred = model.predict(x_test)
y_pred

## Model Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label matches the true label.
holdout_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: {}".format(holdout_accuracy))

## Confusion Matrix

In [None]:
# plot_confusion_matrix has been removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y) arguments.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(model, x_test, y_test)

## ROC Curve

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is the predicted probability of the second class.
y_pred_proba = model.predict_proba(x_test)
positive_scores = y_pred_proba[:, 1]

fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
plt.figure(figsize=(6,4))
plt.plot(fpr,tpr,'-g',linewidth=1)
plt.plot([0,1], [0,1], 'k--' )   # chance diagonal for reference
plt.title('ROC Curve')
plt.xlabel("False Positive Rate")
plt.ylabel('True Positive Rate')
plt.show()
# AUC must be computed from the scores that produced the curve; feeding hard
# 0/1 predictions collapses the curve to a single operating point and
# reports a different (typically lower) number.
print(roc_auc_score(y_test, positive_scores))

## Cross Validation Score

In [None]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(model, x_train, y_train, cv = 5, scoring='accuracy')
# print('Cross-validation scores:{}'.format(scores))
# print('Average cross-validation score: {}'.format(scores.mean()))

## Predictions on Testing Set

In [None]:
# Score the competition test rows and keep only column 1 of the probability matrix.
pred = model.predict_proba(test[features])[:, 1]

## Submission

In [None]:
# Put the predicted probabilities into the submission template's target column
# and write it to the working directory without the index column.
submission['mutation'] = pred
submission.to_csv('submission4.csv', index=False)