# Report - Part 1

This script trains eight machine learning models, tuning hyperparameters where applicable, and saves each fitted model to disk.

## Import Libraries

In [None]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd
# data preprocessing
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from spacy.lang.en.stop_words import STOP_WORDS
# feature extraction
from keras.utils import np_utils
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier,
                              VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score
from sklearn.metrics.classification import log_loss
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     learning_curve, train_test_split)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Suppress all warnings for the rest of the run.
warnings.filterwarnings("ignore")

## Import Data

In [None]:
# Variant table; later cells read its ID, Gene, Variation and Class columns.
data = pd.read_csv("../data/dataset/training_variants.zip")
print("Number of data points : ", data.shape[0])
print("Number of features : ", data.shape[1])
print("Features : ", data.columns.values)

# Text table: fields are separated by a literal "||" and the first line is
# skipped in favour of explicit column names. The separator is a regex, so it
# is written as a raw string ("\|" is not a valid string escape).
data_text = pd.read_csv(
    "../data/dataset/training_text.zip",
    sep=r"\|\|",
    engine="python",
    names=["ID", "TEXT"],
    skiprows=1,
)
print("Number of data points : ", data_text.shape[0])
print("Number of features : ", data_text.shape[1])
print("Features : ", data_text.columns.values)

## Data Preprocessing

In [None]:
# A token is a run of word characters, optionally containing one inner
# apostrophe (e.g. "don't"). Raw string: "\w" is not a valid string escape.
tokenizer = RegexpTokenizer(r"\w+'?\w+|\w+")
# Union of the NLTK and spaCy English stop-word lists.
stop_words = set(stopwords.words("english")).union(STOP_WORDS)
# Parser, tagger and NER are switched off; only `token.lemma_` is read below.
nlp = spacy.load("en", disable=["parser", "tagger", "ner"])
def make_token(x):
    """Split ``x`` into word tokens, dropping punctuation and whitespace.

    ``x`` is converted with ``str()`` first, so non-string input is accepted.
    """
    text = str(x)
    tokens = tokenizer.tokenize(text)
    return tokens
def remove_stopwords(x):
    """Return the tokens of ``x`` that are not in the module-level ``stop_words`` set.

    Args:
        x: Iterable of token strings.

    Returns:
        A new list with the stop words removed; token order is preserved.
    """
    return [token for token in x if token not in stop_words]
def lemmatization(x):
    """Return the lemmas of all tokens found in the words of ``x``.

    Each word is passed through ``nlp`` on its own; every token spaCy yields
    for it contributes one lemma, in order.
    """
    return [token.lemma_ for word in x for token in nlp(word)]
def pipeline(total_text, index, column):
    """Clean one document and store the result back into ``data_text``.

    The text is lower-cased, tokenized, stripped of stop words and
    lemmatized; the remaining tokens are joined with single spaces.

    Args:
        total_text: Raw text. Anything that is not a ``str`` is ignored.
        index: Row label of ``data_text`` to overwrite.
        column: Column name of ``data_text`` to overwrite.

    Returns:
        None. ``data_text`` is modified in place.
    """
    if not isinstance(total_text, str):
        return
    tokens = make_token(total_text.lower())
    tokens = remove_stopwords(tokens)
    tokens = lemmatization(tokens)
    # One .loc assignment writes straight into data_text; chained indexing
    # (data_text[column][index] = ...) may write to a temporary copy instead.
    data_text.loc[index, column] = " ".join(tokens)
# Clean the TEXT of every row in place; rows whose TEXT is not a string are
# only reported and left untouched.
for index, row in data_text.iterrows():
    raw_text = row["TEXT"]
    if type(raw_text) is not str:
        print("there is no text description for id:", index)
        continue
    pipeline(raw_text, index, "TEXT")

### merge genes, variations and text data by ID
# Left join: every variant row is kept even if it has no matching text row.
result = pd.merge(data, data_text, on="ID", how="left")
# Rows without text fall back to "<Gene> <Variation>".
missing_text = result["TEXT"].isnull()
result.loc[missing_text, "TEXT"] = result["Gene"] + " " + result["Variation"]
# Collapse whitespace runs to "_". regex=True is explicit because pandas >= 2.0
# treats the pattern as a literal string by default.
result.Gene = result.Gene.str.replace(r"\s+", "_", regex=True)
result.Variation = result.Variation.str.replace(r"\s+", "_", regex=True)

## write to pickle
pd.to_pickle(result, "result_non_split.pkl")

## Feature Extraction

In [None]:
maxFeats = 10000

# The TF-IDF and count vectorizers are configured identically: unigrams and
# bigrams, terms seen in at least 5 documents, capped at maxFeats features.
_vectorizer_options = {
    "min_df": 5,
    "max_features": maxFeats,
    "ngram_range": (1, 2),
    "analyzer": "word",
    "stop_words": "english",
    "token_pattern": r"\w+",
}

tfidf = TfidfVectorizer(**_vectorizer_options)
tfidf.fit(result["TEXT"])

cvec = CountVectorizer(**_vectorizer_options)
cvec.fit(result["TEXT"])

# try n_components between 360-390
svdT = TruncatedSVD(n_components=390, n_iter=5)
# The SVD is fitted on the TF-IDF matrix of the full text column.
_tfidf_matrix = tfidf.transform(result["TEXT"])
svdTFit = svdT.fit_transform(_tfidf_matrix)


def buildFeatures(df):
    """Build the model feature matrix from gene, variation and text columns.

    The module-level ``tfidf``, ``cvec`` and ``svdT`` objects are only applied
    here (``transform``), never refitted, so they must already be fitted.

    Args:
        df: pandas DataFrame with only the ``Gene``, ``Variation`` and
            ``TEXT`` columns. It is not modified.

    Returns:
        A ``(features, columns)`` tuple. ``features`` holds the one-hot
        encoded ``Gene``/``Variation`` columns followed by the ``svdT``
        projections of the TF-IDF matrix (``lsa_t1``, ...) and of the count
        matrix (``lsa_c1``, ...). ``columns`` is the list of those column
        names in the same order.
    """
    temp = df.copy()

    print("Encoding...")
    temp = pd.get_dummies(temp, columns=["Gene", "Variation"], drop_first=True)

    print("TFIDF...")
    temp_tfidf = tfidf.transform(temp["TEXT"])

    print("Count Vecs...")
    temp_cvec = cvec.transform(temp["TEXT"])

    print("Latent Semantic Analysis Cols...")
    del temp["TEXT"]

    # The same fitted projection is applied to both sparse matrices.
    temp_lsa_tfidf = svdT.transform(temp_tfidf)
    temp_lsa_cvec = svdT.transform(temp_cvec)

    # Read the width from the arrays themselves so no numpy import is needed.
    lsa_t_cols = ["lsa_t" + str(i + 1) for i in range(temp_lsa_tfidf.shape[1])]
    lsa_c_cols = ["lsa_c" + str(i + 1) for i in range(temp_lsa_cvec.shape[1])]
    tempc = list(temp.columns) + lsa_t_cols + lsa_c_cols

    # Name the LSA columns up front so the concatenated frame has no
    # duplicate integer labels; its columns equal ``tempc``.
    temp = pd.concat(
        [
            temp,
            pd.DataFrame(temp_lsa_tfidf, index=temp.index, columns=lsa_t_cols),
            pd.DataFrame(temp_lsa_cvec, index=temp.index, columns=lsa_c_cols),
        ],
        axis=1,
    )
    return temp, tempc


# Build the training feature matrix, label its columns and cache it on disk.
trainDf, traincol = buildFeatures(result.loc[:, ["Gene", "Variation", "TEXT"]])
trainDf.columns = traincol
trainDf.to_pickle("trainDf.pkl")





## Training Classifiers and Tuning Hyperparameters

In [None]:
# To resume from the cached artefacts instead of recomputing them, uncomment:
# result = pd.read_pickle("result_non_split.pkl")
# trainDf = pd.read_pickle("trainDf.pkl")

# class ids shifted down by one
labels = result["Class"] - 1

# split data into training data and testing data (20% test, stratified)
X_train, X_test, y_train, y_test = train_test_split(
    trainDf,
    labels,
    test_size=0.2,
    random_state=5,
    stratify=labels,
)

# for cross Validation: the splitter passed as `cv` to each grid search
kfold = StratifiedKFold(n_splits=5)

# encode labels: the encoder is fitted on the training labels only
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
# categorical (one-hot) form of the decoded test labels
encoded_test_y = np_utils.to_categorical(le.inverse_transform(y_test))

In [None]:
### 1. Support Vector Machines
# probability=True so the fitted model exposes predict_proba.
svr = SVC(probability=True)
# Hyperparameter tuning - Grid search cross validation
svr_CV = GridSearchCV(
    svr,
    param_grid={
        "C": [0.1, 1, 10, 100],
        "gamma": [1, 0.1, 0.01, 0.001],
        "kernel": ["poly", "rbf", "sigmoid", "linear"],
        "tol": [1e-4],
    },
    cv=kfold,
    verbose=False,
    n_jobs=-1,
)
svr_CV.fit(X_train, y_train)
svr_CV_best = svr_CV.best_estimator_
print("Best score: %0.3f" % svr_CV.best_score_)
print("Best parameters set:", svr_CV.best_params_)

# save the model; the context manager closes the file even if pickling fails
filename = "svr_CV_best.sav"
with open(filename, "wb") as model_file:
    pickle.dump(svr_CV_best, model_file)
## load the model
# with open(filename, "rb") as openfile:
#     svr_CV_best = pickle.load(openfile)

In [None]:
### 2. Logistic Regression
# solver="saga" supports both the multinomial loss and the "l1"/"l2"
# penalties searched below; the default solver cannot fit the "l1" candidates.
logreg = LogisticRegression(multi_class="multinomial", solver="saga")
# Hyperparameter tuning - Grid search cross validation
logreg_CV = GridSearchCV(
    estimator=logreg,
    # C spans 1e-3 .. 1e3 in powers of ten
    param_grid={"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]},
    cv=kfold,
    verbose=False,
)
logreg_CV.fit(X_train, y_train)
logreg_CV_best = logreg_CV.best_estimator_
print("Best score: %0.3f" % logreg_CV.best_score_)
print("Best parameters set:", logreg_CV.best_params_)

# save the model; the context manager closes the file even if pickling fails
filename = "logreg_CV_best.sav"
with open(filename, "wb") as model_file:
    pickle.dump(logreg_CV_best, model_file)

In [None]:
### 3. k-Nearest Neighbors
knn = KNeighborsClassifier()
# Hyperparameter tuning - Grid search cross validation
# n_neighbors is searched over 2..9 (range end is exclusive)
param_grid = {"n_neighbors": range(2, 10)}
knn_CV = GridSearchCV(
    estimator=knn, param_grid=param_grid, cv=kfold, verbose=False
).fit(X_train, y_train)
knn_CV_best = knn_CV.best_estimator_
print("Best score: %0.3f" % knn_CV.best_score_)
print("Best parameters set:", knn_CV.best_params_)

# save the model; the context manager closes the file even if pickling fails
filename = "knn_CV_best.sav"
with open(filename, "wb") as model_file:
    pickle.dump(knn_CV_best, model_file)

In [None]:
### 4. Random Forest
random_forest = RandomForestClassifier()
param_grid = {
    "bootstrap": [True, False],
    "max_depth": [5, 8, 10, 20, 40, 50, 60, 80, 100],
    # "auto" means the same as "sqrt" for classifiers and is rejected by
    # recent scikit-learn releases, so only "sqrt" is searched.
    "max_features": ["sqrt"],
    "min_samples_leaf": [1, 2, 4, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "n_estimators": [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
}
# Hyperparameter tuning - Grid search cross validation
random_forest_CV = GridSearchCV(
    estimator=random_forest, param_grid=param_grid, cv=kfold, verbose=False, n_jobs=-1
)
random_forest_CV.fit(X_train, y_train)
random_forest_CV_best = random_forest_CV.best_estimator_
print("Best score: %0.3f" % random_forest_CV.best_score_)
print("Best parameters set:", random_forest_CV.best_params_)

# save the model; the context manager closes the file even if pickling fails
filename = "random_forest_CV_best.sav"
with open(filename, "wb") as model_file:
    pickle.dump(random_forest_CV_best, model_file)

In [None]:
### 5. Adaboost
param_grid = {
    # tunes max_depth of the DecisionTreeClassifier wrapped by AdaBoost below
    "base_estimator__max_depth": [5, 10, 20, 50, 100, 150, 200],
    "n_estimators": [100, 500, 1000, 1500, 2000],
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1.0],
    "algorithm": ["SAMME", "SAMME.R"],
}
Ada_Boost = AdaBoostClassifier(DecisionTreeClassifier())
# Hyperparameter tuning - Grid search cross validation
Ada_Boost_CV = GridSearchCV(
    estimator=Ada_Boost, param_grid=param_grid, cv=kfold, verbose=False, n_jobs=10
)
Ada_Boost_CV.fit(X_train, y_train)
Ada_Boost_CV_best = Ada_Boost_CV.best_estimator_
print("Best score: %0.3f" % Ada_Boost_CV.best_score_)
print("Best parameters set:", Ada_Boost_CV.best_params_)

# save the model; the context manager closes the file even if pickling fails
filename = "Ada_Boost_CV_best.sav"
with open(filename, "wb") as model_file:
    pickle.dump(Ada_Boost_CV_best, model_file)

In [None]:
### 6. XGBoost
# NOTE(review): `xgb` is not imported anywhere in the visible file; this cell
# needs `import xgboost as xgb` to run - confirm and add it to the imports.
xgb_clf = xgb.XGBClassifier(objective="multi:softprob")
parameters = {
    "n_estimators": [200, 300, 400],
    "learning_rate": [0.001, 0.003, 0.005, 0.006, 0.01],
    "max_depth": [4, 5, 6],
}
# Hyperparameter tuning - Grid search cross validation
xgb_clf_cv = GridSearchCV(
    estimator=xgb_clf, param_grid=parameters, n_jobs=-1, cv=kfold
).fit(X_train, y_train)
xgb_clf_cv_best = xgb_clf_cv.best_estimator_
print("Best score: %0.3f" % xgb_clf_cv.best_score_)
print("Best parameters set:", xgb_clf_cv.best_params_)

# Save the model; the context manager closes the file even if pickling fails
filename = "xgb_clf_cv_best.sav"
with open(filename, "wb") as model_file:
    pickle.dump(xgb_clf_cv_best, model_file)

In [None]:
### 7. MLPClassifier
mlp = MLPClassifier()
param_grid = {
    # hidden layer widths 5, 10, 15, 20
    "hidden_layer_sizes": list(range(5, 25, 5)),
    "solver": ["sgd", "adam", "lbfgs"],
    "learning_rate": ["constant", "adaptive"],
    "max_iter": [500, 1000, 1200, 1400, 1600, 1800, 2000],
    # powers of ten from 1e3 down to 1e-5
    "alpha": [10.0 ** (-i) for i in range(-3, 6)],
    "activation": ["tanh", "relu"],
}
# Hyperparameter tuning - Grid search cross validation
mlp_GS = GridSearchCV(mlp, param_grid=param_grid, n_jobs=-1, cv=kfold, verbose=False)
mlp_GS.fit(X_train, y_train)
mlp_GS_best = mlp_GS.best_estimator_
print("Best score: %0.3f" % mlp_GS.best_score_)
print("Best parameters set:", mlp_GS.best_params_)

# save the model; the context manager closes the file even if pickling fails
filename = "mlp_GS_best.sav"
with open(filename, "wb") as model_file:
    pickle.dump(mlp_GS_best, model_file)

In [None]:
### 8. Voting Classifier
# Soft-voting ensemble over four of the tuned estimators from the cells above.
Voting_ens = VotingClassifier(
    estimators=[
        ("log", logreg_CV_best),
        ("rf", random_forest_CV_best),
        ("knn", knn_CV_best),
        ("svm", svr_CV_best),
    ],
    n_jobs=-1,
    voting="soft",
)
Voting_ens.fit(X_train, y_train)

# save the model; the context manager closes the file even if pickling fails
filename = "Voting_ens.sav"
with open(filename, "wb") as model_file:
    pickle.dump(Voting_ens, model_file)