# Config

In [None]:
# %load config.py


# ------------------------ PATH ------------------------
# Project root as a relative path (one directory above the working directory).
ROOT_DIR = ".."

# Directory that holds every CSV file referenced below.
DATA_DIR = f"{ROOT_DIR}/Data"

# ------------------------ DATA ------------------------
# provided data
ORI_DATA = f"{DATA_DIR}/winemag-data-130k-v2.csv"
# Train / test CSV paths (presumably splits derived from ORI_DATA -- TODO confirm).
TRAIN_DATA = f"{DATA_DIR}/train.csv"
TEST_DATA = f"{DATA_DIR}/test.csv"

# ------------------------ PARAM ------------------------

# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
# Passed as `ngram_range` to the tf-idf vectorizer in preprocessing.ngram_vectorize.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
# Acts as the upper bound for `k` in SelectKBest (capped by the vocabulary size).
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
# Passed as `analyzer` to both the count and the tf-idf vectorizer.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
# Passed as `min_df` to both vectorizers.
MIN_DOCUMENT_FREQUENCY = 2


# ------------------------ OTHER ------------------------
# Shared seed: used as the MDS random_state and as the xgboost seed.
RANDOM_SEED = 2019


    


# Preprocessing

In [39]:
%%writefile preprocessing.py

import numpy as np
import pandas as pd
from config import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import confusion_matrix
from sklearn.manifold import MDS


# ---------------------------- Basic ----------------------------


def display_all(df):
    """Show ``df`` with the pandas row and column display limits raised to 1000.

    The raised limits are scoped to this call; the previous pandas options
    are restored when the context manager exits.

    # Arguments
        df: object to render (a DataFrame in the notebook).
    """
    # A single option_context accepts several (option, value) pairs.
    # NOTE: `display` is not imported here; presumably the IPython builtin
    # that is available inside a notebook session -- confirm before using
    # this module from a plain Python process.
    with pd.option_context(
        "display.max_rows", 1000,
        "display.max_columns", 1000,
    ):
        display(df)
            

# ------------------------ Preprocessing ------------------------

# Remove every row whose `name_columns` value is listed in `drop_condition`.
def drop(name_columns, drop_condition, data_old, data_new=None):
    """Drop rows of ``data_old`` whose ``name_columns`` value is in ``drop_condition``.

    # Arguments
        name_columns: label of the column to test.
        drop_condition: iterable of values; rows equal to any of them are removed.
        data_old: DataFrame to filter. It is not modified.
        data_new: ignored. Kept (now optional) only so existing four-argument
            calls keep working; the result is always derived from ``data_old``.

    # Returns
        The filtered DataFrame with the original index labels preserved.
        When ``drop_condition`` is empty, ``data_old`` itself is returned.
    """
    kept = data_old
    # Filter one value at a time with `!=` so missing values in the column are
    # kept (NaN != value is True), exactly as before.
    for unwanted in drop_condition:
        kept = kept[kept[name_columns] != unwanted]
    return kept


# Stratified train/test split: `proportion` of every variety stays in train.
def spilt_vals(data, proportion, **random_state):
    """Split ``data`` into train and test parts, stratified by ``variety``.

    For each distinct variety, a fraction ``1 - proportion`` of its rows is
    sampled into the test set; all remaining rows form the train set.

    # Arguments
        data: DataFrame with a ``variety`` column.
        proportion: fraction of each variety to keep in the train set.
        **random_state: extra keyword arguments forwarded unchanged to
            ``DataFrame.sample`` (e.g. ``random_state=2019``).

    # Returns
        train, test: DataFrames, each with a fresh 0..n-1 index.
    """
    test_fraction = 1 - proportion

    # factorize()[1] holds the distinct non-null varieties in order of first
    # appearance, so rows with a missing variety always stay in train.
    per_variety = [
        data[data['variety'] == variety].sample(frac=test_fraction, **random_state)
        for variety in data.variety.factorize()[1]
    ]

    # DataFrame.append was removed in pandas 2.0. A single concat also avoids
    # re-copying the growing frame on every iteration. With no varieties at
    # all, fall back to an empty slice that keeps the input's columns.
    test = pd.concat(per_variety) if per_variety else data.iloc[0:0]

    # Rows are removed by index label, so the sampled labels must still be the
    # original ones here; only afterwards are both indexes reset.
    train = data.drop(test.index).reset_index(drop=True)
    test = test.reset_index(drop=True)
    return train, test


# -------------------- Feature Engineering ----------------------


def mds_count(train_texts, test_texts, n):
    """Embed train and test texts jointly into ``n`` dimensions with MDS.

    Both text collections are concatenated and turned into token-count
    vectors. The pairwise correlation ``r`` between documents is converted to
    the dissimilarity ``1 - r**2``, and MDS is fitted on that precomputed
    matrix. Because the embedding is fitted on train and test together, the
    test coordinates are not independent of the training data.

    # Arguments
        train_texts: pandas object of training text strings (it is passed to
            ``pd.concat``).
        test_texts: pandas object of test text strings.
        n: number of MDS output dimensions.

    # Returns
        train, test: arrays of MDS coordinates, one row per document, split
        back at ``len(train_texts)``.
    """
    all_texts = pd.concat([train_texts, test_texts])
    vectorizer = CountVectorizer(
        strip_accents='unicode',
        decode_error='replace',
        analyzer=TOKEN_MODE,  # Split text into word tokens.
        min_df=MIN_DOCUMENT_FREQUENCY,
    )
    # Dense documents x vocabulary count table.
    counts = pd.DataFrame(vectorizer.fit_transform(all_texts).toarray())

    # Transposing puts documents in columns, so corr() is document-by-document.
    # NOTE(review): a document whose count vector is constant (e.g. all zeros)
    # has undefined correlation (NaN) -- confirm such rows cannot occur.
    corr = counts.T.corr()

    # Vectorised replacement for the per-element applymap(lambda x: 1 - x**2),
    # which is deprecated in recent pandas and slow on an n_docs x n_docs matrix.
    distance = 1 - corr ** 2

    embedding = MDS(n_jobs=-1, n_components=n, dissimilarity='precomputed', random_state=RANDOM_SEED)
    coords = embedding.fit_transform(distance)

    n_train = len(train_texts)
    return coords[:n_train, :], coords[n_train:, :]



def ngram_vectorize(train_texts, train_labels, test_texts):
    """Turn texts into tf-idf n-gram vectors and keep the best-scoring features.

    The vocabulary and the feature selection are both learned from the
    training data only and then applied unchanged to the test texts.

    # Arguments
        train_texts: training text strings.
        train_labels: training labels, used to score features with f_classif.
        test_texts: test text strings.

    # Returns
        x_train, x_test: vectorized training and test texts, cast to float32,
        restricted to at most TOP_K selected features.
    """
    tfidf = TfidfVectorizer(
        ngram_range=NGRAM_RANGE,  # Use 1-grams + 2-grams.
        strip_accents='unicode',
        decode_error='replace',
        analyzer=TOKEN_MODE,  # Split text into word tokens.
        min_df=MIN_DOCUMENT_FREQUENCY,
    )

    # Fit the vocabulary on the training texts, then reuse it for the test texts.
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)

    # Never ask for more features than the vocabulary actually contains.
    n_keep = min(TOP_K, train_matrix.shape[1])
    kbest = SelectKBest(f_classif, k=n_keep)
    kbest.fit(train_matrix, train_labels)

    x_train = kbest.transform(train_matrix).astype('float32')
    x_test = kbest.transform(test_matrix).astype('float32')
    return x_train, x_test


# Confusion matrix over wine varieties
def con_matrix(df, y_test, pred_test):
    """Build a labelled confusion matrix with one row/column per variety.

    Varieties are ordered by descending frequency in ``df``. For each variety
    the value of ``labels_variety`` in its first row is used as the class
    label passed to ``confusion_matrix``.

    # Arguments
        df: DataFrame with ``variety`` and ``labels_variety`` columns.
        y_test: true labels.
        pred_test: predicted labels.

    # Returns
        DataFrame of counts; the row index is named "Real" and both axes
        carry the variety names.
    """
    varieties = df.variety.value_counts().index
    # Look up the label stored on the first row of each variety.
    codes = [
        df[df.variety == name].reset_index().labels_variety[0]
        for name in varieties
    ]
    counts = confusion_matrix(y_test, pred_test, labels=codes)
    return pd.DataFrame(
        counts,
        index=pd.Index(varieties, name="Real"),
        columns=varieties,
    )

def con_matrix_two(df, y_test, pred_test):
    """Build a 2x2 confusion matrix labelled 'Red' / 'White'.

    # Arguments
        df: unused; present so the signature parallels ``con_matrix``.
        y_test: true labels.
        pred_test: predicted labels.

    # Returns
        DataFrame of counts; the row index is named "Real".
    """
    class_names = ['Red', 'White']
    # No explicit `labels=` is given, so the row/column order is whatever
    # confusion_matrix derives from the data -- assumes that order corresponds
    # to Red then White; TODO confirm against the label encoding.
    counts = confusion_matrix(y_test, pred_test)
    return pd.DataFrame(
        counts,
        index=pd.Index(class_names, name="Real"),
        columns=class_names,
    )




#-------------------- Ensemble Selection -----------------------



Overwriting preprocessing.py


# model

In [13]:
# %%writefile model.py

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
from model_param_space import *
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval

def error(pred, truch):
    """Return the misclassification rate, i.e. one minus the accuracy.

    # Arguments
        pred: predicted labels.
        truch: true labels.
    """
    accuracy = accuracy_score(truch, pred)
    return 1 - accuracy

def objective(param_dict, X=None, y=None):
    """Hyperopt objective: mean cross-validated error of an XGBoost classifier.

    Only ``max_depth`` is read from ``param_dict``; it is cast to int because
    the search space samples it as a float.

    The previous version called ``cross_val_score`` without any data, which
    always raised. The data is now passed explicitly; bind it before handing
    the function to ``fmin``, e.g.
    ``fmin(functools.partial(objective, X=X_train, y=y_train), space, ...)``.

    # Arguments
        param_dict: sampled hyper-parameters; must contain ``max_depth``.
        X: training features.
        y: training labels (needed for the stratified folds).

    # Returns
        Mean classification error (1 - accuracy) over the folds; lower is better.

    # Raises
        ValueError: if ``X`` or ``y`` is not supplied.
    """
    if X is None or y is None:
        raise ValueError(
            "objective() needs the training data: bind it first, e.g. "
            "functools.partial(objective, X=X_train, y=y_train)"
        )
    params = {'max_depth': int(param_dict['max_depth'])}
    clf = xgb.XGBClassifier(n_jobs=-1, **params)
    # A scikit-learn scoring callable is invoked as (estimator, X, y), which
    # error(pred, truth) does not match. Use the built-in accuracy scorer and
    # turn it into an error so that hyperopt minimises it.
    accuracy = cross_val_score(
        clf, X, y, scoring='accuracy', cv=StratifiedKFold(), n_jobs=-1
    ).mean()
    return 1 - accuracy

# Search space for the objective above: max_depth from 2 to 8 in steps of 1.
# The objective casts the sampled value to int before building the classifier.
space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
}
    
   


# model_param_space

In [7]:
%%writefile model_param_space.py
import numpy as np
from hyperopt import hp

from config import *

## xgboost
# Seed shared with the rest of the project (RANDOM_SEED from config).
xgb_random_seed = RANDOM_SEED
# Bounds and step of the n_estimators search range used below.
xgb_n_estimators_min = 100
xgb_n_estimators_max = 1000
xgb_n_estimators_step = 10

## classification with tree booster
# Hyperopt search space: plain values are fixed settings, hp.* entries are sampled.
# NOTE(review): the quniform entries (n_estimators, max_depth) are presumably
# sampled as floats -- confirm the consumer casts them to int before use.
param_space_clf_xgb_tree = {
    "booster": "gbtree",
    "objective": "multi:softprob",
    "n_estimators" : hp.quniform("n_estimators", xgb_n_estimators_min, xgb_n_estimators_max, xgb_n_estimators_step),
    # Log-uniform between 0.002 and 0.1, quantised to multiples of 0.002.
    "learning_rate" : hp.qloguniform("learning_rate", np.log(0.002), np.log(0.1), 0.002),
    # Regularisation terms: log-uniform between 1e-10 and 10.
    "gamma": hp.loguniform("gamma", np.log(1e-10), np.log(1e1)),
    "reg_alpha" : hp.loguniform("reg_alpha", np.log(1e-10), np.log(1e1)),
    "reg_lambda" : hp.loguniform("reg_lambda", np.log(1e-10), np.log(1e1)),
    # Log-uniform between 1e-10 and 100.
    "min_child_weight": hp.loguniform("min_child_weight", np.log(1e-10), np.log(1e2)),
    "max_depth": hp.quniform("max_depth", 1, 10, 1),
    "subsample": hp.quniform("subsample", 0.1, 1, 0.05),
    # Not searched: fixed at 1; only the per-level column fraction is sampled.
    "colsample_bytree": 1,
    "colsample_bylevel": hp.quniform("colsample_bylevel", 0.1, 1, 0.05),
    "n_jobs": -1,
    "seed": xgb_random_seed,
}


Overwriting model_param_space.py
