In [14]:
import argparse
import os
import pickle
import sys
import time

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import SGDClassifier  # logistic regression
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB  # NB
from sklearn.neighbors import KNeighborsClassifier  # k-NN
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC, SVC  # linear SVM
from sklearn.tree import DecisionTreeClassifier  # DT

In [16]:
# Location of the training CSV, relative to the notebook's working directory.
train_file = "data/dji/train.csv"

In [43]:
def build_cls(ml_cls="NB"):
    """Build an unfitted two-step pipeline: feature scaling followed by a classifier.

    Args:
        ml_cls: Short code selecting the classifier. Recognised codes are
            "kNN", "LR", "DT", "SVM", "MLP", "AB", "GB" and "RF". Any other
            value (including the default "NB") selects ``MultinomialNB``.

    Returns:
        A ``Pipeline`` whose steps are named ``"scaler"`` (a ``StandardScaler``)
        and ``"classifier"`` (the estimator chosen by ``ml_cls``).
    """
    print("- Construct the baseline...")
    start = time.time()
    if ml_cls == "kNN":
        # The estimator must be bound to `classifier`; previously it was built
        # and discarded, so this branch crashed with UnboundLocalError below.
        classifier = KNeighborsClassifier(n_neighbors=5)
    elif ml_cls == "LR":
        # Logistic regression trained with SGD.
        # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
        # confirm against the installed version before upgrading.
        classifier = SGDClassifier(verbose=5, loss='log', max_iter=100)
    elif ml_cls == "DT":
        classifier = DecisionTreeClassifier(criterion="entropy", random_state=0)
    elif ml_cls == "SVM":
        classifier = LinearSVC(verbose=5, C=5)
    elif ml_cls == "MLP":
        classifier = MLPClassifier(random_state=1, max_iter=100)
    elif ml_cls == "AB":
        classifier = AdaBoostClassifier()
    elif ml_cls == "GB":
        classifier = GradientBoostingClassifier(verbose=5)
    elif ml_cls == "RF":
        classifier = RandomForestClassifier(n_estimators=100, verbose=5)
    else:
        # DEFAULT: NB (also reached for any unrecognised code).
        # NOTE(review): MultinomialNB expects non-negative features, while the
        # StandardScaler step below centres the data -- verify this combination
        # actually fits on the intended data.
        classifier = MultinomialNB()

    # Step names are part of the pipeline's interface (e.g. for grid-search
    # parameter keys such as "classifier__C"), so keep them stable.
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', classifier),
    ])

    # parameters = {'kernel': ['linear'], 'C': [1, 10]}

    end = time.time()
    print("\t+ Done: %.4f(s)" % (end - start))
    return model

In [44]:
def train():
    """Fit the SVM baseline pipeline on the training CSV and report timing.

    Reads ``train_file``, takes its "Date" and "Open" columns, stacks them into
    a two-column feature matrix, label-encodes "Open" as the target, and fits
    the ``build_cls("SVM")`` pipeline through a 2-fold ``GridSearchCV`` with an
    empty parameter grid, scored by weighted F1.

    NOTE(review): the "Open" column is used both as a feature and (after label
    encoding) as the target, and label encoding turns every distinct "Open"
    value into its own class. This looks unintended -- confirm the intended
    features/target before relying on the reported scores.

    Returns:
        The literal string ``'Yay'``; the fitted estimator is not returned.
    """
    frame = pd.read_csv(train_file)
    dates = np.array(frame["Date"])
    opens = np.array(frame["Open"])

    # One row per sample, columns = (Date, Open).
    features = np.vstack((dates, opens)).T

    # Integer class ids derived from the raw "Open" values.
    targets = LabelEncoder().fit_transform(opens)

    baseline = build_cls("SVM")

    print("- Train the baseline...")
    started = time.time()
    # An empty grid means a single candidate: plain cross-validated fitting.
    # model = GridSearchCV(pipeline, parameters, cv=PredefinedSplit(test_fold=dev_fold),
    #                      verbose=5,  scoring='f1_weighted')
    search = GridSearchCV(
        estimator=baseline,
        param_grid={},
        cv=2,
        verbose=5,
        scoring='f1_weighted',
    )
    search.fit(features, targets)
    elapsed = time.time() - started
    print("\t+ Done: %.4f(s)" % elapsed)

    # Refit-on-full-data estimator; currently looked up but not persisted.
    best_pipeline = search.best_estimator_
    # save(best_pipeline, args.model_name)
    return 'Yay'

In [45]:
# Run the training routine and echo the status string it returns.
print(train())

- Construct the baseline...
	+ Done: 0.0000(s)
- Train the baseline...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................... , score=0.002, total=   0.9s
[CV]  ................................................................
[LibLinear]

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


[CV] .................................... , score=0.002, total=   0.9s
[LibLinear]

[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s finished


	+ Done: 8.0924(s)
Yay


