# Description

- Train a model to make predictions

To test the notebook, run:
- `papermill notebooks/train.ipynb notebooks/output/code_exec.ipynb --log-output`

# Import libraries and define functions and paths

## libraries

In [1]:
import pandas as pd
import datetime
import os
import sqlalchemy as db
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_recall_fscore_support, classification_report

## functions

In [2]:
def get_connection():
    """Create a SQLAlchemy engine for the project's MySQL database.

    The host comes from the ``DB_HOST`` environment variable. When it is
    unset, it is set to ``"localhost"`` in the process environment so that
    later readers of ``DB_HOST`` see the same value. The resolved host is
    printed so it shows up in the notebook / papermill log.

    ``DB_USER``, ``DB_PASSWORD`` and ``DB_NAME`` may optionally override the
    built-in development defaults; when they are unset the connection URL is
    the same as with the hard-coded values. The port is fixed at 3306.

    Returns:
        The engine returned by ``db.create_engine`` for a
        ``mysql+pymysql`` connection URL.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import quote_plus

    # setdefault only fills in the fallback when the variable is missing,
    # without a try/except that could hide unrelated errors.
    db_host = os.environ.setdefault("DB_HOST", "localhost")
    print(db_host)

    # Database configuration: development defaults, overridable via the
    # environment so real credentials never need to live in the notebook.
    db_user = os.environ.get("DB_USER", "mendes")
    db_pwd = os.environ.get("DB_PASSWORD", "test")
    db_name = os.environ.get("DB_NAME", "test_db")
    db_port = 3306

    # Escape user and password so characters such as '@' or '/' in an
    # environment-provided credential cannot corrupt the URL.
    connection_str = (
        f'mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_pwd)}'
        f'@{db_host}:{db_port}/{db_name}'
    )

    # Build the engine for the assembled URL.
    return db.create_engine(connection_str)

## Paths

# Connect SQL DB and transfer data

In [3]:
# Build the database engine once; the cells below open connections from it.
engine = get_connection()

localhost


## Process dataset

In [4]:
# Read both preprocessed splits over a single connection. The generator is
# consumed by the tuple unpacking, i.e. while the connection is still open.
with engine.connect() as conn:
    train, test = (
        pd.read_sql(f'SELECT * FROM {split}_preprocessed;', conn)
        for split in ('train', 'test')
    )

In [5]:
# Separate the 'label' column (target) from the remaining columns (features)
# for each split.
X_train = train.drop(columns=['label'])
y_train = train['label']

X_test = test.drop(columns=['label'])
y_test = test['label']

In [6]:
# Resolve the directory the pickled model is written to. The notebook can be
# started from the project root or from inside notebooks/, so the folder is
# appended only when the working directory does not already mention it.
path = os.path.abspath(os.getcwd())

if 'notebooks' not in path:
    # os.path.join inserts exactly one separator (no "<cwd>//notebooks/").
    path = os.path.join(path, 'notebooks')

# Joining with '' guarantees a single trailing separator, so `path` can still
# be used as a string prefix.
path = os.path.join(path, '')

print('THIS IS THE:'+path)

# A fixed seed makes the fitted forest, and the metrics reported for it,
# reproducible from one run to the next.
SEED = 42
clf = RandomForestClassifier(random_state=SEED)
clf.fit(X_train,y_train)

# Accuracy on the test split. It is printed explicitly because a bare
# expression in the middle of a cell is never displayed.
test_accuracy = clf.score(X_test,y_test)
print(f'Test accuracy: {test_accuracy:.4f}')

# Persist the fitted classifier next to the notebook.
with open(os.path.join(path, 'model.p'),'wb') as handle:
    pickle.dump(clf,handle,protocol=pickle.HIGHEST_PROTOCOL)

THIS IS THE:/Users/mendes/tech_interview/Kensho/applications/train/notebooks/


In [7]:
# Support-weighted F1 on the test split; as the last expression of the cell
# it is shown as the cell output.
test_predictions = clf.predict(X_test)
f1_score(y_test, test_predictions, average='weighted')

1.0

In [9]:
# Predict once and reuse the result for the printed report and the stored table.
predicted_labels = clf.predict(X_test)
print(classification_report(y_test, predicted_labels))

# One row per array returned by precision_recall_fscore_support, one column
# per class.
stats_frame = pd.DataFrame(precision_recall_fscore_support(y_test, predicted_labels))

# Overwrite the model_stats table with the latest evaluation.
with engine.connect() as conn:
    stats_frame.to_sql('model_stats', conn, if_exists='replace')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       450
           1       1.00      1.00      1.00       450
           2       1.00      1.00      1.00       450

    accuracy                           1.00      1350
   macro avg       1.00      1.00      1.00      1350
weighted avg       1.00      1.00      1.00      1350

