# Model Training

In [1]:
# imports
import pandas as pd
import numpy as np
import mlflow
from mlflow.models.signature import infer_signature
from scipy.sparse import vstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model

### Data Loading

In [2]:
def count_vec(vec_params, df_all, df_train, df_test):
    """Attach bag-of-words count vectors to the train and test frames.

    A ``CountVectorizer`` configured with ``vec_params`` is fitted on the
    ``'sentence'`` column of ``df_all``. Each sentence of ``df_train`` and
    ``df_test`` is then transformed individually, so every cell of the new
    ``'count_vector'`` column holds the result of one ``transform`` call on a
    single sentence.

    Parameters
    ----------
    vec_params : dict
        Keyword arguments forwarded to ``CountVectorizer``.
    df_all : pandas.DataFrame
        Frame whose ``'sentence'`` column is used to fit the vectorizer.
    df_train, df_test : pandas.DataFrame
        Frames with a ``'sentence'`` column. They are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames equal to the inputs plus a
        ``'count_vector'`` column.
    """
    vectorizer1 = CountVectorizer(**vec_params)  # initialise the vectorizer
    # Fit on the entire corpus so train and test vectors share one vocabulary.
    vectorizer1.fit(df_all['sentence'].tolist())

    def to_vector(sentence):
        # transform() expects an iterable of documents, hence the 1-element list
        return vectorizer1.transform([sentence])

    # Only the 'sentence' column is needed, so apply over that column instead
    # of building a row object for every record with DataFrame.apply(axis=1).
    train_vectors = df_train['sentence'].apply(to_vector)
    test_vectors = df_test['sentence'].apply(to_vector)

    # assign() returns new frames, leaving the caller's inputs untouched.
    return (
        df_train.assign(count_vector=train_vectors),
        df_test.assign(count_vector=test_vectors),
    )

In [3]:
def tfidf_vec(vec_params, df_all, df_train, df_test):
    """Attach TF-IDF vectors to the train and test frames.

    A ``TfidfVectorizer`` configured with ``vec_params`` is fitted on the
    ``'sentence'`` column of ``df_all``. Each sentence of ``df_train`` and
    ``df_test`` is then transformed individually, so every cell of the new
    ``'tf_idf_vector'`` column holds the result of one ``transform`` call on a
    single sentence.

    Parameters
    ----------
    vec_params : dict
        Keyword arguments forwarded to ``TfidfVectorizer``.
    df_all : pandas.DataFrame
        Frame whose ``'sentence'`` column is used to fit the vectorizer.
    df_train, df_test : pandas.DataFrame
        Frames with a ``'sentence'`` column. They are not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        New ``(train, test)`` frames equal to the inputs plus a
        ``'tf_idf_vector'`` column.
    """
    vectorizer2 = TfidfVectorizer(**vec_params)  # initialise the vectorizer
    # Fit on the entire corpus so train and test vectors share one vocabulary.
    vectorizer2.fit(df_all['sentence'].tolist())

    def to_vector(sentence):
        # transform() expects an iterable of documents, hence the 1-element list
        return vectorizer2.transform([sentence])

    # Only the 'sentence' column is needed, so apply over that column instead
    # of building a row object for every record with DataFrame.apply(axis=1).
    train_vectors = df_train['sentence'].apply(to_vector)
    test_vectors = df_test['sentence'].apply(to_vector)

    # assign() returns new frames, leaving the caller's inputs untouched.
    return (
        df_train.assign(tf_idf_vector=train_vectors),
        df_test.assign(tf_idf_vector=test_vectors),
    )

Load the CSV files, vectorize the sentences, and collect the features (stacked into sparse matrices) and the targets (NumPy arrays) for easy use with scikit-learn:

In [4]:
def data_vec(vec, vec_params, data_dir='./data'):
    """Load the dataset and return vectorized features and targets.

    Reads ``all.csv``, ``train.csv`` and ``test.csv`` from ``data_dir``,
    vectorizes the train/test sentences with the selected vectorizer, and
    stacks the per-row vectors into one feature matrix per split.

    Parameters
    ----------
    vec : str
        ``'count'`` to use ``count_vec`` or ``'tfidf'`` to use ``tfidf_vec``.
    vec_params : dict
        Keyword arguments passed through to the chosen vectorizer function.
    data_dir : str, optional
        Directory containing the three CSV files. Defaults to ``'./data'``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` where the ``X`` values are the
        results of ``scipy.sparse.vstack`` over the per-row vectors and the
        ``Y`` values are NumPy arrays taken from the ``'avg_score'`` column.

    Raises
    ------
    ValueError
        If ``vec`` is neither ``'count'`` nor ``'tfidf'``.
    """
    # Validate the choice before touching the disk, so a typo fails
    # immediately instead of after three CSV reads.
    if vec == 'count':
        vector_column = 'count_vector'
    elif vec == 'tfidf':
        vector_column = 'tf_idf_vector'
    else:
        raise ValueError('Vectorizer not chosen')

    df_all = pd.read_csv(f'{data_dir}/all.csv')
    df_train = pd.read_csv(f'{data_dir}/train.csv')
    df_test = pd.read_csv(f'{data_dir}/test.csv')

    Y_train = df_train['avg_score'].to_numpy()
    Y_test = df_test['avg_score'].to_numpy()

    add_vectors = count_vec if vec == 'count' else tfidf_vec
    df_train, df_test = add_vectors(vec_params, df_all, df_train, df_test)

    # Each cell holds one row vector; stack them into a single matrix per split.
    X_train = vstack(df_train[vector_column].to_numpy())
    X_test = vstack(df_test[vector_column].to_numpy())

    return X_train, X_test, Y_train, Y_test
        

### MLflow Setup

- Please run the CLI commands in the 010-MLflow-setup notebook to initiate the MLflow server.
- The MLflow UI can be accessed [here](http://127.0.0.1:5000).

In [5]:
# Point the MLflow client at the tracking server (localhost, port 5000);
# the server must already be running for later logging calls to reach it.
tracking_uri = "http://127.0.0.1:5000/"
mlflow.set_tracking_uri(tracking_uri)

## Models experimented with:
1. Linear Regression
2. _TODO: add the next model_

### 1. Linear Regression

In [6]:
# Select the MLflow experiment that subsequent runs are recorded under
# (the captured log below shows it being created when it does not exist yet).
# The trailing ';' suppresses the cell's output.
experiment_name = "linear_regression"
mlflow.set_experiment(experiment_name);

2022/11/30 17:34:47 INFO mlflow.tracking.fluent: Experiment with name 'linear_regression' does not exist. Creating a new experiment.


Let's set some vectorizer and model parameters:

In [7]:
# Which vectorizer data_vec should use: 'count' or 'tfidf'.
vectorizer = 'count'

# Keyword arguments handed to the selected vectorizer.
vec_params = dict(
    lowercase=True,        # convert all characters to lowercase before tokenizing
    stop_words='english',  # remove stop words using sklearn's built-in English list
    max_df=1.0,            # maximum document frequency: lower it to drop very frequent words
    min_df=0.0,            # minimum document frequency: raise it to drop very rare words
    analyzer='word',       # tokenize into words
    ngram_range=(1, 1),
)

# Keyword arguments for the regression model; none are set.
model_params = dict()

In [8]:
# Train and evaluate a linear regression model inside a single MLflow run,
# logging the parameters, the fitted model and the test metrics.
with mlflow.start_run() as run:
    # log parameters in MLflow:
    mlflow.log_param("vectorizer", vectorizer)
    mlflow.log_params(vec_params)
    mlflow.log_params(model_params)

    # data
    X_train, X_test, Y_train, Y_test = data_vec(vectorizer, vec_params)

    # train
    lin_reg_model = linear_model.LinearRegression(**model_params)
    lin_reg_model.fit(X_train, Y_train)

    # log model: infer the signature from one small sample so the example
    # inputs and the example predictions describe the same rows
    signature_sample = X_train[0:10]
    signature = infer_signature(signature_sample, lin_reg_model.predict(signature_sample))
    mlflow.sklearn.log_model(lin_reg_model, "linear_regression_model", signature=signature)

    # evaluate on the held-out test split
    Y_pred = lin_reg_model.predict(X_test)
    r2score = r2_score(Y_test, Y_pred)
    MSE = mean_squared_error(Y_test, Y_pred)
    print("R2 score : %.2f" % r2score)
    print("Mean squared error: %.2f" % MSE)

    # log metrics
    mlflow.log_metric("R2 score", r2score)
    mlflow.log_metric("Mean squared error", MSE)



R2 score : -7.52
Mean squared error: 14.95
