In [140]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

## The data has already been preprocessed

In [5]:
!pip install prefect

Collecting prefect
  Obtaining dependency information for prefect from https://files.pythonhosted.org/packages/38/2d/d627115fc33df492567319a238cd7e534b479aabbc82b8921aa4e6695580/prefect-2.16.7-py3-none-any.whl.metadata
  Using cached prefect-2.16.7-py3-none-any.whl.metadata (10 kB)
Collecting apprise<2.0.0,>=1.1.0 (from prefect)
  Obtaining dependency information for apprise<2.0.0,>=1.1.0 from https://files.pythonhosted.org/packages/5a/ee/1668723154b73c6dd286e5e57ee358d64d87fb5707c7f52ab282f4e754f8/apprise-1.7.4-py3-none-any.whl.metadata
  Using cached apprise-1.7.4-py3-none-any.whl.metadata (44 kB)
Collecting asyncpg>=0.23 (from prefect)
  Obtaining dependency information for asyncpg>=0.23 from https://files.pythonhosted.org/packages/f2/39/f7e755b5d5aa59d8385c08be58726aceffc1da9360041031554d664c783f/asyncpg-0.29.0-cp311-cp311-win_amd64.whl.metadata
  Using cached asyncpg-0.29.0-cp311-cp311-win_amd64.whl.metadata (4.5 kB)
Collecting dateparser<2.0.0,>=1.1.1 (from prefect)
  Obtaining d

In [141]:
from prefect import task,flow

In [146]:
@task(name="Data loading")
def load_data(file_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path)
    return frame


@task(name="Identifying input and output")
def split_inputs_output(data, inputs, output):
    """Separate the feature column(s) from the target column.

    Args:
        data: Object indexable by column label (the flow passes a DataFrame).
        inputs: Label, or list of labels, selecting the features.
        output: Label selecting the target.

    Returns:
        Tuple ``(data[inputs], data[output])``.
    """
    features, target = data[inputs], data[output]
    return features, target

@task(name="Splitting data into test and train")
def split_train_test(X, y, test_size=0.25, random_state=0):
    """Split features and target into train and test partitions.

    Args:
        X: Features to split.
        y: Target to split; must have the same number of rows as ``X``.
        test_size: Share of the data held out for testing (default 0.25).
        random_state: Seed forwarded to ``train_test_split`` (default 0).

    Returns:
        The result of ``train_test_split``, i.e.
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_options)

@task(name="Feature extraction of Text data")
def preprocess_data(X_train, X_test, y_train, y_test, text_column="Review"):
    """Encode the review text as bag-of-words count features.

    The vocabulary is fitted on the training text only and then reused to
    transform the test text, so the test set does not influence the features.

    Args:
        X_train: Training features; must contain ``text_column``.
        X_test: Test features; must contain ``text_column``.
        y_train: Training labels, returned unchanged.
        y_test: Test labels, returned unchanged.
        text_column: Name of the column holding the raw text. Defaults to
            ``"Review"``, which keeps existing four-argument calls working.

    Returns:
        Tuple ``(X_train_bow, X_test_bow, y_train, y_test)`` where the first
        two items are the ``CountVectorizer`` outputs for train and test.

    Note:
        Missing values in ``text_column`` are not handled here; callers are
        expected to drop them first, as ``workflow`` does.
    """
    vectorizer = CountVectorizer()
    X_train_bow = vectorizer.fit_transform(X_train[text_column])
    # Only transform (never fit) on the test text to reuse the train vocabulary.
    X_test_bow = vectorizer.transform(X_test[text_column])
    return X_train_bow, X_test_bow, y_train, y_test

@task(name="Model training")
def train_model(X_train_bow, y_train, hyperparameters):
    """Fit a decision tree classifier on the training features.

    Args:
        X_train_bow: Training feature matrix.
        y_train: Training labels.
        hyperparameters: Mapping of keyword arguments forwarded to
            ``DecisionTreeClassifier``.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    return DecisionTreeClassifier(**hyperparameters).fit(X_train_bow, y_train)

@task(name="Evaluation of Model")
def evaluate_model(model, X_train_bow, y_train, X_test_bow, y_test):
    """Compute accuracy of ``model`` on the train and test partitions.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train_bow: Training feature matrix.
        y_train: True training labels.
        X_test_bow: Test feature matrix.
        y_test: True test labels.

    Returns:
        Tuple ``(train_score, test_score)`` of accuracy scores.
    """
    predictions_train = model.predict(X_train_bow)
    predictions_test = model.predict(X_test_bow)
    return (
        metrics.accuracy_score(y_train, predictions_train),
        metrics.accuracy_score(y_test, predictions_test),
    )

In [147]:
def _drop_missing_reviews(X, y, text_column="Review"):
    """Return copies of ``X`` and ``y`` without the rows whose text is null.

    ``X`` and ``y`` must be row-aligned (row i of ``X`` belongs to row i of
    ``y``), which holds for the pairs returned by ``train_test_split``.
    Both results get a fresh 0..n-1 index; the inputs are not modified.
    """
    # Positional boolean mask, so the same rows are kept in X and y.
    keep = X[text_column].notna().to_numpy()
    return X[keep].reset_index(drop=True), y[keep].reset_index(drop=True)


@flow(name="Decision_Tree_2 Flow")
def workflow():
    """Train and evaluate a decision tree sentiment classifier on review text.

    Steps: load the CSV, select the review column and the sentiment label,
    split into train/test, drop rows with a missing review from each
    partition, build bag-of-words features, fit the tree, and report accuracy.

    Returns:
        Tuple ``(train_score, test_score)``; both values are also printed.
    """
    DATA_PATH = "output.csv"
    # A list (not a bare string) so the selected features form a DataFrame
    # and no re-wrapping with pd.DataFrame is needed after the split.
    INPUTS = ['Review']
    OUTPUT = 'sentiment'
    # random_state makes the fitted tree reproducible: scikit-learn permutes
    # the features at each split, so an unseeded tree can vary between runs.
    HYPERPARAMETERS = {'max_depth': 10, 'random_state': 0}

    # Load data
    sentiment = load_data(DATA_PATH)

    # Identify Inputs and Output
    X, y = split_inputs_output(sentiment, INPUTS, OUTPUT)

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Remove rows without review text (the vectorizer cannot process nulls),
    # keeping features and labels aligned in each partition.
    X_train, y_train = _drop_missing_reviews(X_train, y_train)
    X_test, y_test = _drop_missing_reviews(X_test, y_test)

    # Bag-of-words feature extraction
    X_train_bow, X_test_bow, y_train, y_test = preprocess_data(X_train, X_test, y_train, y_test)

    # Model training based on the decision tree algorithm
    model = train_model(X_train_bow, y_train, HYPERPARAMETERS)

    # Train and test accuracy
    train_score, test_score = evaluate_model(model, X_train_bow, y_train, X_test_bow, y_test)

    print("Train Score:", train_score)
    print("Test Score:", test_score)

    return train_score, test_score



 `@flow(name='my_unique_name', ...)`


In [148]:
# Entry point: run the flow only when this code executes as the top-level
# module. In the notebook kernel this condition holds, so the cell runs the
# flow and prints the train and test scores.
if __name__ == "__main__":
    workflow()

Train Score: 0.868317918168992
Test Score: 0.848142924306535
