In [None]:
# Library classes
from NLPFlow.preprocessing import Preprocessor, Vectorizer
from NLPFlow.model import ModelTrainer

# Important tools
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Initializations
# One seed shared by sampling and splitting so the whole run is reproducible.
RANDOM_STATE = 42
# Upper bound on the number of rows used for this demo run.
SAMPLE_SIZE = 100

# spaCy model size used by the preprocessor: one of 'large', 'medium', 'small'.
preprocessor = Preprocessor(model="large")
# method='tfidf' builds TF-IDF features; use method='count' for plain n-gram counts.
v = Vectorizer(method='tfidf', ngram_range=(1, 2), max_features=None)
le = LabelEncoder()

# Read the data.
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (a backslash-separated path only resolves on Windows).
df = pd.read_csv("data/TestData.csv")

# Drop rows with a missing text or label so every remaining sample is usable.
# NOTE(review): the preprocessor presumably expects strings, not NaN - confirm.
df = df.dropna(subset=["statement", "status"])

# Work on a small random sample. The size is capped at the number of available
# rows because DataFrame.sample raises ValueError when n exceeds len(df)
# (sampling is without replacement by default).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

# Apply the 'preprocess' function, which applies all preprocessing steps.
# Alternatively, call individual steps on the preprocessor, e.g.
# preprocessor.remove_emails, preprocessor.remove_abbreviations, preprocessor.lemmatize.
df['Text'] = df['statement'].apply(preprocessor.preprocess)

# Fit the initialized vectorizer (here: TF-IDF) and transform the cleaned text.
# NOTE(review): the vectorizer is fitted on the full sample before the
# train/test split below, so test documents influence the learned vocabulary
# and IDF weights. If Vectorizer exposes a separate transform method (TODO
# confirm), split the text first, fit on the training part only, and transform
# the test part.
X = v.fit_transform(df['Text'])

# Label-encode the y values (optional for some ML models).
y = df['status']
y = le.fit_transform(y)

# Get train and test datasets (80% / 20%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Let NLPFlow choose the best model for you!
trainer = ModelTrainer(X_train, y_train, X_test, y_test)
best_model = trainer.tune_models()
print("\nBest Model is:\n")
print(best_model)

# Now you have the best model tuned and ready for use!