## Base Model TF-IDF Vectorizer

In [None]:
import pandas as pd
import pathlib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

#### Import data

In [None]:
# Resolve the data directory once, relative to the current working directory,
# and join path components with pathlib instead of string concatenation.
data_dir = pathlib.Path().resolve() / "data"

# Load the three CSV splits from the data directory.
train = pd.read_csv(data_dir / "train.csv", encoding="utf-8")
test = pd.read_csv(data_dir / "test.csv", encoding="utf-8")
valid = pd.read_csv(data_dir / "valid.csv", encoding="utf-8")

# Display the (rows, columns) shape of each split as the cell output.
train.shape, test.shape, valid.shape

#### Generate training data and labels Y

In [None]:
# Stack the train and validation splits row-wise into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# row labels of `valid` repeat those of `train`, which makes any later
# label-based alignment (joins, .loc lookups) ambiguous.
df = pd.concat([train, valid], axis=0, ignore_index=True)

# Lower-case both text columns before they are vectorized.
df["premise"] = df["premise"].str.lower()
df["hypothesis"] = df["hypothesis"].str.lower()

# Target column, with the same index as `df`.
train_Y = df['label']
train_Y.shape

#### TF-IDF Vectorizer

In [None]:
# Fit a separate TF-IDF vectorizer per text column. Re-fitting a single
# vectorizer on the second column would discard the vocabulary learned from
# the first one, so the premise side could no longer be transformed later.
premise_vectorizer = TfidfVectorizer()
hypothesis_vectorizer = TfidfVectorizer()
# Backward-compatible alias: the name `vectorizer` still refers to the
# vectorizer fitted on the hypothesis column once this cell has run.
vectorizer = hypothesis_vectorizer

# Dense TF-IDF matrices, one row per row of `df`.
train_tf_idf_premise_features = premise_vectorizer.fit_transform(df['premise']).toarray()
train_tf_idf_hypothesis_features = hypothesis_vectorizer.fit_transform(df['hypothesis']).toarray()

# Prefix the positional column names so every feature has a unique string
# name. Merging two frames that both use integer column names 0..k makes
# pandas rename only the overlapping columns to "0_x"/"0_y", leaving a mix
# of str and int column names that recent scikit-learn versions reject.
train_tf_idf_premise = pd.DataFrame(train_tf_idf_premise_features).add_prefix("premise_")
train_tf_idf_hypothesis = pd.DataFrame(train_tf_idf_hypothesis_features).add_prefix("hypothesis_")

# Both frames carry the same default RangeIndex, so a column-wise concat
# lines them up row by row.
# NOTE: this rebinds `train`, replacing the raw training frame loaded earlier.
train = pd.concat([train_tf_idf_premise, train_tf_idf_hypothesis], axis=1)
train.head()

#### Train-Test split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train_Y,
    test_size=0.2,
    random_state=42,
)

### Random Forest Classifier (scikit-learn)

In [None]:
# Build and fit the random forest in one expression (fit() returns the
# estimator itself). All available cores are used (n_jobs=-1) and progress is
# logged (verbose=1).
clf_model = RandomForestClassifier(
    n_estimators=20000,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

# Predicted labels for the held-out rows.
RandomForestClassifier_prediction = clf_model.predict(X_test)

In [None]:
# Reuse the prediction computed right after fitting instead of running
# predict() over the whole forest a second time on the same X_test.
y_pred = RandomForestClassifier_prediction
print("Accuracy: ", accuracy_score(y_test, y_pred))

### XGBoost Classifier

In [None]:
# Gradient-boosted trees with a 3-class soft-probability objective; the
# evaluation metric is the multiclass error rate.
model = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    eval_metric='merror',
    tree_method="hist",
    n_estimators=200,
    max_depth=4,
    learning_rate=0.3,
    n_jobs=8,
    seed=42,
)
# The evaluation set passed here is the training data itself, so the metric
# reported during fitting is the training error.
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train)],
)

In [None]:
# Score the XGBoost model on the held-out rows.
preds = model.predict(X_test)
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=preds))