In [1]:
!pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.10 pyvi-0.1.1 sklearn-crfsui

In [2]:
# Import libraries
import pandas as pd
# Load & preprocess data
import sys
# Tokenizer
from pyvi import ViTokenizer
# Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Model
from sklearn.ensemble import RandomForestClassifier
# Grid search
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# Evaluation
from sklearn.metrics import accuracy_score, classification_report

In [4]:
train = pd.read_csv('/kaggle/input/vfnd-datasets/clean_train_vfnd.csv')
test = pd.read_csv('/kaggle/input/vfnd-datasets/test.csv')

In [5]:
train['text'] = train['text'].astype(str).apply(ViTokenizer.tokenize)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train['text'])
y_train = train['label']

In [6]:
# Define the parameter grid
param_grid = {
    'n_estimators': [300, 400],
    'max_depth': [50, 100],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [5, 10],
    'max_features': ['sqrt'],
    'bootstrap': [True],
    'class_weight': ['balanced']
}

# Create the RandomForestClassifier
clf = RandomForestClassifier(bootstrap=True)

# Use StratifiedKFold for better handling of imbalanced classes
cv = StratifiedKFold(n_splits=5)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=cv, scoring='f1_weighted', n_jobs=-1, verbose=0)
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(best_params)

{'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 400}


In [7]:
# # Create a new model with the best parameters
# clf = RandomForestClassifier(n_estimators= 400, max_depth= 100, min_samples_split= 3, min_samples_leaf= 10, max_features= 'sqrt', bootstrap= True, class_weight= 'balanced')

# Create a new model with the best parameters
clf = RandomForestClassifier(**best_params)

# # Fit the model to the training data
clf.fit(X_train, y_train)

In [8]:
y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97        96
           1       0.99      0.95      0.97        81

    accuracy                           0.97       177
   macro avg       0.97      0.97      0.97       177
weighted avg       0.97      0.97      0.97       177



## Evaluate 

In [9]:
X_test = vectorizer.transform(test['text'])
y_test = test['label']
# Make predictions on the test set
y_test_pred = clf.predict(X_test)
# Compute accuracy
print('Accuracy on the test set:', accuracy_score(y_test, y_test_pred))
# Compute F1 score
print(classification_report(y_test, y_test_pred))

Accuracy on the test set: 0.7555555555555555
              precision    recall  f1-score   support

           0       0.90      0.67      0.77        27
           1       0.64      0.89      0.74        18

    accuracy                           0.76        45
   macro avg       0.77      0.78      0.76        45
weighted avg       0.80      0.76      0.76        45

