In [1]:
import pandas as pd 
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
import warnings 

warnings.filterwarnings(action='ignore')


In [2]:
data = []

with open('total.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    for row in reader:
        data.append(row)

print(data)

[['text', 'label'], ['My family consists of four members: my father, mother, older sister and me, who are office workers. During my school years, I was interested in reading books and continued to read books in a wide range of fields, especially reading classics such as the biography of great men, classics, and Eastern and Western philosophy. As a result, I have acquired the ability to understand and empathize with various human images, excellent reason and logic, while also building up my knowledge of the humanities and trying to imitate great men. Because of these traits, I made friendship with friends who like reading books and who have desirable moral senses and humility, and I studied based on my sincere lifestyle by going to school early in the morning and leaving school after self-study at night, following the example of sincere and honest parents. After reading Lee Ji-sung\'s "Dreaming Attic Room," I realized that by having a big dream, the criminal can be reborn as a great man

In [3]:
data = pd.DataFrame(data)
data1 = data.rename(columns=data.iloc[0])
data2 = data1.drop(data1.index[0])
data = data2

In [4]:
texts = data['text'].astype(str).tolist()
labels = data['label'].tolist()

In [5]:
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=123)

In [6]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [7]:
tagged_data = [TaggedDocument(words=word_tokenize(doc.lower()), tags=[str(i)]) for i ,doc in enumerate(X_train)]

In [8]:
doc2vec_model = Doc2Vec(
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=40,
    dm=1
)

In [9]:
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples = doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_train]
X_test_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_test]

classifier = RandomForestClassifier(n_estimators=100, random_state=123)
classifier.fit(X_train_vectors, y_train)
                                    

In [11]:
y_pred = classifier.predict(X_test_vectors)

In [12]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.71      1.00      0.83         5

    accuracy                           0.75         8
   macro avg       0.86      0.67      0.67         8
weighted avg       0.82      0.75      0.71         8



In [13]:
from sklearn.model_selection import GridSearchCV

In [14]:
rf = RandomForestClassifier(random_state=123)

In [15]:
param_grid = {
    'n_estimators' : [100,200,300],
    'max_depth': [10,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf':[1,2,4],
    'max_features':['sqrt','log2']
}

In [16]:
grid_search = GridSearchCV(estimator=rf,
                           param_grid= param_grid,
                           cv=5,
                           verbose=2)

In [17]:
grid_search.fit(X_train_vectors, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=2

In [18]:
print('Best parameters found:', grid_search.best_params_)

Best parameters found: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [19]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.71      1.00      0.83         5

    accuracy                           0.75         8
   macro avg       0.86      0.67      0.67         8
weighted avg       0.82      0.75      0.71         8



In [21]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [22]:
rf = RandomForestClassifier(random_state=123)

In [23]:
param_dist = {
    'n_estimators':randint(100,500),
    'max_depth': randint(10,50),
    'min_samples_split': randint(2,11),
    'min_samples_leaf': randint(1,5),
    'max_features':['sqrt', 'log2']
}

In [24]:
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=100, cv=3,  random_state=123, verbose=2)


In [25]:
random_search.fit(X_train_vectors, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=12, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=117; total time=   0.0s
[CV] END max_depth=12, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=117; total time=   0.0s
[CV] END max_depth=12, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=117; total time=   0.0s
[CV] END max_depth=29, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=325; total time=   0.2s
[CV] END max_depth=29, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=325; total time=   0.2s
[CV] END max_depth=29, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=325; total time=   0.2s
[CV] END max_depth=42, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=324; total time=   0.2s
[CV] END max_depth=42, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=3

In [26]:
print('Best parameters found:', random_search.best_params_)

Best parameters found: {'max_depth': 35, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 6, 'n_estimators': 196}


In [27]:
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.83      1.00      0.91         5

    accuracy                           0.88         8
   macro avg       0.92      0.83      0.85         8
weighted avg       0.90      0.88      0.87         8

