In [1]:
import pandas as pd 

data = pd.read_csv('preprocessed_essay.csv')


In [2]:
good_data = data[0:76]
bad_data = data[174:]


In [11]:
good_data = pd.DataFrame(good_data)
bad_data = pd.DataFrame(bad_data)
good_bad_data = pd.concat([good_data, bad_data]).reset_index().drop(columns = ['index'])

In [13]:
good_bad_data.to_csv('good_bad_data.csv',index=False)

In [23]:
good_bad_data = pd.read_csv('good_bad_data.csv',encoding='latin-1')
good_bad_data

Unnamed: 0,text,label
0,my family consists of four members my father m...,
1,in the summer of the fourth grade an ai resear...,
2,i was impressed by watching cleaning movies re...,
3,i want to contribute to innovation with statis...,
4,i have developed an attitude that not only con...,
...,...,...
214,student motivation after working part time at ...,
215,new ideas using sensitivity and carefulness i ...,
216,unstoppable crisis for development there were ...,
217,new myth through open mind and positive thinki...,


In [26]:
def passornotpass(index):
    if 0 <= index <=76:
        return '합격'
    else:
        return '불합격'

good_bad_data['labels'] = good_bad_data.index.map(passornotpass)

In [31]:
good_bad_data = good_bad_data.drop(columns = ['label'])

KeyError: "['label'] not found in axis"

In [35]:
import pandas as pd 

good_bad_data.to_csv('good_bad.csv',index=False)

In [39]:
data = pd.read_csv('good_bad.csv', encoding='latin-1')
data

Unnamed: 0,text,labels
0,my family consists of four members my father m...,í©ê²©
1,in the summer of the fourth grade an ai resear...,í©ê²©
2,i was impressed by watching cleaning movies re...,í©ê²©
3,i want to contribute to innovation with statis...,í©ê²©
4,i have developed an attitude that not only con...,í©ê²©
...,...,...
214,student motivation after working part time at ...,ë¶í©ê²©
215,new ideas using sensitivity and carefulness i ...,ë¶í©ê²©
216,unstoppable crisis for development there were ...,ë¶í©ê²©
217,new myth through open mind and positive thinki...,ë¶í©ê²©


In [41]:
texts = data['text'].astype(str).tolist()
labels = data['labels'].tolist()

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize

In [45]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(texts, encoded_labels, test_size=0.2, random_state= 123)


In [52]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [54]:
tagged_data = [TaggedDocument(words = word_tokenize(doc.lower()), tags=[str(i)]) for i , doc in enumerate(X_train)]

In [55]:
doc2vec_model = Doc2Vec(
    vector_size = 100,
    window = 5,
    min_count = 2,
    workers=4,
    epochs=40,
    dm=1
)

In [56]:
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples = doc2vec_model.corpus_count, epochs = doc2vec_model.epochs)

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [59]:
X_train_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_train]
X_test_vectors = [doc2vec_model.infer_vector(word_tokenize(doc.lower())) for doc in X_test]

In [63]:
classifier = RandomForestClassifier(n_estimators=100, random_state=123)
classifier.fit(X_train_vectors, y_train)

In [64]:
y_pred = classifier.predict(X_test_vectors)

In [68]:
print(classification_report(y_test,y_pred, target_names=label_encoder.classes_))

              precision    recall  f1-score   support

   ë¶í©ê²©       0.76      1.00      0.86        31
      í©ê²©       1.00      0.23      0.38        13

    accuracy                           0.77        44
   macro avg       0.88      0.62      0.62        44
weighted avg       0.83      0.77      0.72        44



In [69]:
from sklearn.model_selection import GridSearchCV

In [70]:
rf = RandomForestClassifier(random_state=123)

In [71]:
param_grid = {
    'n_estimators': [100,200,300],
    'max_depth': [10,20,30],
    'min_samples_split': [2,5,10],
    'min_samples_leaf': [1,2,4],
    'max_features':['sqrt','log2']
}

In [73]:
grid_search = GridSearchCV(estimator=rf,
                           param_grid=param_grid,
                           cv=3,
                           verbose=2)

In [74]:
grid_search.fit(X_train_vectors, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.2s
[CV] END max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=3

In [76]:
print('Best parameters found :', grid_search.best_params_)

Best parameters found : {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [77]:
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

In [78]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.97      0.86        31
           1       0.80      0.31      0.44        13

    accuracy                           0.77        44
   macro avg       0.78      0.64      0.65        44
weighted avg       0.78      0.77      0.74        44



In [83]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [84]:
rf = RandomForestClassifier(random_state=123)

In [85]:
param_dist = {
    'n_estimators': randint(100,500),
    'max_depth': randint(10,50),
    'min_samples_split': randint(2,11),
    'min_samples_leaf': randint(1,5),
    'max_features':['sqrt','log2']
}

In [86]:
random_search = RandomizedSearchCV(estimator=rf,
                                   param_distributions= param_dist,
                                   n_iter = 100,
                                   cv=3,
                                   random_state = 123,
                                   verbose=2)


In [87]:
random_search.fit(X_train_vectors, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=12, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=117; total time=   0.0s
[CV] END max_depth=12, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=117; total time=   0.0s
[CV] END max_depth=12, max_features=sqrt, min_samples_leaf=3, min_samples_split=8, n_estimators=117; total time=   0.0s
[CV] END max_depth=29, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=325; total time=   0.3s
[CV] END max_depth=29, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=325; total time=   0.3s
[CV] END max_depth=29, max_features=sqrt, min_samples_leaf=4, min_samples_split=8, n_estimators=325; total time=   0.2s
[CV] END max_depth=42, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=324; total time=   0.2s
[CV] END max_depth=42, max_features=log2, min_samples_leaf=3, min_samples_split=2, n_estimators=3

In [91]:
print('Best parameter found:', random_search.best_params_)

Best parameter found: {'max_depth': 26, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 146}


In [93]:
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test_vectors)

In [94]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.75      0.97      0.85        31
           1       0.75      0.23      0.35        13

    accuracy                           0.75        44
   macro avg       0.75      0.60      0.60        44
weighted avg       0.75      0.75      0.70        44

