In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import sklearn
import seaborn as sns
import spacy
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import os
import json
from sklearn.model_selection import train_test_split

tqdm.pandas()

import warnings

warnings.filterwarnings("ignore")


In [None]:
# Read the training and evaluation tables from the working directory.
train_csv_path = "unsupo_train.csv"
test_csv_path = "manually_test.csv"

df_with_unsup = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [None]:
# Number of rows in the training frame.
len(df_with_unsup)

In [None]:
# Display the test frame for a visual check.
df_test

In [None]:
# Pull the text column ('prepro') and the label column out of each frame as tuples.
# The training labels live in 'sentiment'; the test labels live in 'sentiment_final'.
x_train = tuple(df_with_unsup['prepro'])
y_train = tuple(df_with_unsup['sentiment'])

x_test = tuple(df_test['prepro'])
y_test = tuple(df_test['sentiment_final'])

# Bag of Words

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag-of-words counts, limited by max_features=5000.
bow_vectorizer = CountVectorizer(max_features=5000, analyzer='word')

# Fit the vocabulary on the training texts only, then reuse it for the test texts.
x_bow_train = bow_vectorizer.fit_transform(x_train)
x_bow_test = bow_vectorizer.transform(x_test)

# Labels as NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# LogisticRegression settings explored by the search.
hyper_parmas = {
    "max_iter": [1000, 2000, 3000],
    "C": [10, 100, 1000],
}

lr_bow_model = LogisticRegression(class_weight='balanced')

# 5-fold grid search on the BoW features; refit=True so best_estimator_ can be read below.
grid_lr_bow_model = GridSearchCV(
    lr_bow_model,
    param_grid=hyper_parmas,
    cv=5,
    refit=True,
    return_train_score=True,
)
grid_lr_bow_model.fit(x_bow_train, y_train)

print(grid_lr_bow_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = grid_lr_bow_model.best_estimator_
pred_y_bow_lr = estimator.predict(x_bow_test)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Binary-averaged precision / recall / F1 for the BoW logistic-regression predictions.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test,
    pred_y_bow_lr,
    average='binary',
)
print(f1_score)

# Per-class report, then overall accuracy.
print(
    classification_report(
        y_test,
        pred_y_bow_lr,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
print(accuracy_score(y_test, pred_y_bow_lr))

In [None]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier

# Candidate values of C for the linear SVM.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

svm_origin = LinearSVC()

# NOTE(review): these two values are assigned but not passed to anything in this cell.
n_estimators = 10
n_jobs = 2

# 5-fold grid search on the BoW features, run with n_jobs=-1.
svm_bow_model = GridSearchCV(
    svm_origin,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=3,
)
svm_bow_model.fit(x_bow_train, y_train)
print(svm_bow_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = svm_bow_model.best_estimator_
pred_y_bow_svm = estimator.predict(x_bow_test)

In [None]:


# Binary-averaged precision / recall / F1 for the BoW linear-SVM predictions.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, pred_y_bow_svm, average='binary')
print(f1_score)

# Per-class report.
print(classification_report(y_test, pred_y_bow_svm, target_names=['neg', 'pos'], digits=4))

# Overall accuracy (the closing parenthesis of print() was missing, which made the cell a SyntaxError).
print(accuracy_score(y_test, pred_y_bow_svm))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Smoothing values (alpha) explored for multinomial naive Bayes.
params = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 10.0]}

# 5-fold grid search on the BoW features, run with n_jobs=-1.
nb_bow_model = GridSearchCV(
    MultinomialNB(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
nb_bow_model.fit(x_bow_train, y_train)

print(nb_bow_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = nb_bow_model.best_estimator_
pred_y_bow_nb = estimator.predict(x_bow_test)

In [None]:
# Prediction report (per class).
print(
    classification_report(
        y_test,
        pred_y_bow_nb,
        target_names=['neg', 'pos'],
        digits=4,
    )
)

# Prediction accuracy.
print("모델의 예측 정확도 :", accuracy_score(y_test, pred_y_bow_nb))

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

# The estimator gets its own name so it does not overwrite the `xgb` module alias
# imported on the line above.
xgb_clf = XGBClassifier(random_state=82)

# Search space for the boosted trees. The last three entries are single-valued and
# only select GPU training.
# NOTE(review): gpu_id / 'gpu_hist' / predictor are deprecated from XGBoost 2.0 in
# favour of device='cuda' with tree_method='hist' — confirm the installed version.
xgb_param_grid = {
    'n_estimators' : [100, 200, 400, 600, 1000],
    'learning_rate' : [0.05, 0.1, 0.15, 0.2],
    'max_depth' : [4, 6, 8, 10],
    'gpu_id':[0],
    'tree_method':['gpu_hist'],
    'predictor':['gpu_predictor'],
}

# Grid search on the BoW features scored by accuracy; verbose=1 is forwarded to the
# estimator's fit().
xgb_grid_bow = GridSearchCV(xgb_clf, param_grid = xgb_param_grid, scoring = 'accuracy').fit(x_bow_train, y_train, verbose=1)

print(xgb_grid_bow.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = xgb_grid_bow.best_estimator_
pred_y_bow_xgb = estimator.predict(x_bow_test)

In [None]:
# Show the best XGBoost estimator selected by the BoW grid search.
xgb_grid_bow.best_estimator_

In [None]:

# Per-class report for the BoW XGBoost predictions.
print(classification_report(y_test, pred_y_bow_xgb, target_names=['neg', 'pos'], digits=4))

# Overall accuracy (the closing parenthesis of print() was missing, which made the cell a SyntaxError).
print(accuracy_score(y_test, pred_y_bow_xgb))

# TF-IDF

In [None]:
# TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer

# Word n-grams of length 1 to 3, limited by max_features=3000.
tfidf_vectorizer = TfidfVectorizer(
    min_df=0.0,
    analyzer='word',
    ngram_range=(1, 3),
    max_features=3000,
)

# Fit on the training texts only, then reuse the fitted vectorizer for the test texts.
x_tfidf_train = tfidf_vectorizer.fit_transform(x_train)
x_tfidf_test = tfidf_vectorizer.transform(x_test)

# Labels as NumPy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# LogisticRegression settings explored by the search.
hyper_parmas = {
    "max_iter": [1000, 2000, 3000],
    "C": [0.01, 0.1, 1, 10],
}

lr_tfidf_model = LogisticRegression(class_weight='balanced')

# 5-fold grid search on the TF-IDF features; refit=True so best_estimator_ can be read below.
grid_lr_tfidf_model = GridSearchCV(
    lr_tfidf_model,
    param_grid=hyper_parmas,
    cv=5,
    refit=True,
    return_train_score=True,
)
grid_lr_tfidf_model.fit(x_tfidf_train, y_train)

print(grid_lr_tfidf_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = grid_lr_tfidf_model.best_estimator_
pred_y_tfidf_lr = estimator.predict(x_tfidf_test)

In [None]:
# Show the best logistic-regression estimator selected by the TF-IDF grid search.
grid_lr_tfidf_model.best_estimator_

In [None]:
# Per-class report, then overall accuracy, for the TF-IDF logistic-regression predictions.
print(
    classification_report(
        y_test,
        pred_y_tfidf_lr,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
print(accuracy_score(y_test, pred_y_tfidf_lr))

In [None]:
# Train the linear SVM on the TF-IDF features.
# Candidate values of C:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

svm_origin = LinearSVC()

# NOTE(review): these two values are assigned but not passed to anything in this cell.
n_estimators = 10
n_jobs = 2

# 5-fold grid search, run with n_jobs=-1.
svm_tfidf_model = GridSearchCV(
    svm_origin,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=3,
)
svm_tfidf_model.fit(x_tfidf_train, y_train)
print(svm_tfidf_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = svm_tfidf_model.best_estimator_
pred_y_tfidf_svm = estimator.predict(x_tfidf_test)

In [None]:
# Show the best linear-SVM estimator selected by the TF-IDF grid search.
svm_tfidf_model.best_estimator_

In [None]:

# Per-class report, then overall accuracy, for the TF-IDF linear-SVM predictions.
print(
    classification_report(
        y_test,
        pred_y_tfidf_svm,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
print(accuracy_score(y_test, pred_y_tfidf_svm))

In [None]:
# Smoothing values (alpha) explored for multinomial naive Bayes.
params = {
    'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
}

# 5-fold grid search on the TF-IDF features, run with n_jobs=-1.
nb_tfidf_model = GridSearchCV(
    MultinomialNB(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
nb_tfidf_model.fit(x_tfidf_train, y_train)

print(nb_tfidf_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = nb_tfidf_model.best_estimator_
pred_y_tfidf_nb = estimator.predict(x_tfidf_test)

In [None]:
# Show the best naive-Bayes estimator selected by the TF-IDF grid search.
nb_tfidf_model.best_estimator_

In [None]:

# Per-class report, then overall accuracy, for the TF-IDF naive-Bayes predictions.
print(
    classification_report(
        y_test,
        pred_y_tfidf_nb,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
print(accuracy_score(y_test, pred_y_tfidf_nb))

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

# The estimator gets its own name so it does not overwrite the `xgb` module alias
# imported on the line above.
xgb_clf = XGBClassifier(random_state=82)

# Search space for the boosted trees. The last three entries are single-valued and
# only select GPU training.
# NOTE(review): gpu_id / 'gpu_hist' / predictor are deprecated from XGBoost 2.0 in
# favour of device='cuda' with tree_method='hist' — confirm the installed version.
xgb_param_grid = {
    'n_estimators' : [100, 200, 400, 600, 1000],
    'learning_rate' : [0.05, 0.1, 0.15, 0.2],
    'max_depth' : [4, 6, 8, 10],
    'gpu_id':[0],
    'tree_method':['gpu_hist'],
    'predictor':['gpu_predictor'],
}

# Grid search on the TF-IDF features scored by accuracy; verbose=1 is forwarded to the
# estimator's fit().
xgb_grid_tfidf = GridSearchCV(xgb_clf, param_grid = xgb_param_grid, scoring = 'accuracy').fit(x_tfidf_train, y_train, verbose=1)

print(xgb_grid_tfidf.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = xgb_grid_tfidf.best_estimator_
pred_y_tfidf_xgb = estimator.predict(x_tfidf_test)

In [None]:
# Show the best XGBoost estimator selected by the TF-IDF grid search.
xgb_grid_tfidf.best_estimator_

In [None]:

# Binary-averaged precision / recall / F1 for the TF-IDF XGBoost predictions.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test,
    pred_y_tfidf_xgb,
    average='binary',
)
print(f1_score)

# Per-class report, then overall accuracy.
print(
    classification_report(
        y_test,
        pred_y_tfidf_xgb,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
print(accuracy_score(y_test, pred_y_tfidf_xgb))

# FastText

In [None]:
from gensim.models import KeyedVectors

# Single source of truth for the embedding file: the text-format vectors that
# load_word2vec_format() reads. The path variable now names the file that is
# actually loaded and is passed to the loader instead of a second literal.
path_to_pretrained_model = "wiki.en.vec"
pretrained_model = KeyedVectors.load_word2vec_format(path_to_pretrained_model)

# Sanity check: print the embedding stored for one token.
print(pretrained_model["word"])

In [None]:

def text_to_vector(text, model):
    """Average the embeddings of the whitespace-separated tokens in ``text``.

    Args:
        text: Document to embed. Anything that is not a ``str`` (for example the
            NaN pandas yields for an empty CSV cell, or ``None``) is treated as a
            document with no tokens.
        model: Embedding lookup supporting ``word in model``, ``model[word]`` and
            a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in ``model``, or
        a zero vector of length ``model.vector_size`` when no token is found.
    """
    # Guard against non-string input, which has no .split() and would raise.
    if not isinstance(text, str):
        return np.zeros(model.vector_size)

    # Tokens missing from the model are skipped.
    word_vectors = [model[word] for word in text.split() if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Embed every training and test document with the pretrained vectors;
# each document contributes one row to the resulting array.
x_train_vectors = np.array(
    [text_to_vector(doc, pretrained_model) for doc in x_train]
)
x_test_vectors = np.array(
    [text_to_vector(doc, pretrained_model) for doc in x_test]
)

In [None]:

# LogisticRegression settings explored by the search.
hyper_parmas = {
    "max_iter": [1000, 2000, 3000],
    "C": [0.01, 0.1, 1, 10],
}

lr_fasttext_model = LogisticRegression(class_weight='balanced')

# 5-fold grid search on the averaged embedding vectors; refit=True so
# best_estimator_ can be read below.
grid_lr_fasttext_model = GridSearchCV(
    lr_fasttext_model,
    param_grid=hyper_parmas,
    cv=5,
    refit=True,
    return_train_score=True,
)
grid_lr_fasttext_model.fit(x_train_vectors, y_train)

print(grid_lr_fasttext_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = grid_lr_fasttext_model.best_estimator_
pred_y_fasttext_lr = estimator.predict(x_test_vectors)

# Prediction metrics: binary-averaged F1 first.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test,
    pred_y_fasttext_lr,
    average='binary',
)
print(f1_score)

# Per-class report.
print(
    classification_report(
        y_test,
        pred_y_fasttext_lr,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
# Prediction accuracy.
print(accuracy_score(y_test, pred_y_fasttext_lr))

In [None]:
# Train the linear SVM on the averaged embedding vectors.
# Candidate values of C:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

svm_origin = LinearSVC()

# NOTE(review): these two values are assigned but not passed to anything in this cell.
n_estimators = 10
n_jobs = 2

# 5-fold grid search, run with n_jobs=-1.
svm_fasttext_model = GridSearchCV(
    svm_origin,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=3,
)
svm_fasttext_model.fit(x_train_vectors, y_train)
print(svm_fasttext_model.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = svm_fasttext_model.best_estimator_
pred_y_fasttext_svm = estimator.predict(x_test_vectors)

In [None]:

# Per-class report, then overall accuracy, for the embedding-based linear-SVM predictions.
print(
    classification_report(
        y_test,
        pred_y_fasttext_svm,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
print(accuracy_score(y_test, pred_y_fasttext_svm))

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the embedding vectors with min-max scaling. The scaler is fitted on the
# training vectors only and then applied to the test vectors.
# NOTE(review): presumably done because MultinomialNB expects non-negative features — confirm.
scaler = MinMaxScaler()
x_train_vectors_scaled = scaler.fit_transform(x_train_vectors)
x_test_vectors_scaled = scaler.transform(x_test_vectors)

# Smoothing values (alpha) explored for multinomial naive Bayes.
params = {
    'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
}

# 5-fold grid search on the scaled vectors, run with n_jobs=-1.
nb_fasttext_model = GridSearchCV(
    MultinomialNB(),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=3,
)
nb_fasttext_model.fit(x_train_vectors_scaled, y_train)

print(nb_fasttext_model.best_score_)

# Predict the scaled test set with the best estimator found by the search.
estimator = nb_fasttext_model.best_estimator_
pred_y_fasttext_nb = estimator.predict(x_test_vectors_scaled)

In [None]:
# Prediction report (per class).
print(
    classification_report(
        y_test,
        pred_y_fasttext_nb,
        target_names=['neg', 'pos'],
        digits=4,
    )
)

# Prediction accuracy.
print(accuracy_score(y_test, pred_y_fasttext_nb))

In [None]:
# The estimator gets its own name instead of `xgb`, which the notebook also uses as
# the alias of the xgboost module.
xgb_clf = XGBClassifier(random_state=82)

# Search space for the boosted trees. The last three entries are single-valued and
# only select GPU training.
# NOTE(review): gpu_id / 'gpu_hist' / predictor are deprecated from XGBoost 2.0 in
# favour of device='cuda' with tree_method='hist' — confirm the installed version.
xgb_param_grid = {
    'n_estimators' : [100, 200, 400, 600, 1000],
    'learning_rate' : [0.05, 0.1, 0.15, 0.2],
    'max_depth' : [4, 6, 8, 10],
    'gpu_id':[0],
    'tree_method':['gpu_hist'],
    'predictor':['gpu_predictor'],
}

# Grid search on the averaged embedding vectors scored by accuracy; verbose=1 is
# forwarded to the estimator's fit().
xgb_grid_fasttext = GridSearchCV(xgb_clf, param_grid = xgb_param_grid, scoring = 'accuracy').fit(x_train_vectors, y_train, verbose=1)

print(xgb_grid_fasttext.best_score_)

# Predict the test set with the best estimator found by the search.
estimator = xgb_grid_fasttext.best_estimator_
pred_y_fasttext_xgb = estimator.predict(x_test_vectors)

In [None]:
# Binary-averaged precision / recall / F1 for the embedding-based XGBoost predictions.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test,
    pred_y_fasttext_xgb,
    average='binary',
)
print(f1_score)

# Per-class report, then overall accuracy.
print(
    classification_report(
        y_test,
        pred_y_fasttext_xgb,
        target_names=['neg', 'pos'],
        digits=4,
    )
)
print(accuracy_score(y_test, pred_y_fasttext_xgb))