In [1]:
import sys
import os
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import joblib

sys.path.append(os.path.abspath(os.path.join(os.path.dirname('textpreprocessor.py'), '..')))

OUTPUT_DIR = "Output/data_sample10000/"
OUTPUT_DIR_FULL = "Output/data_full"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [2]:
# Importing necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from textpreprocessor import TextPreprocessor

# Initialize the Text Pre Processor class
processor = TextPreprocessor()

# Load data
df_train, df_test = processor.load_data()
# df_train_f= processor.filter_by_length_of_sentence(df_train)
# df_test_f = processor.filter_by_length_of_sentence(df_test)
df_train_f= processor.limit_length_of_sentence(df_train, 100)

# Preprocess data
df_train = processor.preprocess(df_train)
df_test = processor.preprocess(df_test)
df_train = processor.sampling_data(df_train)
df_test = processor.sampling_data(df_test, 2000)

# df_train_f = processor.preprocess(df_train_f)
# print('process:\n',df_train_f.head())
df_train_f = processor.sampling_data(df_train_f)

# Split data
X_train, y_train= processor.split_data(df_train)
X_test, y_test = processor.split_data(df_test)

X_train_f = df_train_f['review_l']
y_train_f = df_train_f['polarity']
print(len(y_train_f))

X_train_tfidf, X_test_tfidf = processor.vectorize_text(X_train, X_test)
X_train_tfidf_f, X_test_tfidf_f = processor.vectorize_text(X_train_f, X_test)


joblib.dump(X_train_tfidf, os.path.join(OUTPUT_DIR, 'X_train_tfidf.pkl'))
joblib.dump(X_test_tfidf, os.path.join(OUTPUT_DIR, 'X_test_tfidf.pkl'))
joblib.dump(y_train, os.path.join(OUTPUT_DIR, 'y_train.pkl'))
joblib.dump(y_test, os.path.join(OUTPUT_DIR, 'y_test.pkl'))


joblib.dump(X_train_tfidf_f, os.path.join(OUTPUT_DIR, 'X_train_tfidf_f.pkl'))
joblib.dump(X_test_tfidf_f, os.path.join(OUTPUT_DIR, 'X_test_tfidf_f.pkl'))
joblib.dump(y_train_f, os.path.join(OUTPUT_DIR, 'y_train_f.pkl'))
# joblib.dump(y_test_f, os.path.join(OUTPUT_DIR, 'y_test_f.pkl'))

print("Data saved successfully!")

2024-09-24 13:22:20.807358: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-24 13:22:20.815852: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-24 13:22:20.818391: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-24 13:22:20.825032: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


10000
Data saved successfully!


In [30]:
# 데이터를 불러오기
X_train_tfidf = joblib.load(os.path.join(OUTPUT_DIR, 'X_train_tfidf.pkl'))
X_test_tfidf = joblib.load(os.path.join(OUTPUT_DIR, 'X_test_tfidf.pkl'))
y_train = joblib.load(os.path.join(OUTPUT_DIR, 'y_train.pkl'))
y_test = joblib.load(os.path.join(OUTPUT_DIR, 'y_test.pkl'))

X_train_tfidf_f = joblib.load(os.path.join(OUTPUT_DIR, 'X_train_tfidf_f.pkl'))
X_test_tfidf_f = joblib.load(os.path.join(OUTPUT_DIR, 'X_test_tfidf_f.pkl'))
y_train_f = joblib.load(os.path.join(OUTPUT_DIR, 'y_train_f.pkl'))
y_test_f = joblib.load(os.path.join(OUTPUT_DIR, 'y_test_f.pkl'))

print("Data loaded successfully!")

Data loaded successfully!


In [6]:
# print(f'Length of Train: {len(df_train)} / Filtered: {len(df_train_f)}')

In [3]:
df_train['word_count'] = df_train['review'].apply(lambda x: len(x.split()))
max_word_count = df_train['word_count'].max()

df_train_f['word_count'] = X_train_f.apply(lambda x: len(x.split()))
max_word_count_f = df_train_f['word_count'].max()

# 최대 단어 수 출력
print(f"Maximum number of words in the 'review' column: {max_word_count} / Filtered: {max_word_count_f}")

Maximum number of words in the 'review' column: 195 / Filtered: 100


In [None]:
# # 메모리 사용량 계산
# memory_usage = df_train.memory_usage(deep=True).sum()

# # 메모리 사용량을 MB 단위로 변환
# memory_usage_MB = memory_usage / (1024 ** 2)

# # 메모리 사용량 계산
# memory_usage_f = df_train_f.memory_usage(deep=True).sum()

# # 메모리 사용량을 MB 단위로 변환
# memory_usage_MB_f = memory_usage_f / (1024 ** 2)

# # 메모리 사용량 출력
# print(f"Memory usage of df_train: {memory_usage_MB:.2f}MB / Filtered: {memory_usage_MB_f:.2f}MB")

Base Model Accuracy: 84.23%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84      4987
           1       0.84      0.84      0.84      5013

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

In [6]:
X_train_f.shape

(10000,)

In [7]:
# Model building
# Using Logistic Regression for sentiment classification
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 84.85%
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.84      0.85      1023
           1       0.83      0.86      0.85       977

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



Base Model (Filtered by length of sentence DataSet) Accuracy: 84.96%
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.84      4853
           1       0.85      0.86      0.85      5147

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

In [8]:
# Model building
# Using Logistic Regression for sentiment classification
model = LogisticRegression()
model.fit(X_train_tfidf_f, y_train)

y_pred = model.predict(X_test_tfidf_f)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 84.75%
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.83      0.85      1023
           1       0.83      0.86      0.85       977

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



: 

In [43]:
param_dist= {
    'C':np.logspace(-1,3,30),
    'solver':['liblinear', 'saga','lbgfs'],
    'max_iter':[200, 300],
    'penalty':['l1' ,'l2']
}

random_search = RandomizedSearchCV(model_f, param_distributions=param_dist, n_iter=300, cv=5, verbose=1, random_state=42)

RandomSearchCV Model Accuracy: 85.04%
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.85      4853
           1       0.85      0.86      0.86      5147

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

In [44]:
random_search.fit(X_train_tfidf_f, y_train_f)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [13]:
# Making predictions
y_pred = random_search.predict(X_test_tfidf)

In [45]:
# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Hypertuning Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Hypertuning Model Accuracy: 84.23%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.84      0.84      4987
           1       0.84      0.84      0.84      5013

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [46]:
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.0235896477251556}


In [47]:
model_f = LogisticRegression(**random_search.best_params_)

model_f.fit(X_train_tfidf_f, y_train_f)

In [50]:
y_pred = model.predict(X_test_tfidf)

RandomSearchCV Model Accuracy: 85.04%
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.85      4853
           1       0.85      0.86      0.86      5147

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

In [51]:
# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Hypertuning Model Accuracy: {accuracy_f * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test_f, y_pred_f))

Hypertuning Model Accuracy: 84.23%
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.60      0.54      4853
           1       0.51      0.40      0.45      5147

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.49     10000
weighted avg       0.50      0.50      0.49     10000



In [19]:
import joblib
import os

# 저장할 디렉토리 설정 (필요 시 생성)
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Best parameters 저장
best_params_path = os.path.join(OUTPUT_DIR, 'best_params.pkl')
joblib.dump(random_search.best_params_, best_params_path)

print(f"Best parameters saved at: {best_params_path}")

# 최적의 파라미터를 사용하여 모델 생성
model_f = LogisticRegression(**random_search.best_params_)

# 모델 저장
model_path = os.path.join(OUTPUT_DIR, 'best_logistic_regression_model.pkl')
joblib.dump(model_f, model_path)

print(f"Best Logistic Regression model saved at: {model_path}")

# Best parameters 불러오기
best_params = joblib.load(best_params_path)
print(f"Loaded Best Parameters: {best_params}")

# Best model 불러오기
best_model = joblib.load(model_path)
print("Loaded Best Logistic Regression Model.")

Best parameters saved at: Output/data_sample10000/best_params.pkl
Best Logistic Regression model saved at: Output/data_sample10000/best_logistic_regression_model.pkl
Loaded Best Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.0235896477251556}
Loaded Best Logistic Regression Model.


Best parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.0235896477251556}

In [52]:
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.0235896477251556}


In [53]:
from sklearn.model_selection import GridSearchCV
# RandomizedSearchCV로 찾은 최적 범위를 기반으로 GridSearch 설정
param_grid = {
    'C': np.linspace(random_search.best_params_['C'] - 1, random_search.best_params_['C'] + 1, 50),  # 좁은 범위로 설정
    'solver': [random_search.best_params_['solver']],  # 최적의 solver 선택
    'max_iter': [200, 300],  # 더 정밀하게
    'penalty': [random_search.best_params_['penalty']]  # 최적 penalty 선택
}

grid_search = GridSearchCV(
    model_f,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_tfidf_f, y_train_f)

print(f"Best Parameters from GridSearchCV: {grid_search.best_params_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters from GridSearchCV: {'C': 1.9215488313986249, 'max_iter': 200, 'penalty': 'l2', 'solver': 'liblinear'}


Base Model Whole DataSet Accuracy: 88.07%
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88    136238
           1       0.88      0.89      0.88    144076

    accuracy                           0.88    280314
   macro avg       0.88      0.88      0.88    280314
weighted avg       0.88      0.88      0.88    280314

In [26]:
# 데이터를 불러오기
X_train_tfidf = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'X_train_tfidf.pkl'))
X_test_tfidf = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'X_test_tfidf.pkl'))
y_train = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'y_train.pkl'))
y_test = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'y_test.pkl'))

X_train_tfidf_f = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'X_train_tfidf_f.pkl'))
X_test_tfidf_f = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'X_test_tfidf_f.pkl'))
y_train_f = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'y_train_f.pkl'))
y_test_f = joblib.load(os.path.join(OUTPUT_DIR_FULL, 'y_test_f.pkl'))

print("Full DataSet loaded successfully!")

Full DataSet loaded successfully!


In [59]:
model_f = LogisticRegression()
model_f.fit(X_train_tfidf_f, y_train_f)

y_pred = model_f.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Hypertuning Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Hypertuning Model Accuracy: 53.76%
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.62      0.57      4987
           1       0.55      0.45      0.50      5013

    accuracy                           0.54     10000
   macro avg       0.54      0.54      0.53     10000
weighted avg       0.54      0.54      0.53     10000



GridSearchCV Model Whole DataSet Accuracy: 88.24%
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.88      0.88    136238
           1       0.88      0.89      0.89    144076
    accuracy          -         -      0.88    280314
   macro avg       0.88      0.88      0.88    280314
weighted avg       0.88      0.88      0.88    280314


In [57]:
model = LogisticRegression(**grid_search.best_params_)
model.fit(X_train_tfidf_f, y_train_f)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Hypertuning Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Hypertuning Model Accuracy: 53.74%
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.64      0.58      4987
           1       0.55      0.43      0.48      5013

    accuracy                           0.54     10000
   macro avg       0.54      0.54      0.53     10000
weighted avg       0.54      0.54      0.53     10000



In [32]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model_f.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Hypertuning Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Hypertuning Model Accuracy: 54.45%
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.73      0.61      4987
           1       0.57      0.36      0.44      5013

    accuracy                           0.54     10000
   macro avg       0.55      0.54      0.53     10000
weighted avg       0.55      0.54      0.53     10000



In [33]:
model_f = LogisticRegression(**grid_search.best_params_)
model_f.fit(X_train_tfidf, y_train)

y_pred_f = model_f.predict(X_test_tfidf_f)

accuracy_f = accuracy_score(y_test_f, y_pred_f)
print(f"Hypertuning Model Accuracy: {accuracy_f * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test_f, y_pred_f))

Hypertuning Model Accuracy: 51.49%
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.56      0.53    136238
           1       0.53      0.47      0.50    144076

    accuracy                           0.51    280314
   macro avg       0.52      0.52      0.51    280314
weighted avg       0.52      0.51      0.51    280314

