In [17]:
# Importing necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import os
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.exceptions import ConvergenceWarning

OUTPUT_DIR = "../Output/textpreprocess/240926/sample_step2_10000"

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [18]:
X_train_tfidf = joblib.load(os.path.join(OUTPUT_DIR, 'X_train_tfidf.pkl'))
X_test_tfidf = joblib.load(os.path.join(OUTPUT_DIR, 'X_test_tfidf.pkl'))
y_train = joblib.load(os.path.join(OUTPUT_DIR, 'y_train.pkl'))
y_test = joblib.load(os.path.join(OUTPUT_DIR, 'y_test.pkl'))

print("Data loaded successfully!")

Data loaded successfully!


In [19]:
# Model building
# Using Logistic Regression for sentiment classification
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 83.38%
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.82      0.84       930
           1       0.82      0.84      0.83       893

    accuracy                           0.83      1823
   macro avg       0.83      0.83      0.83      1823
weighted avg       0.83      0.83      0.83      1823



In [20]:
train_accuracy = model.score(X_train_tfidf, y_train)
test_accuracy = model.score(X_test_tfidf, y_test)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.9038778424695155
Test Accuracy: 0.8337904552934723


In [21]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_train_tfidf, y_train, cv=5)
print(f"Cross-validation mean score: {np.mean(scores)}, Standard deviation: {np.std(scores)}")

Cross-validation mean score: 0.8186318349415547, Standard deviation: 0.003877097237431504


In [22]:
# param_dist= {
#     'C':np.logspace(-3,3,50),
#     'solver':['saga'],
#     # 'solver':['liblinear', 'saga','lbfgs'],
#     # 'solver':['liblinear', 'saga','lbfgs'],
#     # 'solver':['liblinear', 'saga','lbfgs'],
#     'max_iter':[1000],
#     'penalty':['l1', 'l2']
# }

# random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=100, cv=3, verbose=1, random_state=42, n_jobs=-1)

# random_search.fit(X_train_tfidf, y_train)

In [23]:
param_dist= {
    'C':np.logspace(-3,3,100),
    'solver':['liblinear', 'saga','lbfgs'],
    # 'solver':['liblinear', 'saga','lbfgs'],
    # 'solver':['liblinear', 'saga','lbfgs'],
    # 'solver':['liblinear', 'saga','lbfgs'],
    'max_iter':[100, 200, 300],
    'penalty':['l1', 'l2']
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=200, cv=3, verbose=1, random_state=42, n_jobs=-1)

random_search.fit(X_train_tfidf, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


93 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
93 fits failed with the following error:
Traceback (most recent call last):
  File "/home/woong/AI/Dissertation/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/woong/AI/Dissertation/.venv/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/woong/AI/Dissertation/.venv/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/woong/AI/Dissertation/.venv/lib/p

In [24]:
# Making predictions
y_pred = random_search.predict(X_test_tfidf)

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Randomized Search Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

print("Best parameters found: ", random_search.best_params_)

Randomized Search Model Accuracy: 82.83%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.83      0.83       930
           1       0.82      0.83      0.83       893

    accuracy                           0.83      1823
   macro avg       0.83      0.83      0.83      1823
weighted avg       0.83      0.83      0.83      1823

Best parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.848035868435802}


Randomized Search Model Accuracy: 81.59%
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.81      0.82       422
           1       0.80      0.82      0.81       398

    accuracy                           0.82       820
   macro avg       0.82      0.82      0.82       820
weighted avg       0.82      0.82      0.82       820

Best parameters found:  {'solver': 'sag', 'penalty': 'l2', 'max_iter': 3000, 'C': 1.2648552168552958}

Randomized Search Model Accuracy: 81.95%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.81      0.82       422
           1       0.80      0.83      0.82       398

    accuracy                           0.82       820
   macro avg       0.82      0.82      0.82       820
weighted avg       0.82      0.82      0.82       820

Best parameters found:  {'solver': 'saga', 'penalty': 'l2', 'max_iter': 1000, 'C': 0.49417133613238384}

In [25]:
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.848035868435802}


In [26]:
train_accuracy = random_search.score(X_train_tfidf, y_train)
test_accuracy = random_search.score(X_test_tfidf, y_test)

print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Train Accuracy: 0.9383719652861694
Test Accuracy: 0.8283049917718047


In [27]:
# from sklearn.model_selection import cross_val_score

# scores = cross_val_score(random_search, X_train_tfidf, y_train, cv=5)
# print(f"Cross-validation mean score: {np.mean(scores)}, Standard deviation: {np.std(scores)}")

In [28]:
import joblib
import os

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

best_params_path = os.path.join(OUTPUT_DIR, 'best_params_random.pkl')
joblib.dump(random_search.best_params_, best_params_path)

print(f"Best parameters saved at: {best_params_path}")

model = LogisticRegression(**random_search.best_params_)

model_path = os.path.join(OUTPUT_DIR, 'best_logistic_regression_model_random.pkl')
joblib.dump(model, model_path)

print(f"Best Logistic Regression model saved at: {model_path}")

best_params = joblib.load(best_params_path)
print(f"Loaded Best Parameters: {best_params}")

best_model = joblib.load(model_path)
print("Loaded Best Logistic Regression Model.")

Best parameters saved at: ../Output/textpreprocess/240926/sample_step2_10000/best_params_random.pkl
Best Logistic Regression model saved at: ../Output/textpreprocess/240926/sample_step2_10000/best_logistic_regression_model_random.pkl
Loaded Best Parameters: {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.848035868435802}
Loaded Best Logistic Regression Model.


Best parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.0235896477251556}

In [29]:
print("Best Randomized Search parameters found: ", best_params)

Best Randomized Search parameters found:  {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 2.848035868435802}


In [30]:
from sklearn.model_selection import GridSearchCV

model = LogisticRegression()

#{'solver': 'saga', 'penalty': 'l2', 'max_iter': 2000, 'C': 2.1209508879201926}
param_grid = {
    'C': np.linspace(random_search.best_params_['C'] - 0.5, random_search.best_params_['C'] + 0.5, 1000),
    'solver': [random_search.best_params_['solver']], 
    'max_iter': [random_search.best_params_['max_iter']-100, random_search.best_params_['max_iter'], random_search.best_params_['max_iter']+100], 
    'penalty': [random_search.best_params_['penalty']]  
}

grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train_tfidf, y_train)

print(f"Best Parameters from GridSearchCV: {grid_search.best_params_}")

Fitting 3 folds for each of 3000 candidates, totalling 9000 fits




In [52]:
X_train_tfidf = joblib.load(os.path.join(OUTPUT_DIR, 'X_train_tfidf.pkl'))
X_test_tfidf = joblib.load(os.path.join(OUTPUT_DIR, 'X_test_tfidf.pkl'))
y_train = joblib.load(os.path.join(OUTPUT_DIR, 'y_train.pkl'))
y_test = joblib.load(os.path.join(OUTPUT_DIR, 'y_test.pkl'))

print("Full DataSet loaded successfully!")

Full DataSet loaded successfully!


In [53]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 83.01%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.83      0.83       474
           1       0.82      0.83      0.83       444

    accuracy                           0.83       918
   macro avg       0.83      0.83      0.83       918
weighted avg       0.83      0.83      0.83       918



In [54]:
model = LogisticRegression(**grid_search.best_params_)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f"Grid Search Model Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred))

Grid Search Model Accuracy: 82.68%
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83       474
           1       0.81      0.83      0.82       444

    accuracy                           0.83       918
   macro avg       0.83      0.83      0.83       918
weighted avg       0.83      0.83      0.83       918



In [55]:
best_params_path = os.path.join(OUTPUT_DIR, 'best_params_grid.pkl')
joblib.dump(grid_search.best_params_, best_params_path)

print(f"Best parameters saved at: {best_params_path}")

model = LogisticRegression(**grid_search.best_params_)

model_path = os.path.join(OUTPUT_DIR, 'best_logistic_regression_model_grid.pkl')
joblib.dump(model, model_path)

print(f"Best Logistic Regression model saved at: {model_path}")

Best parameters saved at: ../Output/textpreprocess/240926/sample_step2_5000/best_params_grid.pkl
Best Logistic Regression model saved at: ../Output/textpreprocess/240926/sample_step2_5000/best_logistic_regression_model_grid.pkl
