# SWE3011_41 Task1

**Supervised Text Classification using traditional machine learning methods**

1. Complete all the functions given.
2. Conduct various experiments including hyper-parameter tuning, cross validation, etc.
3. Write a report on the analysis of experiment results.  


**0. Installation**

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 521.2/521.2 kB 6.7 MB/s eta 0:00:00
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 115.3/115.3 kB 12.0 MB/s eta 0:00:00
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 14.8 MB/s eta 0:00:00
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


**1. Load Dataset**


Evaluation should be done using the **provided test dataset**.

In [None]:
from datasets import load_dataset

# Training data: the "train" split of the GLUE "sst2" configuration.
train_ds = load_dataset("glue", "sst2", split="train")

# Evaluation should be done using test_ds.
# The local CSV is loaded without a `split` argument, so the rows are picked
# out of the returned object via its 'train' key (the loader output below
# reports "Generating train split"); they are nevertheless the test data.
test_ds = load_dataset("csv", data_files="./test_dataset.csv")['train']

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

**2. Preparing Dataset**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def transform_data(X_train, X_test):
    """Convert raw sentences into TF-IDF feature matrices.

    The vocabulary and IDF weights are learned from the training sentences
    only. The test sentences are then projected onto that fitted vocabulary,
    so no statistics of the evaluation data leak into the features and the
    classifiers see every term that occurs in the training set.

    Args:
        X_train: Training sentences (iterable of raw strings).
        X_test: Test sentences (iterable of raw strings).

    Returns:
        Tuple ``(X_train_tfidf, X_test_tfidf, vectorizer)``: the TF-IDF
        matrices for the training and test sentences, and the fitted
        ``TfidfVectorizer`` that produced them.
    """
    vectorizer = TfidfVectorizer()
    # Fit on the training data, then reuse the fitted state for the test data.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    return X_train_tfidf, X_test_tfidf, vectorizer

In [None]:
# Pull the raw sentences and their labels out of both datasets.
X_train, y_train = train_ds['sentence'], train_ds['label']
X_test, y_test = test_ds['sentence'], test_ds['label']
# Vectorize both sentence sets; the vectorizer is kept alongside the matrices.
X_train_tfidf, X_test_tfidf, vectorizer = transform_data(X_train, X_test)

**3. Train**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [None]:
def logistic_regression(X_train_tfidf, y_train):
    """Select and fit a logistic-regression classifier by cross-validation.

    Six configurations are compared: three iteration budgets, each with an
    L1-penalised ``saga`` solver and an L2-penalised ``lbfgs`` solver. They
    are scored with cross-validation on the training data only, so the
    function no longer depends on notebook globals and the held-out test set
    plays no part in model selection.

    Args:
        X_train_tfidf: TF-IDF feature matrix of the training sentences.
        y_train: Labels aligned with the rows of ``X_train_tfidf``.

    Returns:
        Tuple ``(clf, results)``. ``clf`` is the best-scoring
        ``LogisticRegression`` refit on the whole training set. ``results``
        is a list with the mean cross-validated accuracy of each candidate,
        in the order the candidates are listed below.
    """
    # (max_iter, solver, penalty) for every candidate, in reporting order.
    candidates = [
        (1000, 'saga', 'l1'),
        (1000, 'lbfgs', 'l2'),
        (5000, 'saga', 'l1'),
        (5000, 'lbfgs', 'l2'),
        (10000, 'saga', 'l1'),
        (10000, 'lbfgs', 'l2'),
    ]
    # One single-point grid per candidate keeps solver and penalty paired and
    # makes the order of cv_results_ follow the order of `candidates`.
    param_grid = [
        {'max_iter': [max_iter], 'solver': [solver], 'penalty': [penalty]}
        for max_iter, solver, penalty in candidates
    ]

    search = GridSearchCV(LogisticRegression(), param_grid, scoring='accuracy')
    search.fit(X_train_tfidf, y_train)

    results = search.cv_results_['mean_test_score'].tolist()
    return search.best_estimator_, results

In [None]:
def random_forest(X_train_tfidf, y_train):
    """Fit a random forest classifier on the TF-IDF training features.

    The forest uses 500 trees, a fixed ``random_state`` of 0 and
    ``verbose=1`` so that progress is printed while fitting and predicting.

    Args:
        X_train_tfidf: TF-IDF feature matrix of the training sentences.
        y_train: Labels aligned with the rows of ``X_train_tfidf``.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(
        n_estimators=500,
        random_state=0,
        verbose=1,
    )
    # fit() returns the estimator itself, so it can be returned directly.
    return forest.fit(X_train_tfidf, y_train)

In [None]:
def naive_bayes_classifier(X_train_tfidf, y_train):
    """Tune and fit a multinomial Naive Bayes classifier.

    The smoothing strength ``alpha`` is chosen by cross-validated grid search
    on the training data. The grid includes the library default
    (``alpha=1.0``), so the untuned model is one of the candidates and the
    held-out test set is not needed for the comparison.

    Args:
        X_train_tfidf: TF-IDF feature matrix of the training sentences.
        y_train: Labels aligned with the rows of ``X_train_tfidf``.

    Returns:
        The ``MultinomialNB`` with the best cross-validated score, refit on
        the whole training set.
    """
    hyper_parameters = {
        # Additive (Laplace/Lidstone) smoothing strengths to try.
        # alpha=0.0 (no smoothing) is deliberately left out: scikit-learn
        # documents that values this close to zero may cause numerical errors.
        'alpha': [
            0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005,
            0.01, 0.05, 0.1, 0.5, 1.0,
        ],
    }

    search = GridSearchCV(MultinomialNB(), hyper_parameters)
    search.fit(X_train_tfidf, y_train)

    return search.best_estimator_

In [None]:
# logistic_regression returns a (model, scores) pair. Unpack it so that `clf`
# is the classifier itself (evaluate_model calls clf.predict) and the
# per-candidate scores stay available for the report.
clf, lr_results = logistic_regression(X_train_tfidf, y_train)

In [None]:
# Train the random forest on the TF-IDF training features.
clf_rf = random_forest(X_train_tfidf, y_train)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  1.1min
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  4.7min
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed: 10.4min


In [None]:
# Train the Naive Bayes classifier on the TF-IDF training features.
clf_nb = naive_bayes_classifier(X_train_tfidf, y_train)



**4. Evaluation**

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def evaluate_model(clf, X_test_tfidf, y_test):
    """Print accuracy and a classification report for a fitted classifier.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X_test_tfidf: Feature matrix to predict on.
        y_test: True labels aligned with the rows of ``X_test_tfidf``.

    Returns:
        None. The metrics are written to standard output.
    """
    predictions = clf.predict(X_test_tfidf)

    # Accuracy is shown rounded to two decimals, followed by the full report.
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.2f}")

    print("Classification Report:")
    report = classification_report(y_test, predictions)
    print(report)

In [None]:
def evaluate_model_nlp(clf, y_pred, y_test):
    """Print accuracy and a classification report for existing predictions.

    Unlike ``evaluate_model``, no prediction is made here: the caller passes
    the predicted labels directly.

    Args:
        clf: Unused by this function; kept in the signature for callers.
        y_pred: Predicted labels.
        y_test: True labels aligned with ``y_pred``.

    Returns:
        None. The metrics are written to standard output.
    """
    # Accuracy is shown rounded to two decimals, followed by the full report.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    print("Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

In [None]:
# Test-set metrics for the logistic regression classifier.
evaluate_model(clf, X_test_tfidf, y_test)

Accuracy: 0.76
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.70      0.76        54
           1       0.70      0.83      0.76        46

    accuracy                           0.76       100
   macro avg       0.76      0.76      0.76       100
weighted avg       0.77      0.76      0.76       100



In [None]:
# Test-set metrics for the random forest classifier.
evaluate_model(clf_rf, X_test_tfidf, y_test)

Accuracy: 0.73
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.67      0.73        54
           1       0.67      0.80      0.73        46

    accuracy                           0.73       100
   macro avg       0.74      0.74      0.73       100
weighted avg       0.74      0.73      0.73       100



[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.1s


In [None]:
# Test-set metrics for the Naive Bayes classifier.
evaluate_model(clf_nb, X_test_tfidf, y_test)

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.74      0.76        54
           1       0.71      0.76      0.74        46

    accuracy                           0.75       100
   macro avg       0.75      0.75      0.75       100
weighted avg       0.75      0.75      0.75       100

