# Classify Candidate Pairs of Acronyms and Expansions (Assignment 2)

## 1. Import Library

In [1]:
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler


## 2. Prepare Dataset

### 2.1 extract dataset

In [6]:
with tarfile.open("dataacro.tar.gz", "r:gz") as tar:
    tar.extractall("dataacro")

### 2.2 Load dataset

#### 2.2.1 Define extract dataset function

In [2]:
def extract_feature(list_doc):
    X=[]
    y=[]
    for i in range(len(list_doc)):
        lines_feature=list_doc[i].split(" ")[-8:len(list_doc[i])]
        line_label=list_doc[i].split(" ")[-9]
        list_features=[float(line.strip().split(":")[1]) for line in lines_feature]
        list_label=int(line_label) 
        X.append(list_features)
        y.append(list_label)
    
    return X,y  

def extract_text(list_doc):
    X=[]
    y=[]
    for line in list_doc:
        text_feature=line.split(" ")[:-9]
        text_feature=" ".join(text_feature)
        line_label=line.split(" ")[-9]
        
        X.append(text_feature)
        y.append(line_label)
    
    return X

#### 2.2.2 Load and Preprocessing Data

In [4]:


with open("dataacro/trainingset.txt", "r") as file:
    training_lines = file.readlines()

with open("dataacro/testingset.txt", "r") as file:
    testing_lines = file.readlines()

  


X_train,y_train=extract_feature(training_lines)
X_test,y_test=extract_feature(testing_lines)   

X_train=np.array(X_train)
y_train=np.array(y_train)


X_test=np.array(X_test)
y_test=np.array(y_test)

#replace -1 with 0
y_train = np.where(y_train == -1, 0, y_train)
y_test = np.where(y_test == -1, 0, y_test)



X_train_text= extract_text(training_lines)
X_train_text=np.array(X_train_text)
X_train_text=X_train_text.reshape(-1,1)
X_test_text=extract_text(testing_lines)
X_test_text=np.array(X_test_text)
X_test_text=X_test_text.reshape(-1,1)
#check data length
print("Training data length: ", len(X_train))
print("Training data text length: ", len(X_train_text))
print("Training label length: ", len(y_train))
print("Testing data length: ", len(X_test))
print("Testing data text length: ", len(X_test_text))
print("Testing label length: ", len(y_test))


Training data length:  4000
Training data text length:  4000
Training label length:  4000
Testing data length:  1099
Testing data text length:  1099
Testing label length:  1099


#### 2.2.3 Describe Data

In [20]:
col_name=['fitur '+str(i+1) for i in range(X_train.shape[1])]
df_train=pd.concat([pd.DataFrame(X_train,columns=col_name),pd.DataFrame(y_train,columns=['label'])],axis=1)

print(df_train.head().to_markdown())

|    |   fitur 1 |   fitur 2 |   fitur 3 |   fitur 4 |   fitur 5 |   fitur 6 |   fitur 7 |   fitur 8 |   label |
|---:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|----------:|--------:|
|  0 |  0.918296 |  1        | -0.666667 |       0   |      1    |  0.5      |         0 | 0.39309   |       0 |
|  1 |  1        |  0.5      | -2        |       0   |      0.75 |  0        |         0 | 0.0357143 |       0 |
|  2 |  0.970951 |  1        | -1        |       0.5 |      1    |  0.333333 |         0 | 0.400612  |       0 |
|  3 |  1        |  0.75     | -2        |       0   |      1    |  1        |         1 | 0.392857  |       0 |
|  4 |  0.970951 |  0.666667 | -2.5      |       0   |      1    |  0        |         0 | 0.0196596 |       0 |


In [21]:
col_name=['fitur '+str(i+1) for i in range(X_train_text.shape[1])]
df_train_text=pd.concat([pd.DataFrame(X_train_text,columns=col_name),pd.DataFrame(y_train,columns=['label'])],axis=1)

print(df_train_text.head().to_markdown())

|    | fitur 1                          |   label |
|---:|:---------------------------------|--------:|
|  0 | BUMD=>Usaha Milik                |       0 |
|  1 | TNI=>meminjam senjata dari oknum |       0 |
|  2 | PKI=>Panitia Pengawas            |       0 |
|  3 | MA=>putusan Mahkamah             |       0 |
|  4 | TI=>com Mati body                |       0 |


## 3. Build Model

In [14]:
eval_model_df_training=pd.DataFrame
eval_model_df=pd.DataFrame()

def input_model_result(df,y_true,y_pred,model_name):
    cm=confusion_matrix(y_true,y_pred)
    pre=precision_score(y_true,y_pred)
    rec=recall_score(y_true,y_pred)
    f1=f1_score(y_true,y_pred)
    
    data={
            'Model':model_name,
            "Confusion Matrix": [cm.tolist()],  # Convert to list to avoid issues
            "Precision": [pre],
            "Recall": [rec],
            "F1-Score": [f1]
        }
    if df.empty:
        df = pd.DataFrame([data])
    else:

        if model_name in df['Model'].values:
            print("Model already exists in the dataframe")
        else:
            df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
        
        print("added new result")
    return df
    

### 3.1 SVM Classifier

In [None]:
svm = SVC(random_state=0)

x = [100.0,500.0]
y = [0.1,0.2,0.3]
z = [2,3S]    
parameters=[#{'C': x,'kernel': ['linear']},
            #{'C': x,'kernel': ['rbf'],'gamma': y} ,
            {'C': x,'kernel': ['poly'],'gamma': y,'degree': z}
           ]
grid=GridSearchCV(estimator = svm,
                        param_grid = parameters,
                        scoring='f1',
                        cv=10,
                        n_jobs=-1)
grid=grid.fit(X_train,y_train)
svm= grid.best_estimator_
best_params=grid.best_params_
best_score=grid.best_score_
print(f"The best parameters are {grid.best_params_} with" +
          f"a score of {grid.best_score_:.2f}")

The best parameters are {'C': 500.0, 'degree': 2, 'gamma': 0.2, 'kernel': 'poly'} witha score of 0.99


The best parameters are {'C': 500.0, 'degree': 2, 'gamma': 0.2, 'kernel': 'poly'} witha score of 0.99

In [17]:
y_pred= svm.predict(X_test)
eval_model_df = input_model_result(eval_model_df, y_test, y_pred,model_name='SVM-Polynomial')


In [18]:
eval_model_df

Unnamed: 0,Model,Confusion Matrix,Precision,Recall,F1-Score
0,SVM-Polynomial,"[[[498, 2], [29, 570]]]",[0.9965034965034965],[0.9515859766277128],[0.9735269000853971]


### 3.2 KNN Classifier 

In [26]:
## build model
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(2, 10)}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(knn, param_grid=param_grid,scoring="recall",cv=cv)
grid.fit(X_train, y_train)

print(f"The best parameters are {grid.best_params_} with" +
          f"a score of {grid.best_score_:.2f}")

knn = grid.best_estimator_

knn.fit(X_train, y_train)


The best parameters are {'n_neighbors': np.int64(9)} witha score of 0.86


In [29]:
y_pred = knn.predict(X_train)
y_true = y_train
eval_model_df = input_model_result(eval_model_df, y_true, y_pred,model_name='knn')
report=classification_report(y_true,y_pred)

Model already exists in the dataframe
added new result


### 3.3 Naive Bayes Classifier 

In [34]:
nb=GaussianNB()
nb.fit(X_train,y_train)


In [None]:
y_pred=nb.predict(X_train)
y_true=y_train

eval_model_df = input_model_result(eval_model_df, y_true, y_pred,model_name='Naive Bayes')


              precision    recall  f1-score   support

          -1       0.96      0.98      0.97      2000
           1       0.98      0.96      0.97      2000

    accuracy                           0.97      4000
   macro avg       0.97      0.97      0.97      4000
weighted avg       0.97      0.97      0.97      4000



### 3.4 Decision Tree

In [38]:
dt=tree.DecisionTreeClassifier()
dt.fit(X_train,y_train)

In [39]:
y_pred=dt.predict(X_test)
y_true=y_test
print(classification_report(y_true,y_pred))

eval_model_df = input_model_result(eval_model_df, y_true, y_pred,model_name='Decision Tree')

              precision    recall  f1-score   support

          -1       0.90      0.99      0.95       500
           1       0.99      0.91      0.95       599

    accuracy                           0.95      1099
   macro avg       0.95      0.95      0.95      1099
weighted avg       0.95      0.95      0.95      1099

added new result


### 3.5 BERT

In [23]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import numpy as np





In [24]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx], 
            truncation=True, 
            padding="max_length", 
            max_length=self.max_length, 
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }


In [25]:
# If X_train_text is a NumPy array
if (type(X_train_text) != list):
    X_train_text = X_train_text.tolist()  # Convert numpy array to list

 # Convert to list of strings

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_dataset = TextDataset(X_train_text, y_train, tokenizer)
test_dataset = TextDataset(X_test_text, y_test, tokenizer)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
training_args = TrainingArguments(
    output_dir="./results",      
    num_train_epochs=3,          
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,  
    warmup_steps=500,            
    weight_decay=0.01,           
    logging_dir="./logs",        
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

bert = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

bert.train()

  bert = Trainer(


  0%|          | 0/1500 [00:00<?, ?it/s]

{'loss': 0.7091, 'grad_norm': 5.335214138031006, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.02}
{'loss': 0.6847, 'grad_norm': 6.239068031311035, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.04}
{'loss': 0.6928, 'grad_norm': 5.495453357696533, 'learning_rate': 3e-06, 'epoch': 0.06}
{'loss': 0.7023, 'grad_norm': 5.048561096191406, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.08}
{'loss': 0.689, 'grad_norm': 3.8396718502044678, 'learning_rate': 5e-06, 'epoch': 0.1}
{'loss': 0.699, 'grad_norm': 2.9920356273651123, 'learning_rate': 6e-06, 'epoch': 0.12}
{'loss': 0.6911, 'grad_norm': 4.778266429901123, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.14}
{'loss': 0.6872, 'grad_norm': 6.9926066398620605, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.16}
{'loss': 0.6481, 'grad_norm': 8.927323341369629, 'learning_rate': 9e-06, 'epoch': 0.18}
{'loss': 0.6988, 'grad_norm': 3.699111223220825, 'learning_rate': 1e-05, 'epoch': 0.2}
{'loss': 0.6444, 'grad_norm': 7.6320233

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# Get model predictions
predictions = bert.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Print classification report
print(classification_report(y_test, pred_labels, target_names=["0", "1"]))