# Classify Candidate Pairs of Acronyms and Expansions (Assignment 2)

## 1. Import Libraries

In [48]:
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

from sklearn.preprocessing import StandardScaler


## 2. Prepare Dataset

### 2.1 Extract dataset

In [5]:
# Unpack the dataset archive into the ./dataacro directory.
with tarfile.open("dataacro.tar.gz", "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The "data" extraction filter refuses members that would be written
        # outside the destination (absolute paths, ".." components, unsafe
        # links) and avoids the DeprecationWarning that Python 3.12+ emits
        # when extractall() is called without an explicit filter.
        tar.extractall("dataacro", filter="data")
    else:
        # Interpreters that predate extraction filters: plain extraction.
        tar.extractall("dataacro")

### 2.2 Load dataset

In [52]:
def extract_feature(list_doc):
    """Parse raw dataset lines into feature vectors and integer labels.

    Every non-blank line is split on whitespace. The last eight tokens are
    read as ``index:value`` pairs (only the value after the colon is kept) and
    the token immediately before them is read as the integer label. Any
    tokens earlier on the line are ignored. Blank lines are skipped.

    Parameters
    ----------
    list_doc : iterable of str
        Raw text lines, e.g. the result of ``file.readlines()``.

    Returns
    -------
    X : list of list of float
        One list of feature values per parsed line.
    y : list of int
        One label per parsed line, in the same order as ``X``.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than nine tokens, or a feature token
        contains no ``":"``.
    ValueError
        If a feature value or a label cannot be converted to a number.
    """
    X=[]
    y=[]
    for line in list_doc:
        # split() with no argument discards the trailing newline and tolerates
        # repeated or trailing whitespace, unlike split(" ").
        tokens=line.split()
        if not tokens:
            # e.g. an empty line at the end of the file
            continue
        # Look the label token up first so a too-short line fails on the
        # missing label before any feature conversion is attempted.
        label_token=tokens[-9]
        list_features=[float(token.split(":")[1]) for token in tokens[-8:]]
        list_label=int(label_token)
        X.append(list_features)
        y.append(list_label)

    return X,y

# Read both splits as raw text lines; parsing is delegated to extract_feature.
with open("dataacro/trainingset.txt", "r") as file:
    training_lines = list(file)

with open("dataacro/testingset.txt", "r") as file:
    testing_lines = list(file)

# Parse each split and turn the feature and label lists into NumPy arrays.
X_train, y_train = map(np.array, extract_feature(training_lines))
X_test, y_test = map(np.array, extract_feature(testing_lines))

# check data length
for caption, values in (
    ("Training data length: ", X_train),
    ("Training label length: ", y_train),
    ("Testing data length: ", X_test),
    ("Testing label length: ", y_test),
):
    print(caption, len(values))


Training data length:  4000
Training label length:  4000
Testing data length:  1099
Testing label length:  1099


## 3. Build Model

In [43]:
def input_model_result(y_true,y_pred):
    """Summarise binary-classification quality as a one-row DataFrame.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns "Confusion Matrix" (as a nested list),
        "Precision", "Recall" and "F1-Score".
    """
    summary = {
        # Stored as a nested list so the whole matrix fits in one cell.
        "Confusion Matrix": [confusion_matrix(y_true, y_pred).tolist()],
        "Precision": [precision_score(y_true, y_pred)],
        "Recall": [recall_score(y_true, y_pred)],
        "F1-Score": [f1_score(y_true, y_pred)],
    }
    return pd.DataFrame(summary)
    

### 3.1 SVM Classifier

In [None]:
# Tune an SVM with an exhaustive grid search over three kernel families.
svm = SVC(random_state=0)

# Candidate values fed into the grid below:
#   x -> regularisation parameter C (all kernels)
#   y -> kernel coefficient gamma ('rbf' and 'poly' kernels)
#   z -> polynomial degree ('poly' kernel only)
x = [1.0,10.0,100.0,500.0,1000.0]
y = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
z = [2,3,4]
parameters=[{'C': x,'kernel': ['linear']},
            {'C': x,'kernel': ['rbf'],'gamma': y} ,
            {'C': x,'kernel': ['poly'],'gamma': y,'degree': z}
           ]
# 10-fold cross-validation scored by accuracy, parallelised with n_jobs=-1.
grid=GridSearchCV(estimator = svm,
                        param_grid = parameters,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1)
grid=grid.fit(X_train,y_train)

# Single f-string: the previous two-part concatenation had no space between
# "with" and "a score", so the message printed as "witha score".
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:.2f}")

### 3.2 KNN Classifier

In [None]:
# Build model: pick the number of neighbours by grid search
# (GridSearchCV defaults are used for the CV splitting and the scoring).
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": np.arange(2, 10)}
grid = GridSearchCV(knn, param_grid=param_grid)
grid.fit(X_train, y_train)

# Single f-string: the previous two-part concatenation had no space between
# "with" and "a score", so the message printed as "witha score".
print(f"The best parameters are {grid.best_params_} with a score of {grid.best_score_:.2f}")

# With the default refit=True, GridSearchCV has already refit the best
# estimator on the full training set, so no extra knn.fit(...) call is needed.
knn = grid.best_estimator_

# Last expression of the cell: show the selected, fitted estimator.
knn


The best parameters are {'n_neighbors': np.int64(9)} witha score of 0.92


Unnamed: 0,Confusion Matrix,Precision,Recall,F1-Score
0,"[[497, 3], [305, 294]]",0.989899,0.490818,0.65625


In [None]:
# Evaluate the tuned KNN classifier on the held-out test set.
yhat = knn.predict(X_test)
print(classification_report(y_test,yhat))
# The summary table must be the cell's last expression: a bare expression
# earlier in a cell is evaluated but its value is never displayed.
input_model_result(y_test,yhat)

              precision    recall  f1-score   support

          -1       0.62      0.99      0.76       500
           1       0.99      0.49      0.66       599

    accuracy                           0.72      1099
   macro avg       0.80      0.74      0.71      1099
weighted avg       0.82      0.72      0.71      1099

