<a href="https://colab.research.google.com/github/HelmiAlf/Projects/blob/main/%5BNLP%5D_Aspect_Based_Sentiment_Analysis_using_semi_supervised_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Training data

In [None]:
import pandas as pd

In [None]:
# Load the labelled data: per-term feature columns plus the four *_pol target columns
df3 = pd.read_csv('(LABELLED) df_labelled_transformed.csv')

In [None]:
# (rows, columns) of the labelled frame
df3.shape

(3863, 2597)

In [None]:
# X is a second name for df3 (plain assignment, no copy is made)
X = df3
# Preview the first rows; the *_pol target columns are the last four
X.head()

Unnamed: 0,abang,abis,able,about,above,absolutely,abura,aburi,ac,acara,...,youre,yourself,yum,yumy,yung,zomato,food_pol,price_pol,ambience_pol,service_pol
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,-1,0,0


In [None]:
# Feature matrix: every column except the four aspect-polarity targets
X_train = X.drop(columns=['food_pol', 'price_pol', 'ambience_pol', 'service_pol'])

In [None]:
# Pristine copy of the labelled features; X_train is restored from it before each label's run
X_train_copy = X_train.copy()

In [None]:
# Load the unlabelled data (feature columns only; no *_pol targets appear in the preview)
df4 = pd.read_csv('(UNLABELLED) df_unlabelled_transformed.csv')

In [None]:
# (rows, columns) of the unlabelled frame; the column count should match X_train
df4.shape

(158923, 2593)

In [None]:
# X_unlabeled is a second name for df4 (plain assignment, no copy is made)
X_unlabeled = df4
# Preview the first rows of the unlabelled features
X_unlabeled.head()

Unnamed: 0,abang,abis,able,about,above,absolutely,abura,aburi,ac,acara,...,yogurt,you,youl,your,youre,yourself,yum,yumy,yung,zomato
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Pristine copy of the unlabelled pool; X_unlabeled is restored from it before each label's run
X_unlabeled_copy = X_unlabeled.copy()

In [None]:
# Empty frame that collects the final label columns (one column is added per aspect run)
y_final = pd.DataFrame()

## Reset train data for each label prediction

In [None]:
# Restore both feature frames from their pristine copies so that rows
# pseudo-labelled for a previous aspect do not leak into the next run.
X_train, X_unlabeled = X_train_copy.copy(), X_unlabeled_copy.copy()

In [None]:
# Re-read the original target column of every aspect from the labelled frame,
# discarding any pseudo-labels appended by an earlier self-training run.
y_train_food, y_train_price, y_train_ambi, y_train_serv = (
    X[_target] for _target in ('food_pol', 'price_pol', 'ambience_pol', 'service_pol')
)

In [None]:
# Sanity check: report the shape of every frame/series used for training.
print('DATA DIMENSIONS')
print()
for _caption, _obj in (
    ('X_train shape: ', X_train),
    ('y_train_food:  ', y_train_food),
    ('y_train_price: ', y_train_price),
    ('y_train_ambi:  ', y_train_ambi),
    ('y_train_serv:  ', y_train_serv),
):
    print(_caption, _obj.shape)
print()
print('X_unlabeled:   ', X_unlabeled.shape)
print()
for _caption, _obj in (
    ('X_train copy:  ', X_train_copy),
    ('X_unlabeled_copy: ', X_unlabeled_copy),
):
    print(_caption, _obj.shape)

DATA DIMENSIONS

X_train shape:  (3863, 2593)
y_train_food:   (3863,)
y_train_price:  (3863,)
y_train_ambi:   (3863,)
y_train_serv:   (3863,)

X_unlabeled:    (158923, 2593)

X_train copy:   (3863, 2593)
X_unlabeled_copy:  (158923, 2593)


## Semi-supervised learning with self-training

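Repeat the cells below once for each target (food, price, ambience, service): re-run the reset cells above first, then replace `y_train_ambi` with the `y_train_*` series of the aspect being predicted.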

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Self-training settings (previously hard-coded inside the loop)
CONFIDENCE_THRESHOLD = 0.99  # minimum predicted-class probability for a pseudo-label
MAX_TRAIN_SIZE = 6000        # stop once the training labels reach this size

# Initiate iteration counter
iterations = 0

# Containers to hold f1_scores and # of pseudo-labels
train_f1s = []
pseudo_labels = []

# Non-empty sentinel so the while condition passes on the first iteration
high_prob = [1]

# Loop runs until no more high-probability pseudo-labels are found, the
# training labels reach MAX_TRAIN_SIZE, or the unlabeled pool is exhausted
# (scikit-learn estimators reject an input with zero samples).
while (len(high_prob) > 0
       and len(y_train_ambi) < MAX_TRAIN_SIZE
       and len(X_unlabeled) > 0):

    # Fit classifier and make train predictions
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train_ambi)
    y_hat_train = clf.predict(X_train)

    # Calculate and print iteration # and f1 scores, and store f1 scores
    train_f1 = f1_score(y_train_ambi, y_hat_train, average='micro')
    print(f"Iteration {iterations}")
    print(f"Train f1: {train_f1}")
    train_f1s.append(train_f1)

    # Generate predictions and probabilities for unlabeled data
    print("Now predicting labels for unlabeled data...")

    pred_probs = clf.predict_proba(X_unlabeled)
    preds = clf.predict(X_unlabeled)

    # predict_proba returns one column per class, ordered as clf.classes_.
    # Name each column after its real class label rather than assuming that
    # columns 0 and 1 are labels 0 and 1: the polarity targets can take the
    # values -1, 0 and 1, in which case column 0 is the probability of -1.
    df_pred_prob = pd.DataFrame(
        pred_probs,
        columns=[f"prob_{label}" for label in clf.classes_],
        index=X_unlabeled.index,
    )
    df_pred_prob.insert(0, 'preds', preds)

    # Keep rows whose most likely class exceeds the threshold, whichever class
    # that is. Rows stay in the order of X_unlabeled.
    is_confident = pred_probs.max(axis=1) > CONFIDENCE_THRESHOLD
    high_prob = df_pred_prob.loc[is_confident]

    print(f"{len(high_prob)} high-probability predictions added to training data.")

    pseudo_labels.append(len(high_prob))

    # Add pseudo-labeled data to training data
    X_train = pd.concat([X_train, X_unlabeled.loc[high_prob.index]], axis=0)
    y_train_ambi = pd.concat([y_train_ambi, high_prob['preds']])

    # Drop pseudo-labeled instances from unlabeled data
    X_unlabeled = X_unlabeled.drop(index=high_prob.index)
    print(f"{len(X_unlabeled)} unlabeled instances remaining.\n")

    # Update iteration counter
    iterations += 1

Iteration 0
Train f1: 0.989645353352317
Now predicting labels for unlabeled data...
19089 high-probability predictions added to training data.
139834 unlabeled instances remaining.



In [None]:
# Size of the label series after self-training: original labels plus the appended pseudo-labels
y_train_ambi.shape

(22952,)

In [None]:
# Keep only the first 6000 labels (by position) of the enlarged y_train
y_new = y_train_ambi.iloc[:6000]

Reset the index of the new y_train (the concatenation left it with duplicate index labels)

In [None]:
# False is expected here: the labelled and unlabelled frames each carry their own
# 0-based index, and the concatenation in the loop keeps both, so labels repeat.
y_new.index.is_unique

False

In [None]:
# reset_index() turns the Series into a DataFrame: the old index values move into
# an 'index' column (removed in the next cell) and a fresh 0..n-1 index is created.
y_new = y_new.reset_index()

In [None]:
# Discard the column that holds the old (duplicate) index values
y_new = y_new.drop(columns=['index'])

In [None]:
# Expect a single remaining column (the labels) once 'index' has been removed
y_new.shape

(6000, 1)

In [None]:
# Store the ambience labels (the only column of y_new) as the 'ambi' column
y_final['ambi'] = y_new.iloc[:, 0]
y_final.head()

Unnamed: 0,ambi
0,1
1,1
2,0
3,0
4,0


In [None]:
# Translate the numeric polarity codes into their output strings in one call:
# 1 -> 'POSITIVE', 0 -> '-', -1 -> 'NEGATIVE'
final = y_final.replace([1, 0, -1], ['POSITIVE', '-', 'NEGATIVE'])

In [None]:
# NOTE(review): the recorded output (6270, 4) does not match the single 6000-row
# 'ambi' column built above; it presumably comes from a run where all four
# aspect columns were added. TODO confirm, and check for NaN padding between
# columns of different length.
final.shape

(6270, 4)

In [None]:
# Write the labels to final.csv; with to_csv's defaults the row index is written as the first column
final.to_csv('final.csv')