In [66]:
import nltk
import numpy as np
import pandas as pd
from utils import process_tweet
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import torch
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

In [67]:
# download the dataset from nltk
nltk.download('twitter_samples')

# stop words are common words that we don't want to include in our features
nltk.download('stopwords')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\pc1\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
# read train/dev sets
df = pd.read_csv('train.csv')
df_test = pd.read_csv('dev.csv')

train_x = df['text'].tolist() 
test_x =df_test['text'].tolist()

train_y1 = df['stance'].tolist()
test_y1 = df_test['stance'].tolist()

train_y2 = df['category'].tolist()
test_y2 = df_test['category'].tolist()

# Print the shape train and test sets
print("train_x.shape = " + str(len(train_x)))
print("test_x.shape = " + str(len(test_x)))
print("train_y.shape = " + str(len(train_y1)))
print("test_y.shape = " + str(len(test_y1)))
df.head()

train_x.shape = 6988
test_x.shape = 1000
train_y.shape = 6988
test_y.shape = 1000


Unnamed: 0,text,category,stance
0,بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الاب...,celebrity,1
1,وزير الصحة لحد اليوم وتحديدا هلأ بمؤتمروا الصح...,info_news,1
2,قولكن رح يكونو اد المسؤولية ب لبنان لما يوصل ...,info_news,1
3,#تركيا.. وزير الصحة فخر الدين قوجة يتلقى أول ج...,celebrity,1
4,وئام وهاب يشتم الدول الخليجية في كل طلة اعلامي...,personal,0


In [69]:
#process tweets
def process_tweets(x):
    X = []
    for tweet in x:
        X.append(process_tweet(tweet))
    return X

In [70]:
#get unique tokens and thier frequancies in all tweets
def get_tokens(X):
    tokens = {}
    for tweet in X:
        for token in tweet:
            if token not in tokens.keys():
                tokens[token] = 0

    #copy = tokens.copy()
    #for key,val in copy.items():
    #    if val<4: tokens.pop(key)
    return tokens

In [71]:
train_input = process_tweets(train_x)
print(train_x[0])
print(train_input[0])

بيل غيتس يتلقى لقاح #كوفيد19 من غير تصوير الابرة و لا السيرنجة و لا الدواء و لابس بولو صيفي في عز الشتاء و يقول ان إحدى مزايا عمر ال 65 عامًا هي انه مؤهل للحصول على اللقاح ... يعنى ما كان يحتاج اللقاح لو كان عمره اصغر من 65 🤔 https://t.co/QQKFFUNwBn
['بيل', 'غيتس', 'لقى', 'لقح', 'كوفيد', '19', 'صور', 'ابر', 'رنج', 'دوء', 'لبس', 'ولو', 'صيف', 'عز', 'شتء', 'يقل', 'ان', 'مزا', 'عمر', 'ال', '65', 'عما', 'انه', 'ؤهل', 'حصل', 'لقح', 'عنى', 'حاج', 'لقح', 'عمر', 'صغر', '65', '🤔']


In [72]:
Unique_tokens = get_tokens(train_input)
j = 0
for i in Unique_tokens.keys():
  Unique_tokens[i] = j
  j+=1
num_tokens = len(Unique_tokens)
for i in train_input:
  for index,token in enumerate(i):
    i[index] = Unique_tokens[token]
len(Unique_tokens)

12194

In [73]:
class RNNDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the RNNDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    ##################### create two tensors one for x and the other for labels ###############################
    max_len = max([len(i) for i in x])    
    pad_size = [max_len - len(i) for i in x]
    x = [ ( j + [pad] * pad_size[i])[:max_len] for i,j in enumerate(x)]  
    self.inputs = torch.tensor(x)
    self.lables = torch.tensor(y)
    inds = [self.lables == -1]
    self.lables[inds] = 2
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    ###################### return the length of the dataset #############################
    return len( self.inputs )
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    ###################### return a tuple of x and y ###################################
    return ( self.inputs[idx],self.lables[idx])
    ##########################################################################################

In [74]:
dummy_dataset = RNNDataset(train_input, train_y1, len(Unique_tokens))
dummy_dataloader = torch.utils.data.DataLoader(dummy_dataset, batch_size=512)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0])
len(dummy_dataset)

torch.Size([512]) torch.Size([512, 113]) torch.Size([512]) torch.Size([512, 113])
tensor([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
           10,    11,    12,    13,    14,    15,    16,    17,    18,    19,
           20,    21,    22,    23,    24,     3,    25,    26,     3,    18,
           27,    20,    28, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194, 12194,
        12194, 12194, 12194]) 
 tensor(1)


6988

In [75]:
class RNN(nn.Module):
  def __init__(self, vocab_size=13000, embedding_dim=50, hidden_size=50, n_classes=3):
    
    """
    The constructor of our RNN model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes 
    """
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.n_classes = n_classes
    ####################### Create the layers of your model #######################################
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(vocab_size,embedding_dim)

    # (2) Create an RNN layer with hidden size = hidden_size and batch_first = True
    self.rnn = nn.RNN(input_size = embedding_dim,hidden_size = hidden_size,batch_first = True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size,n_classes)
    #####################################################################################################

  def forward(self, sentences):

    final_output = None
    ######################### implement the forward pass ####################################
    h_0 = torch.zeros(1,sentences.size(0), self.hidden_size)
    #print(sentences.shape)
    embedding_out = self.embedding(sentences)
    
    rnn_out,_ = self.rnn(embedding_out, h_0)
    rnn_out = rnn_out[:, -1,:]
    
    final_output = self.linear(rnn_out)
    ###############################################################################################
    return final_output

In [76]:
model = RNN()
model

RNN(
  (embedding): Embedding(13000, 50)
  (rnn): RNN(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=3, bias=True)
)

In [77]:
def train(model, train_dataset, batch_size=512, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type RNNDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """
    
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(),learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0

    for train_input, train_label in tqdm(train_dataloader):
      #try:
      #  train_input = train_input.reshape(-1,104,512)
      #except:
      #  continue
     
      # (4) move the train input to the device
      train_label = train_label.to(device)

      # (5) move the train label to the device
      train_input = train_input.to(device)

      # (6) do the forward pass
      output = model(train_input)
      
      # (7) loss calculation 
      batch_loss = criterion(output, train_label)
      
      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss
      
      # (9) calculate the batch accuracy

      _,pred = torch.max(output,1)
      acc = (pred == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      model.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = 100 * total_acc_train / len(train_dataset)

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [78]:
indecies = {'info_news':0, 'celebrity':1, 'plan': 2, 'requests': 3, 'rumors': 4, 'advice': 5, 'restrictions': 6, 'personal': 7,'unrelated': 8,'others': 9}
train_y2_indecies = [indecies[k] for k in train_y2]
test_y2_indecies = [indecies[k] for k in test_y2]

In [79]:
dataset = RNNDataset(train_input, train_y1, len(Unique_tokens))

In [80]:
model = RNN(n_classes=3)
model
train(model, dataset)

100%|██████████| 14/14 [00:08<00:00,  1.69it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 1 | Train Loss: 0.0013335070107132196         | Train Accuracy: 79.25014310246137



100%|██████████| 14/14 [00:06<00:00,  2.10it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 2 | Train Loss: 0.0012880577705800533         | Train Accuracy: 79.25014310246137



100%|██████████| 14/14 [00:07<00:00,  1.97it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 3 | Train Loss: 0.0012830737978219986         | Train Accuracy: 79.25014310246137



100%|██████████| 14/14 [00:07<00:00,  1.93it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 4 | Train Loss: 0.0012778631644323468         | Train Accuracy: 79.25014310246137



100%|██████████| 14/14 [00:08<00:00,  1.57it/s]

Epochs: 5 | Train Loss: 0.0012811814667657018         | Train Accuracy: 79.25014310246137






In [81]:
test_input = process_tweets(test_x)
Unique_tokens = get_tokens(test_input)
j = 0
for i in Unique_tokens.keys():
  Unique_tokens[i] = j
  j+=1
num_tokens2 = len(Unique_tokens)
for i in test_input:
  for index,token in enumerate(i):
    i[index] = Unique_tokens[token]


In [82]:
dataset2 = RNNDataset(train_input, train_y2_indecies, num_tokens)

In [83]:
model2 = RNN(n_classes=10)
model2
train(model2, dataset2)

100%|██████████| 14/14 [00:07<00:00,  1.76it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 1 | Train Loss: 0.0032378770411014557         | Train Accuracy: 49.06983400114482



100%|██████████| 14/14 [00:08<00:00,  1.62it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 2 | Train Loss: 0.0030593248084187508         | Train Accuracy: 51.745850028620495



100%|██████████| 14/14 [00:10<00:00,  1.35it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 3 | Train Loss: 0.0030576493591070175         | Train Accuracy: 51.745850028620495



100%|██████████| 14/14 [00:13<00:00,  1.03it/s]
  0%|          | 0/14 [00:00<?, ?it/s]

Epochs: 4 | Train Loss: 0.003054589033126831         | Train Accuracy: 51.745850028620495



100%|██████████| 14/14 [00:12<00:00,  1.09it/s]

Epochs: 5 | Train Loss: 0.0030522779561579227         | Train Accuracy: 51.745850028620495






In [84]:
def evaluate(model, test_dataset, batch_size=512):
  """
  This function takes a RNN model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a RNN model
  - test_dataset: dataset of type RNNDataset
  """
  
  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label = test_label.to(device)

      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output = model(test_input)

      # accuracy calculation
      _,pred = torch.max(output,1)
      acc = (pred == test_label).sum().item()
      total_acc_test += acc
    
    # (6) calculate the over all accuracy
    total_acc_test /= len(test_dataset)
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [85]:
test_dataset = RNNDataset(test_input, test_y1, num_tokens)
evaluate(model, test_dataset)

100%|██████████| 2/2 [00:00<00:00, 18.19it/s]


Test Accuracy: 0.804





In [86]:
test_dataset2 = RNNDataset(test_input, test_y2_indecies, num_tokens2)
evaluate(model2, test_dataset2)

100%|██████████| 2/2 [00:00<00:00, 26.33it/s]


Test Accuracy: 0.544





tf_idf

In [87]:
def get_tf_idf(documents,tokens):
    documents_terms = np.zeros((len(documents),len(tokens)))
    dft = np.zeros(len(tokens))
    for j,document in enumerate(documents):
        for i,token in enumerate(tokens.keys()):
            if token in document:
                dft[i] += 1
            documents_terms[j][i] = np.log10(documents[j].count(token)+1)
    idf = np.zeros(len(tokens))
    for i,j in enumerate(dft):
        if j == 0: idf[i] = 0
        else: idf[i] = np.log10(len(documents)/j)
    tf_idf = documents_terms.copy()
    for i,document in enumerate(tf_idf):
        tf_idf[i] = document*idf
    return tf_idf

In [88]:
X = process_tweets(train_x)
tokens = get_tokens(X)
print(len(tokens))
tf_idf = get_tf_idf(X,tokens)
print(tf_idf.shape)

12194
(6988, 12194)


logistic regression

In [89]:
def train_lr(X, Y):
    
    lr = LogisticRegression(max_iter=10000)
    
    ################################# train the lr model #####################
    lr.fit(X,Y)
    ###############################################################################################################
    
    return lr

In [90]:
lr = train_lr(tf_idf,train_y1)
lr2 = train_lr(tf_idf,train_y2)

In [91]:
def predict(clf, X):
   
    Y_pred = None
    ######################### predict labels ############################
    Y_pred = clf.predict(X)
    #######################################################################################################################
    return Y_pred

In [92]:
test_x = np.array(test_x)
print(test_x.shape)
processed_tweets = process_tweets(test_x)
print(len(processed_tweets))
X_test = get_tf_idf(processed_tweets,tokens)
print(X_test.shape)
Y_pred1 = predict(lr, X_test)
print(Y_pred1.shape)
Y_pred2 = predict(lr2, X_test)
print(Y_pred2.shape)

(1000,)
1000
(1000, 12194)
(1000,)
(1000,)


In [93]:
from sklearn.metrics import classification_report
test_y1 = np.array(test_y1)
print(test_y1.shape)
print(classification_report(test_y1, Y_pred1))

(1000,)
              precision    recall  f1-score   support

          -1       0.55      0.16      0.24        70
           0       0.40      0.32      0.36       126
           1       0.86      0.94      0.90       804

    accuracy                           0.81      1000
   macro avg       0.60      0.47      0.50      1000
weighted avg       0.78      0.81      0.78      1000



In [94]:
test_y2 = np.array(test_y2)
print(test_y2.shape)
print(classification_report(test_y2, Y_pred2))

(1000,)
              precision    recall  f1-score   support

      advice       0.00      0.00      0.00        10
   celebrity       0.86      0.79      0.82       145
   info_news       0.69      0.87      0.77       545
      others       0.25      0.06      0.10        17
    personal       0.52      0.51      0.52       128
        plan       0.24      0.09      0.13        82
    requests       0.33      0.05      0.09        20
restrictions       0.00      0.00      0.00         2
      rumors       0.00      0.00      0.00        15
   unrelated       0.56      0.25      0.35        36

    accuracy                           0.67      1000
   macro avg       0.35      0.26      0.28      1000
weighted avg       0.62      0.67      0.63      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
df_test2 = pd.read_csv('test.csv')
test2_x =df_test2['text'].tolist()
test2_x = np.array(test2_x)
print(test2_x.shape)
processed_tweets = process_tweets(test2_x)
print(len(processed_tweets))
X_test2 = get_tf_idf(processed_tweets,tokens)
print(X_test2.shape)
Y2_pred1 = predict(lr, X_test2)
print(Y_pred1.shape)
Y2_pred2 = predict(lr2, X_test2)
print(Y_pred2.shape)

(2000,)
2000
(2000, 12194)
(1000,)
(1000,)


In [96]:
df_test2['category'] = Y2_pred2
df_test2['stance'] = Y2_pred1
df_test2.to_csv('sub.csv')