In [1]:
# load text data and convert the label/sentiment into corresponding numeric values: 
# possible packages you might need are: pandas, numpy
import pandas as pd
import numpy as np
# Load the labelled comments; the file is read without a header row and the
# two columns are named explicitly.
fname='facebook_comments.csv'
df_train=pd.read_csv(
    fname,
    names=['text','sentiment'],
    header=None,
    lineterminator='\n',
    encoding='iso-8859-1',
)
# Translate each (whitespace-stripped) sentiment string into its integer class id.
sentiment_dict={'negative':0,'neutral':1,'positive':2}
df_train['labels']=df_train['sentiment'].str.strip().map(sentiment_dict)
# Extract the comment texts and the numeric labels as NumPy arrays.
training_texts=df_train['text'].values
labels=df_train['labels'].values
# Preview the first 5 records.
df_train.head(5)


Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


In [2]:
# Preprocess data
# preprocess the loaded textual data, including removing stopwords, stemming, and tokenization
# represent each document (i.e., comment) using TF-IDF strategy. The features are the terms (words) of the vocabulary
# possible packages you might need are: scikit-learn, numpy
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF document-feature matrix X and the label vector Y.
# English stop words are removed and the vocabulary is capped at 1000 terms.
# NOTE(review): the vectorizer is fitted on every comment before the
# cross-validation / train-validation splits performed later — confirm that
# this is intended.
vectorizer=TfidfVectorizer(max_features=1000,stop_words='english')
X=vectorizer.fit_transform(training_texts)
instances=X
Y=np.array(labels)

# Report the shapes of X and Y.
print(f"{X.shape} , {Y.shape}")

(1999, 1000) , (1999,)


#Traditional Machine Learning Models: Random Forest


In [3]:
#Random Forest - mean: 64.1332% (std: +/- 2.0919%)
# using 10-fold cross-validation to show the prediction accuracy
# possible packages you might need are: scikit-learn, numpy

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

# 10-fold cross-validation of a shallow (max_depth=2) random forest on the
# TF-IDF features; one accuracy score is collected per fold.
kfold=KFold(n_splits=10,shuffle=True,random_state=2020)
rf_model=RandomForestClassifier(random_state=2020,max_depth=2,criterion='entropy')
rf_cvscores=[]

# The fold index arrays are called train_idx/test_idx (not train/test) so that
# re-running this cell cannot overwrite the notebook-level `train` function
# that the neural-network section defines and calls.
for train_idx,test_idx in kfold.split(X):
  rf_model.fit(X[train_idx],Y[train_idx])
  rf_acc=rf_model.score(X[test_idx],Y[test_idx])
  rf_cvscores.append(rf_acc)

# Mean and standard deviation of the fold accuracies, as percentages.
print("Random Forest - mean: %.4f%% (std: +/- %.4f%%)" % (np.mean(rf_cvscores)*100, np.std(rf_cvscores)*100))

Random Forest - mean: 64.1332% (std: +/- 2.0919%)


#Fully connected feedforward Neural Network


In [4]:
# Design your own network with the following requirements:
# 1. Having dropout
# 2. Separate the dataset into training and validation (80-20%)
# 3. The prediction accuracy on the validation set should be at least 50% for this 3-class classification problem
# possible packages you might need are: scikit-learn, numpy, torch
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim


Build the train loader and validation loader





In [5]:
# Convert the NumPy data to a TensorDataset and create data loaders for training and validation.
# Hyperparameters: number of epochs, learning rate, input dimension, output dimension, dropout rate, batch size.
epochs = 15
lr = 1e-3
indim = X.shape[1]   # one input unit per TF-IDF feature
outdim = 3           # one output unit per sentiment class
drate = 0.6
batch_size = 16

# Seed torch's global RNG so that the random 80/20 split below (and any torch
# randomness that follows, such as weight initialisation and batch shuffling)
# is repeatable across runs. 2020 matches the random_state used for the
# scikit-learn cross-validation.
torch.manual_seed(2020)

# Densify the sparse TF-IDF matrix and wrap features/labels as tensors.
X_tensor=torch.from_numpy(X.toarray())
Y_tensor=torch.from_numpy(Y)

# 80% of the samples go to training, the remainder to validation.
dataset=TensorDataset(X_tensor,Y_tensor)
train_size=int(0.8*len(dataset))
validation_size=len(dataset)-train_size
train_dataset,validation_dataset=torch.utils.data.random_split(dataset,[train_size,validation_size])

# Create the training loader and the validation loader.
train_loader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
# Validation batches are only scored, never trained on, so shuffling them is unnecessary.
val_loader=DataLoader(validation_dataset,batch_size=batch_size,shuffle=False)




In [6]:
# create your model/network
class SentimentNetwork(nn.Module):
  """Fully connected feed-forward classifier with one dropout layer.

  Architecture: input_dim -> 100 -> (dropout) -> 50 -> output_dim, with ReLU
  after the first two linear layers.

  Args:
    input_dim: number of input features per sample.
    output_dim: number of classes.
    dropout_rate: dropout probability applied after the first hidden layer.
  """
  def __init__(self, input_dim, output_dim, dropout_rate):
    super().__init__()
    self.fc1 = nn.Linear(input_dim,100)
    self.dropout = nn.Dropout(dropout_rate)
    self.fc2 = nn.Linear(100,50)
    self.fc3 = nn.Linear(50,output_dim)

  def forward(self,x):
    """Return per-class log-probabilities for ``x``.

    The class scores live on the last dimension of the fc3 output, so the
    log-softmax is taken over ``dim=-1``. Passing the dimension explicitly
    avoids the "Implicit dimension choice for log_softmax has been
    deprecated" warning and gives the same result as the implicit choice for
    1-D and 2-D (batched) inputs.
    """
    x=F.relu(self.fc1(x))
    x=self.dropout(x)
    x=F.relu(self.fc2(x))
    return F.log_softmax(self.fc3(x),dim=-1)

# Instantiate the network from the hyperparameters chosen earlier and print its layers.
model = SentimentNetwork(input_dim=indim, output_dim=outdim, dropout_rate=drate)
print(model)


SentimentNetwork(
  (fc1): Linear(in_features=1000, out_features=100, bias=True)
  (dropout): Dropout(p=0.6, inplace=False)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=3, bias=True)
)


Create a training function to train the model and an evaluation function to evaluate the performance on the separate validation set

In [7]:
# define a training process function

def train(model, train_loader, optimizer, criterion):
  """Run one optimisation pass over every batch of ``train_loader``.

  Args:
    model: network to optimise; switched to training mode first.
    train_loader: iterable yielding ``(data, target)`` batches.
    optimizer: optimiser stepped once per batch.
    criterion: loss function called as ``criterion(output, target)``.

  Returns:
    A pair ``(loss, accuracy)``: the mean of the per-batch loss values and
    the fraction of samples whose arg-max prediction equals the target.
  """
  model.train()
  running_loss = 0.0
  n_correct = 0
  n_seen = 0
  for features, targets in train_loader:
    # Drop gradients left over from the previous batch.
    optimizer.zero_grad()

    # Forward pass (inputs are cast to float32) and loss for this batch.
    logits = model(features.float())
    batch_loss = criterion(logits, targets)
    running_loss += batch_loss.item()

    # Backward pass followed by a parameter update.
    batch_loss.backward()
    optimizer.step()

    # Accuracy bookkeeping uses the outputs computed before the update.
    n_correct += (logits.argmax(1) == targets).sum().item()
    n_seen += len(features)
  return running_loss/len(train_loader), n_correct/n_seen
   

## evaluation part
def evaluate(model, val_loader, criterion):
  """Score ``model`` on every batch of ``val_loader`` without updating it.

  Args:
    model: network to score; switched to evaluation mode first.
    val_loader: iterable yielding ``(data, target)`` batches.
    criterion: loss function called as ``criterion(output, target)``.

  Returns:
    A pair ``(loss, accuracy)``: the mean of the per-batch loss values and
    the fraction of samples whose arg-max prediction equals the target.
  """
  model.eval()
  loss_sum = 0.0
  hits = 0
  seen = 0
  # Gradient tracking is switched off for the whole evaluation pass.
  with torch.no_grad():
    for features, targets in val_loader:
      logits = model(features.float())
      loss_sum += criterion(logits, targets).item()
      hits += (logits.argmax(1) == targets).sum().item()
      seen += len(features)
  return loss_sum/len(val_loader), hits/seen

   

Main starting point: train the model and evaluate the model


In [8]:
# Loss function (cross-entropy) and optimiser (Adam over all model parameters).
criterion=nn.CrossEntropyLoss()
optimizer=optim.Adam(model.parameters(),lr=lr)
# Each epoch: one training pass, one validation pass, then a metrics report.
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)

  print(
    f'Epoch: {epoch+1:02}',
    f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}',
    f'\t Val. Loss: {valid_loss:.4f} |  Val. Acc: {valid_acc:.4f}',
    sep='\n',
  )

  app.launch_new_instance()


Epoch: 01
	Train Loss: 0.8761 | Train Acc: 0.6316
	 Val. Loss: 0.7840 |  Val. Acc: 0.6225
Epoch: 02
	Train Loss: 0.6372 | Train Acc: 0.6992
	 Val. Loss: 0.5479 |  Val. Acc: 0.7975
Epoch: 03
	Train Loss: 0.4249 | Train Acc: 0.8487
	 Val. Loss: 0.4098 |  Val. Acc: 0.8425
Epoch: 04
	Train Loss: 0.3251 | Train Acc: 0.8787
	 Val. Loss: 0.3606 |  Val. Acc: 0.8450
Epoch: 05
	Train Loss: 0.2672 | Train Acc: 0.8974
	 Val. Loss: 0.3139 |  Val. Acc: 0.8550
Epoch: 06
	Train Loss: 0.2183 | Train Acc: 0.9087
	 Val. Loss: 0.2630 |  Val. Acc: 0.8650
Epoch: 07
	Train Loss: 0.1847 | Train Acc: 0.9156
	 Val. Loss: 0.2248 |  Val. Acc: 0.8850
Epoch: 08
	Train Loss: 0.1433 | Train Acc: 0.9500
	 Val. Loss: 0.1903 |  Val. Acc: 0.9250
Epoch: 09
	Train Loss: 0.1068 | Train Acc: 0.9725
	 Val. Loss: 0.1737 |  Val. Acc: 0.9300
Epoch: 10
	Train Loss: 0.0879 | Train Acc: 0.9850
	 Val. Loss: 0.1506 |  Val. Acc: 0.9600
Epoch: 11
	Train Loss: 0.0767 | Train Acc: 0.9856
	 Val. Loss: 0.1536 |  Val. Acc: 0.9575
Epoch: 12
