<a href="https://colab.research.google.com/github/JYu89828/Airbnb-Kaggle-Challenge/blob/master/758B_Lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [299]:
import pandas as pd
import numpy as np

fname='facebook_comments.csv'

df_train=pd.read_csv(fname,header=None,names=['text','sentiment'],encoding='iso-8859-1', lineterminator='\n')

sent={'positive':2,'neutral':1,'negative':0}
df_train['labels']=df_train['sentiment'].str.strip().map(sent)

training_texts=df_train.text.values
labels=df_train.labels.values


df_train.head()

Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


In [300]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preprocess dat



In [301]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer=TfidfVectorizer(stop_words='english',max_features=500, ngram_range=(1,2))

instances=vectorizer.fit_transform(training_texts)

X=instances
Y=labels

print(X.shape,',',Y.shape)


(1999, 500) , (1999,)


# Traditional Random Forest Approach

In [302]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

kfold=KFold(n_splits=10,shuffle=True,random_state=2020)
rf_model=RandomForestClassifier(criterion='entropy',max_depth=2, random_state=2020)
rf_cvscores=[]

for train_idx, test_idx in kfold.split(X):
  rf_model.fit(X[train_idx],Y[train_idx])
  acc=rf_model.score(X[test_idx],Y[test_idx])
  rf_cvscores.append(acc)

print('Random Forest - mean: %.4f%%(std:+/- %.4f%%)' % (np.mean(rf_cvscores)*100,np.std(rf_cvscores)*100))

Random Forest - mean: 64.1332%(std:+/- 2.0919%)


# Fully connected feedforward Neural Network

In [303]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from torch.autograd import Variable




Build the train loader and validation 

In [304]:
epochs=5
lr=1e-4
indim=X.shape[1]
outdim=3
drate=0.7
batch_size=16

X_tensor=torch.from_numpy(X.toarray())
Y_tensor=torch.from_numpy(Y)

dataset=TensorDataset(X_tensor,Y_tensor)
train_size=int(0.8*len(dataset))
val_size=len(dataset)-train_size
train_dataset, val_dataset=torch.utils.data.random_split(dataset,[train_size,val_size])

train_loader=DataLoader(train_dataset,batch_size,shuffle=True)

val_loader=DataLoader(val_dataset, batch_size,shuffle=True)

In [305]:
class SentimentNetwork(nn.Module):
  def __init__(self,input_dim,output_dim,dropout_rate):
    super(SentimentNetwork,self).__init__()
    self.fc1=nn.Linear(input_dim,100)
    self.do1=nn.Dropout(dropout_rate)
    self.fc2=nn.Linear(100,50)
    self.fc3=nn.Linear(50,output_dim)

  def forward(self,x):
    x=F.relu(self.fc1(x))
    x=self.do1(x)
    x=F.relu(self.fc2(x))
    x=F.log_softmax(self.fc3(x))

    return x

  
model=SentimentNetwork(indim,outdim,drate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=100, bias=True)
  (do1): Dropout(p=0.7, inplace=False)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=3, bias=True)
)


Create a training function to train the model and an evaluation function to evaluate the performance on the separate validation set 


In [306]:
import torch.optim as optim

criterion =  torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [318]:
def train(model,train_loader,optimizer, criterion):
  epoch_loss, epoch_acc=0.0,0.0
  model.train()
  for batch_x, batch_y in enumerate(train_loader):
    inputs, labels = batch_y


    optimizer.zero_grad() 
    model_out = model(inputs.float()) 

    loss = criterion(model_out, labels)
    acc=model_out.data.max(dim=1)[1].eq(labels.data).sum()
    loss.backward()
    optimizer.step() 

    epoch_loss+=loss.item()
    epoch_acc+=acc
  epoch_loss/=len(train_loader)


  return epoch_loss, epoch_acc
    

In [319]:
def evaluate(model, val_loader, criterion):
  epoch_loss, epoch_acc=0.0,0.0
  model.eval()
  for batch_x, batch_y in enumerate(val_loader):
    inputs, labels = batch_y
    

    optimizer.zero_grad() 
    model_out = model(inputs.float()) 

    loss = criterion(model_out, labels)
    acc=model_out.data.max(dim=1)[1].eq(labels.data).sum()

    loss.backward()
    optimizer.step() 

    epoch_loss+=loss.item()
    epoch_acc+=acc
  epoch_loss/=len(val_loader)



  return epoch_loss, epoch_acc
    


Main starting point: train the model and evaluate the model

In [320]:
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)
    
  print(f'Epoch: {epoch+1:02}')
  print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc/len(train_loader.dataset):.4f}')
  print(f'\t Val. Loss: {valid_loss:.4f} |  Val. Acc: {valid_acc/len(val_loader.dataset):.4f}')

  del sys.path[0]


Epoch: 01
	Train Loss: 0.8306 | Train Acc: 0.6479
	 Val. Loss: 0.8438 |  Val. Acc: 0.6150
Epoch: 02
	Train Loss: 0.8300 | Train Acc: 0.6479
	 Val. Loss: 0.8427 |  Val. Acc: 0.6150
Epoch: 03
	Train Loss: 0.8290 | Train Acc: 0.6479
	 Val. Loss: 0.8412 |  Val. Acc: 0.6150
Epoch: 04
	Train Loss: 0.8300 | Train Acc: 0.6479
	 Val. Loss: 0.8394 |  Val. Acc: 0.6150
Epoch: 05
	Train Loss: 0.8254 | Train Acc: 0.6479
	 Val. Loss: 0.8380 |  Val. Acc: 0.6150
