In [None]:
!pip install keras_preprocessing

In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable

In [2]:
import argparse

#-----------------------------parser-----------------------------
class MyArgs:
    def __init__(self):
        self.epochs=5
        self.learning_rate=1e-5
        self.hidden_dim=128
        self.lstm_layers=1
        self.batch_size=64
        self.test_size=0.1
        self.max_len=512
        self.max_words=28996

In [22]:
import numpy as np
import pandas as pd

#from keras.preprocessing import sequence
from keras_preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split


class Preprocessing:
	
	def __init__(self, args):
		self.data = '/kaggle/input/reviewdata/review_cpu_label_map.csv'
		self.max_len = args.max_len
		self.max_words = args.max_words
		self.test_size = args.test_size
		
	def load_data(self):
		df = pd.read_csv(self.data)
		df.drop(['index'], axis=1, inplace=True)
		
		X = df['review'].values
		Y = df['cpu_label'].values
		
		self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(X, Y, test_size=self.test_size)
		
	def prepare_tokens(self):
		self.tokens = Tokenizer(num_words=self.max_words)
		self.tokens.fit_on_texts(self.x_train)

	def sequence_to_token(self, x):
		sequences = self.tokens.texts_to_sequences(x)
		return sequence.pad_sequences(sequences, maxlen=self.max_len)

class PreprocessingFT:
	
	def __init__(self, args):
		self.data = '/kaggle/input/needdata/need_cpu_label_map.csv'
		self.max_len = args.max_len
		self.max_words = args.max_words
		self.test_size = 0.5
		
	def load_data(self):
		df = pd.read_csv(self.data)
		df.drop(['index'], axis=1, inplace=True)
		
		X = df['need'].values
		Y = df['cpu_label'].values
		
		self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(X, Y, test_size=self.test_size)
		
	def prepare_tokens(self):
		self.tokens = Tokenizer(num_words=self.max_words)
		self.tokens.fit_on_texts(self.x_train)

	def sequence_to_token(self, x):
		sequences = self.tokens.texts_to_sequences(x)
		return sequence.pad_sequences(sequences, maxlen=self.max_len)

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TweetClassifier(nn.ModuleList):

	def __init__(self, args):
		super(TweetClassifier, self).__init__()
		
		self.batch_size = args.batch_size
		self.hidden_dim = args.hidden_dim
		self.LSTM_layers = args.lstm_layers
		self.input_size = args.max_words # embedding dimention
		
		self.dropout = nn.Dropout(0.5)
		self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
		self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
		self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=257)
		#self.fc2 = nn.Linear(257, 1)
		#change output_channels according to label size
		self.fc2 = nn.Linear(257, 4)#label size
		self.softmax=nn.LogSoftmax(dim=1)
		
	def forward(self, x):
	
		h = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim),device=x.device)
		c = torch.zeros((self.LSTM_layers, x.size(0), self.hidden_dim),device=x.device)
		
		torch.nn.init.xavier_normal_(h)
		torch.nn.init.xavier_normal_(c)

		out = self.embedding(x)
		out, (hidden, cell) = self.lstm(out, (h,c))
		out = self.dropout(out)
		out = torch.relu_(self.fc1(out[:,-1,:]))
		out = self.dropout(out)
		#out = torch.sigmoid(self.fc2(out))
		out = self.softmax(self.fc2(out))

		return out

In [24]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import csv

class DatasetMaper(Dataset):
	'''
	Handles batches of dataset
	'''
	def __init__(self, x, y):
		self.x = x
		self.y = y
		
	def __len__(self):
		return len(self.x)
		
	def __getitem__(self, idx):
		return self.x[idx], self.y[idx]
		

class Execute:
	'''
	Class for execution. Initializes the preprocessing as well as the 
	Tweet Classifier model
	'''

	def __init__(self, args, device):
		self.__init_data__(args)
		
		self.args = args
		self.batch_size = args.batch_size
		self.device=device
		self.model = TweetClassifier(args)
		self.model.to(self.device)
		self.CEL=nn.NLLLoss()
		
	def __init_data__(self, args):
		'''
		Initialize preprocessing from raw dataset to dataset split into training and testing
		Training and test datasets are index strings that refer to tokens
		'''
		self.preprocessing = Preprocessing(args)
		self.preprocessing.load_data()
		self.preprocessing.prepare_tokens()

		raw_x_train = self.preprocessing.x_train
		raw_x_test = self.preprocessing.x_test
		
		self.y_train = self.preprocessing.y_train
		self.y_test = self.preprocessing.y_test

		self.x_train = self.preprocessing.sequence_to_token(raw_x_train)
		self.x_test = self.preprocessing.sequence_to_token(raw_x_test)
        
		#------------fine tune dataset processing----------------------
		self.preprocessingft = PreprocessingFT(args)
		self.preprocessingft.load_data()
		self.preprocessingft.prepare_tokens()

		raw_x_finetune = self.preprocessingft.x_train
		raw_x_fttest = self.preprocessingft.x_test
		
		self.y_finetune = self.preprocessingft.y_train
		self.y_fttest = self.preprocessingft.y_test

		self.x_finetune = self.preprocessingft.sequence_to_token(raw_x_finetune)
		self.x_fttest = self.preprocessingft.sequence_to_token(raw_x_fttest)
		
	def train(self):
		
		training_set = DatasetMaper(self.x_train, self.y_train)
		test_set = DatasetMaper(self.x_test, self.y_test)
        
		finetune_set = DatasetMaper(self.x_finetune, self.y_finetune)
		fttest_set = DatasetMaper(self.x_fttest, self.y_fttest)
		
		self.loader_training = DataLoader(training_set, batch_size=self.batch_size)
		self.loader_test = DataLoader(test_set)
        
		self.loader_finetune = DataLoader(finetune_set, batch_size=self.batch_size)
		self.loader_fttest = DataLoader(fttest_set)
		
		#optimizer = optim.RMSprop(self.model.parameters(), lr=args.learning_rate)
		optimizer = optim.Adam(self.model.parameters(), lr=args.learning_rate)
		for epoch in range(args.epochs):
			
			predictions = []
			
			self.model.train()
			
			for x_batch, y_batch in self.loader_training:
				
				x = x_batch.type(torch.LongTensor)
				#x.to(self.device)
				x=torch.tensor(x,device=self.device)
				#print("x device--->",x.device)
				#print("model device--->",next(self.model.parameters()).is_cuda)
				#for m in self.model.parameters():
					#print(m.device) #return cuda:0
				#y = y_batch.type(torch.FloatTensor)
				y = y_batch.type(torch.LongTensor)
				y=torch.tensor(y,device=self.device)
				
				y_pred = self.model(x)
				#y = y.unsqueeze(1)
				#y = y.type(torch.LongTensor)
				#loss = F.binary_cross_entropy(y_pred, y)
				#print("hjh check shape----->",y_pred.shape,y.shape)
				loss = self.CEL(y_pred, y)
				
				optimizer.zero_grad()
				
				loss.backward()
				
				optimizer.step()
				#torch.argmax(y_pred,dim=-1)
				#predictions += list(y_pred.squeeze().detach().numpy())
				predictions += list(torch.argmax(y_pred,dim=-1).cpu().detach().numpy())
				#break
			
			test_predictions = self.evaluation('train')
			
			train_accuary = self.calculate_accuray(self.y_train, predictions)
			test_accuracy = self.calculate_accuray(self.y_test, test_predictions)
			
			print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuary, test_accuracy))
			
			
		#----------------------fine tune and test---------------------------
		for epoch in range(20):
			
			predictions = []
			
			self.model.train()
			
			for x_batch, y_batch in self.loader_finetune:
				
				x = x_batch.type(torch.LongTensor)
				x=torch.tensor(x,device=self.device)
				y = y_batch.type(torch.LongTensor)
				y=torch.tensor(y,device=self.device)
				
				y_pred = self.model(x)
				loss = self.CEL(y_pred, y)
				
				optimizer.zero_grad()
				
				loss.backward()
				
				optimizer.step()
				predictions += list(torch.argmax(y_pred,dim=-1).cpu().detach().numpy())
			
			#test_predictions = self.evaluation('test')
			test_predictions, test_all_pred = self.evaluation('test')
			
			train_accuary = self.calculate_accuray(self.y_finetune, predictions)
			test_accuracy = self.calculate_accuray(self.y_fttest, test_predictions)
			
			print("Epoch: %d, loss: %.5f, Train accuracy: %.5f, Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuary, test_accuracy))
			
			if((epoch==4 or epoch==9 or epoch==14 or epoch==19)==False):
				continue
			
			save_path="/kaggle/working/baselineLSTM_cpu_epoch_"+str(epoch+1)+"_test_res.csv"
			n=len(self.y_fttest)
			record=[]
			for j in range(0,n):
				tmp={"index":j, "label":self.y_fttest[j], "prediction":test_predictions[j], "all_pred":test_all_pred[j]}
				record.append(tmp)

			with open(save_path, 'w', newline='') as csvfile:
				fieldnames = ['index', 'label','prediction','all_pred']
				writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

				writer.writeheader()
				writer.writerows(record)
			print(save_path)
            
	def evaluation(self,mode):

		predictions = []
		self.model.eval()
		if(mode=='train'):
			loader=self.loader_test
		elif(mode=='test'):
			loader=self.loader_fttest
		with torch.no_grad():
			#for x_batch, y_batch in self.loader_test:
			all_pred=[]#add
			for x_batch, y_batch in loader:
				x = x_batch.type(torch.LongTensor)
				x=torch.tensor(x,device=self.device)
				#y = y_batch.type(torch.FloatTensor)
				y = y_batch.type(torch.LongTensor)
				y=torch.tensor(y,device=self.device)
				
				y_pred = self.model(x)
				all_pred += list(y_pred.cpu().detach().numpy())#add
				predictions += list(torch.argmax(y_pred,dim=-1).cpu().detach().numpy())
				
		if(mode=='train'):
			return predictions
		elif(mode=='test'):
			return predictions,all_pred
			
	@staticmethod
	def calculate_accuray(grand_truth, predictions):
		true_positives = 0
		true_negatives = 0
		#print(len(grand_truth),len(predictions))
		#print(grand_truth)
		#print(predictions)
		for true, pred in zip(grand_truth, predictions):
			#if (pred > 0.5) and (true == 1):
			if(pred==true):
				true_positives += 1
			#elif (pred < 0.5) and (true == 0):
			#else:
				#true_negatives += 1
			#else:
				#pass
				
		return (true_positives+true_negatives) / len(grand_truth)

In [25]:
args = MyArgs()

# specify GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print(device)

execute = Execute(args,device)
execute.train()

#from keras_preprocessing.sequence import pad_sequences

cuda:0




Epoch: 1, loss: 1.38575, Train accuracy: 0.26491, Test accuracy: 0.37483
Epoch: 2, loss: 1.36121, Train accuracy: 0.34517, Test accuracy: 0.46335
Epoch: 3, loss: 1.34569, Train accuracy: 0.40959, Test accuracy: 0.47994
Epoch: 4, loss: 1.34163, Train accuracy: 0.44772, Test accuracy: 0.47994
Epoch: 5, loss: 1.32444, Train accuracy: 0.46956, Test accuracy: 0.47994




Epoch: 1, loss: 1.36714, Train accuracy: 0.31963, Test accuracy: 0.39439
Epoch: 2, loss: 1.35831, Train accuracy: 0.31589, Test accuracy: 0.39439
Epoch: 3, loss: 1.36375, Train accuracy: 0.31402, Test accuracy: 0.39439
Epoch: 4, loss: 1.37708, Train accuracy: 0.32710, Test accuracy: 0.39439
Epoch: 5, loss: 1.37859, Train accuracy: 0.30841, Test accuracy: 0.39439
/kaggle/working/baselineLSTM_cpu_epoch_5_test_res.csv
Epoch: 6, loss: 1.35470, Train accuracy: 0.29533, Test accuracy: 0.39439
Epoch: 7, loss: 1.36501, Train accuracy: 0.33645, Test accuracy: 0.39439
Epoch: 8, loss: 1.34816, Train accuracy: 0.32336, Test accuracy: 0.39439
Epoch: 9, loss: 1.35056, Train accuracy: 0.33832, Test accuracy: 0.39439
Epoch: 10, loss: 1.34927, Train accuracy: 0.32897, Test accuracy: 0.39439
/kaggle/working/baselineLSTM_cpu_epoch_10_test_res.csv
Epoch: 11, loss: 1.35989, Train accuracy: 0.31028, Test accuracy: 0.39439
Epoch: 12, loss: 1.35175, Train accuracy: 0.33271, Test accuracy: 0.39439
Epoch: 13, l

In [4]:
!pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2
[0m