In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
df = pd.read_csv("/content/drive/MyDrive/uOttawa/S1/NLP/project2/train_top_10_pp.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Tags,tag_count,description,code
0,0,how check uploaded file image without mime type,php,1,id like check uploaded file image file eg png ...,
1,1,how i replace special characters url,c#,1,this probably simple i simply cannot find answ...,
2,2,how fetch xml feed using aspnet,c# asp.net,2,ive decided convert windows phone app fetches ...,
3,3,php framework url conventions,php,1,a lot frameworks use url conventions like grea...,/controller/action/{id}
4,4,play framework auto javascript css minifier,javascript,1,does anyone know good play plugin automaticall...,


In [7]:
print(df.shape)

df["code"] = df["code"].fillna("")
df["description"] = df["description"].fillna("")
df = df[df['Title'].notna()]
print(df.shape)

(1898097, 6)
(1898077, 6)


In [8]:
df

Unnamed: 0.1,Unnamed: 0,Title,Tags,tag_count,description,code
0,0,how check uploaded file image without mime type,php,1,id like check uploaded file image file eg png ...,
1,1,how i replace special characters url,c#,1,this probably simple i simply cannot find answ...,
2,2,how fetch xml feed using aspnet,c# asp.net,2,ive decided convert windows phone app fetches ...,
3,3,php framework url conventions,php,1,a lot frameworks use url conventions like grea...,/controller/action/{id}
4,4,play framework auto javascript css minifier,javascript,1,does anyone know good play plugin automaticall...,
...,...,...,...,...,...,...
1898092,1898092,need help friendly urls wordpress,php,1,im creating som custom templates wordpress im ...,/%category%/%postname%/
1898093,1898093,bitwise subtraction,c#,1,given enum if i know foo contains previously i...,c
1898094,1898094,javascript resize every image load,php javascript,2,ive got code i want resize function called eve...,
1898095,1898095,update database big csv,php,1,i need every day update items mysql i upload c...,


**List of Tags**

In [9]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split())
tag_bow = vectorizer.fit_transform(df['Tags'])

In [10]:
tags = vectorizer.get_feature_names()
tags



['android',
 'asp.net',
 'c#',
 'c++',
 'iphone',
 'java',
 'javascript',
 'jquery',
 'php',
 'python']

In [11]:
tags[3] = "c+"
tags

['android',
 'asp.net',
 'c#',
 'c+',
 'iphone',
 'java',
 'javascript',
 'jquery',
 'php',
 'python']

**Extract 4000 Sentences for each tag**

In [12]:
new_df = df.loc[df['Tags'].str.contains("c+")]
len(new_df)

722903

In [13]:
new_df = df.loc[df['Tags'].str.contains('android', case=False)]
print(len(new_df))
dataset=new_df[:4000]
#print(len(dataset))
for tag in tags:
  new_df = df.loc[df['Tags'].str.contains(tag)]
  new_df = new_df[:4000]
  print(len(new_df))
  dataset = dataset.append(new_df)
  dataset = dataset.drop_duplicates(keep='first')
  print(tag," ",len(dataset))


232412
4000
android   4000
4000
asp.net   7997
4000
c#   11430
4000
c+   13603
4000
iphone   17552
4000
java   19767
4000
javascript   21723
4000
jquery   24321
4000
php   27828
4000
python   31740


In [14]:
dataset.shape

(31740, 6)

In [15]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Title,Tags,tag_count,description,code
13,13,loadable kernel module notification manager,android,1,ive successfully created installed loadable ke...,
45,45,eclipse indigo allow override non activity ove...,android,1,i using eclipse indigo android development the...,
52,52,android v webview loaddatawithbaseurl load page,android,1,when im loading page using system load page on...,.loadData
54,54,android changing locale within app affect firs...,java android,2,i use code bellow setcontentview main activity...,"Locale locale = new Locale(""ar"");"
58,58,android client socket unknowhostexception,java android,2,i working server application java multithreade...,


In [16]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
y_multilabel = vectorizer.fit_transform(dataset['Tags'])

In [17]:
x_train, x_test, y_train, y_test = train_test_split(dataset, y_multilabel, test_size = 0.2, random_state = 42)

print("Number of data points in training data :", x_train.shape[0])
print("Number of data points in test data :", x_test.shape[0])

Number of data points in training data : 25392
Number of data points in test data : 6348


In [18]:
vectorizer = TfidfVectorizer(tokenizer = lambda x: x.split())

x_train_multilabel = vectorizer.fit_transform(x_train["Title"])
x_test_multilabel = vectorizer.transform(x_test["Title"])

In [19]:
print("Training data shape X : ", x_train_multilabel.shape, "Y :", y_train.shape)
print("Test data shape X : ", x_test_multilabel.shape, "Y:", y_test.shape)

Training data shape X :  (25392, 15127) Y : (25392, 10)
Test data shape X :  (6348, 15127) Y: (6348, 10)


# **Designed Neural Network**

In [20]:
import torch.optim as optim
import torch.nn as nn
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader

In [21]:
x_train_multilabel=x_train_multilabel.todense()
x_train_multilabel = torch.tensor(x_train_multilabel).type(torch.FloatTensor)
y_train = y_train.todense()
y_train = torch.tensor(y_train).type(torch.FloatTensor)

In [22]:
x_train_multilabel.shape

torch.Size([25392, 15127])

In [23]:
y_train.shape

torch.Size([25392, 10])

In [24]:
x_y_concat = zip(x_train_multilabel,y_train)
x_y_concat = list(x_y_concat)

In [25]:
batch_size =64
trainDataLoader = DataLoader(x_y_concat, shuffle=True,batch_size=batch_size)

In [26]:
class DNN(nn.Module):

    def __init__(self,num_features):
        super(DNN, self).__init__()
        
        self.fc1 = nn.Linear(num_features, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

        
        self.relu = nn.ReLU()
        self.Sigmoid = torch.nn.Sigmoid()

      
    def forward(self, x):
        #print(x.type)

        x = self.fc1(x)
        x = self.relu(x) 

        x = self.fc2(x)
        x = self.relu(x)
        
        x = self.fc3(x)
        x = self.Sigmoid(x)

        return x

In [27]:
n_epochs = 10
learning_rate = 0.001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
#loss_model = torch.nn.CrossEntropyLoss()
num_features = 15127
model_DNN = DNN(num_features).to(device)
opt = Adam(model_DNN.parameters(), lr= learning_rate)
lossFn = nn.BCELoss()

In [29]:
model_DNN

DNN(
  (fc1): Linear(in_features=15127, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
  (relu): ReLU()
  (Sigmoid): Sigmoid()
)

In [30]:
for e in range(0, n_epochs):
  
  model_DNN.train()

  print(f"epoch= {e}")

  for (x, y) in trainDataLoader:
		# send the input to the device
    (x, y) = (x.to(device), y.to(device))

    # perform a forward pass and calculate the training loss
    pred = model_DNN(x)
    loss = lossFn(pred, y)

		# zero out the gradients, perform the backpropagation step,
		# and update the weights
    opt.zero_grad()
    loss.backward()
    opt.step()


epoch= 0
epoch= 1
epoch= 2
epoch= 3
epoch= 4
epoch= 5
epoch= 6
epoch= 7
epoch= 8
epoch= 9


In [31]:
x_test_multilabel = x_test_multilabel.todense()
x_test_multilabel = torch.tensor(x_test_multilabel).type(torch.FloatTensor)
y_test = y_test.todense()
y_test = torch.tensor(y_test).type(torch.FloatTensor)

In [32]:
model_DNN.eval()

output = model_DNN(x_test_multilabel.to(device))
 
output=output.round()

In [33]:
output =output.cpu()
output= output.detach().numpy()
y_test = y_test.cpu()
y_test= y_test.detach().numpy()

In [34]:
print("Accuracy: ", metrics.accuracy_score(y_test, output))
print("Macro f1 score: ", metrics.f1_score(y_test, output, average = 'macro'))
print("Micro f1 scoore: ", metrics.f1_score(y_test, output, average = 'micro'))
print("Hamming loss: ", metrics.hamming_loss(y_test, output))

Accuracy:  0.5108695652173914
Macro f1 score:  0.6309677364916539
Micro f1 scoore:  0.6565843914966473
Hamming loss:  0.0758349086326402
