In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\indra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\indra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\indra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import re
from nltk.stem import PorterStemmer
ps = PorterStemmer()
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

def preprocess_data(new_df):
    remove_html_tags='<.*?>';
    remove_urls='http\S+';
    remove_non_alpha='[^A-Za-z ]'
    remove_extra_space=' +'
    processed=[]
    new_df=list(new_df)
    for i in range(len(new_df)):
        s=str(new_df[i])
        if s=="":
            continue
        s=re.sub(remove_html_tags,"",s)
        s=re.sub(remove_urls,"",s)
        s=re.sub(remove_non_alpha,"",s)
        s=re.sub(remove_extra_space," ",s)
        if s=="":
            continue
        processed.append(s)
        
        
    stop_words = set(stopwords.words('english'))
    final_processed_text=[]
    
    for i in range(len(processed)):
        s=processed[i]
        s=s.split(" ")
        s=[ps.stem(word) for word in s if word not in stop_words]
        final_processed_text.append(s)
        
    return final_processed_text


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\indra\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
class DataLoader:
    
    def __init__(self):
        
        self.wordmap={}
        self.wordcount=0
        self.occurences={}
        self.total_sentences=0
    
    def create(self,data):
        
        for sentence in data:
            
            for i in sentence+["end"]:
                i=i.lower()
                
                if i not in self.wordmap:
                    self.wordmap[i]=[self.wordcount,0]
                    self.wordcount+=1
                self.wordmap[i][1]+=1
    
    def get_term_frequency(self,sentence,word):
        
        sentence=[i.lower() for i in sentence]
        return sentence.count(word)/len(sentence)
        
                    
                
    def get_inverse_document_frequency(self,word):
        inv_doc_freq=np.log(self.total_sentences/self.wordmap[word][1])
        return inv_doc_freq
                
    def get_embeddings(self,sentence):
        

        temp=[0]*self.wordcount
        for i in sentence:
            i=i.lower()
            if i in self.wordmap:
                temp[self.wordmap[i][0]]=self.get_term_frequency(sentence,i)*self.get_inverse_document_frequency(i)
        
        return temp

In [4]:
f=open("./data/train-labeled.txt",'r')
test_f=open("./data/dev-text.txt")
key_f=open("./data/dev-key.txt")

hashmap={"id":[],"label1":[],"label2":[],"text":[]}
for i in f.readlines():
    data=i.split(" ")
    hashmap["id"].append(data[0])
    hashmap["label1"].append(data[1])
    hashmap["label2"].append(data[2])
    hashmap["text"].append(" ".join(data[3:]).rstrip())
df=pd.DataFrame(hashmap)
df['text']=preprocess_data(df['text'])

key={i.split(" ")[0]:i.split(" ")[1:] for j in range(2) for i in key_f.readlines()}
test_map={"id":[],"text":[],"label1":[],"label2":[]}
for i in test_f.readlines():
    data=i.split(" ")
    test_map["id"].append(data[0])
    test_map["text"].append(" ".join(data[1:]))
    test_map["label1"].append(key[data[0]][0])
    test_map["label2"].append(key[data[0]][1].rstrip())
    
test_df=pd.DataFrame(test_map)
test_df['text']=preprocess_data(test_df['text'])
print(df,test_df)

          id label1 label2                                               text
0    07Zfn0z   Fake    Pos  [if, your, look, eleg, hotel, downtown, chicag...
1    08HSeiI   Fake    Pos  [the, atmospher, talbott, hotel, welcom, first...
2    0L52Itl   True    Neg  [id, search, cool, nonchain, hotel, weekend, g...
3    0LcSUgS   True    Pos  [i, vacat, fairmont, chicago, night, juli, the...
4    0N9L6lV   Fake    Neg  [the, fairmont, chicago, millennium, park, one...
..       ...    ...    ...                                                ...
955  zTPdVsT   Fake    Pos  [the, talbot, hotel, eleg, place, take, wife, ...
956  zWuJa6N   Fake    Neg  [my, husband, i, recent, stay, fairmont, chica...
957  zfeuazq   True    Neg  [i, surpris, fact, extra, sheet, blanket, make...
958  zj2gpGP   True    Neg  [i, reserv, rock, star, suit, boyfriend, birth...
959  zwf3FEc   True    Neg  [i, expect, glamor, room, i, walk, disappoint,...

[960 rows x 4 columns]           id                            

In [5]:
minlen=float("inf")
maxlen=float("-inf")
for i in df['text']:
    minlen=min(minlen,len(i))
    maxlen=max(maxlen,len(i))
print(minlen,maxlen)

13 413


In [6]:
dataloader=DataLoader()
dataloader.create(df['text'])
dataloader.total_sentences=len(df['text'])

In [7]:
def get_word2vec_embeddings(Xtrain,Xtest,fixedsize=0,dataloader=None):
    Xtrain_embeddings=[]
    Xtest_embeddings=[]
    

    for i in Xtrain:
        Xtrain_embeddings.append(dataloader.get_embeddings(i))
        
    for i in Xtest:
            
        Xtest_embeddings.append(dataloader.get_embeddings(i))
    
    
    return Xtrain_embeddings,Xtest_embeddings

In [8]:
from sklearn.model_selection import train_test_split

train_df=df.sample(frac=1)
# train_df=df
Xtrain,ytrain1,ytrain2=train_df['text'],train_df['label1'],train_df['label2']
Xtest,ytest1,ytest2=test_df['text'],test_df['label1'],test_df['label2']
Xtrain_emb,Xtest_emb=get_word2vec_embeddings(Xtrain,Xtest,fixedsize=200,dataloader=dataloader)
Xtrain_emb=np.asarray(Xtrain_emb)
Xtest_emb=np.asarray(Xtest_emb)
classmap={"Fake":-1,"Neg":-1,"Pos":1,"True":1}
ytrain1=[classmap[i] for i in ytrain1]
ytrain2=[classmap[i] for i in ytrain2]
ytest1=[classmap[i] for i in ytest1]
ytest2=[classmap[i] for i in ytest2]





class Node:
    
    def __init__(self,w_size=1000,bias=0):
        
        self.w=np.asarray([0]*w_size)
        self.bias=bias
    
    def __init__(self,w_size=1000,bias=0,avg=True):
        self.w=np.asarray([0]*w_size)
        self.bias=bias
        self.cw=np.asarray([0]*w_size)
        self.cbias=bias
        
class Perceptron:
    def __init__(self,model_type='vanilla',emb_size=4500,bias=0,n=2):
        self.emb_size=emb_size
        self.model_type=model_type
        self.perceptrons=[Node(emb_size,bias),Node(emb_size,bias,True)]
        
        
    def forward(self,inp):
        out=[]
        for i in range(len(self.perceptrons)):
            prod=np.dot(inp,self.perceptrons[i].w)
            s=prod+self.perceptrons[i].bias
            out.append(s)
        return out
    
    def train(self,data,label1,label2):

        if self.model_type=='vanilla':

            for inp,lab1,lab2 in zip(data,label1,label2):
                out=self.forward(np.asarray(inp))
                lab=[lab1,lab2]
                for i in range(len(out)):
                    if out[i]*lab[i]<=0:
                        self.perceptrons[i].w=self.perceptrons[i].w+inp*lab[i]
                        self.perceptrons[i].bias=self.perceptrons[i].bias+lab[i]
        else:
            counter=0
            for inp,lab1,lab2 in zip(data,label1,label2):
                out=self.forward(np.asarray(inp))
                lab=[lab1,lab2]
                if out[0]*lab[0]<=0:
                    
                    self.perceptrons[0].w=self.perceptrons[0].w+inp*lab[0]
                    
                    self.perceptrons[0].bias=self.perceptrons[0].bias+lab[0]
                    
#                     self.perceptrons[0].cw=self.perceptrons[0].cw+lab[0]*counter*inp
                    
#                     self.perceptrons[0].cbias=self.perceptrons[0].cbias+lab[0]*counter
                    self.perceptrons[0].cw=self.perceptrons[0].cw+self.perceptrons[0].w
                    
                    self.perceptrons[0].cbias=self.perceptrons[0].cbias+self.perceptrons[0].bias
                    
                    
                
                if out[1]*lab[1]<=0:
                    
                    self.perceptrons[1].w=self.perceptrons[1].w+inp*lab[1]
                    
                    self.perceptrons[1].bias=self.perceptrons[1].bias+lab[1]
                    
#                     self.perceptrons[1].cw=self.perceptrons[1].cw+lab[1]*counter*inp
                    
#                     self.perceptrons[1].cbias=self.perceptrons[1].cbias+lab[1]*counter
                    
                    self.perceptrons[1].cw=self.perceptrons[1].cw+self.perceptrons[1].w
                    
                    self.perceptrons[1].cbias=self.perceptrons[1].cbias+self.perceptrons[1].bias
                    
                counter+=1

            self.perceptrons[0].w=(1/counter)*self.perceptrons[0].cw
            
            self.perceptrons[0].bias=(1/counter)*self.perceptrons[0].cbias
            
            self.perceptrons[1].w=(1/counter)*self.perceptrons[1].cw
            
            self.perceptrons[1].bias=(1/counter)*self.perceptrons[1].cbias

             
                        
    def predict(self,data):
        out=[]
        for i in data:
            out.append(self.forward(i))
        pred1=[]
        pred2=[]
        for i in out:
            temp=[1,1]
            if i[0]<0:
                temp[0]=-1
            if i[1]<0:
                temp[1]=-1
            pred1.append(temp[0])
            pred2.append(temp[1])
        return pred1,pred2
    

        
        
        

In [9]:
dataloader.wordcount

5795

In [10]:
p=Perceptron(emb_size=dataloader.wordcount)

epoch_count=500

for i in range(epoch_count):
    
    p.train(Xtrain_emb,ytrain1,ytrain2)
    

pred=p.predict(Xtest_emb)

from sklearn.metrics import f1_score

print(f1_score(ytest1,pred[0]),f1_score(ytest2,pred[1]))

0.6504065040650407 0.9240121580547113


In [11]:
p=Perceptron(emb_size=dataloader.wordcount,model_type='average')

epoch_count=500

for i in range(epoch_count):

    p.train(Xtrain_emb,ytrain1,ytrain2)

pred=p.predict(Xtest_emb)
print(f1_score(ytest1,pred[0]),f1_score(ytest2,pred[1]))

0.8023598820058997 0.9184290030211479


In [12]:
p.perceptrons[0]

<__main__.Node at 0x2d30f714c48>