<a href="https://colab.research.google.com/github/HuyenNguyenHelen/LING-5412/blob/main/Midterm_Perceptron.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# Importing libraries that will be used 
import numpy as np
import tarfile
import glob
import re
import pandas as pd
#from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading the data

In [29]:
!wget http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

--2021-10-07 01:42:35--  http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3127238 (3.0M) [application/x-gzip]
Saving to: ‘review_polarity.tar.gz.2’


2021-10-07 01:42:35 (13.9 MB/s) - ‘review_polarity.tar.gz.2’ saved [3127238/3127238]



In [30]:
# Untar the dataset
my_tar = tarfile.open('/content/review_polarity.tar.gz')
my_tar.extractall('/content/') 
my_tar.close()


In [31]:
# Exploring the data sizes

paths_pos = glob.glob('/content/txt_sentoken/pos/*.txt')
paths_neg = glob.glob('/content/txt_sentoken/neg/*.txt')
pos_neg_paths = paths_pos + paths_neg

n_pos = len(paths_pos)
n_neg = len(paths_neg)

print('the number of positive instances: {} \nthe number of positive instances: {}'.format(n_pos, n_neg))

the number of positive instances: 1000 
the number of positive instances: 1000


In [32]:
# Reformating the dataset into csv for convenience 

def to_df (folder):
  data_dic = {}
  data_dic['doc'], data_dic['label'] = [], []
  for file in folder:
    fo = open(file)
    doc = fo.read()
    data_dic['doc'].append(doc)
    if 'pos' in file:
      data_dic['label'].append(1)
    elif 'neg' in file:
      data_dic['label'].append(-1)
    else:
      print('error', file)
  df = pd.DataFrame.from_dict(data_dic)
  return df
    
data = to_df(pos_neg_paths)

data.head()

Unnamed: 0,doc,label
0,the long kiss goodnight ( r ) meryl streep tri...,1
1,there seem to be two reactions to dark city . ...,1
2,note : some may consider portions of the follo...,1
3,i swear i have seen the edge before . \nin fac...,1
4,"in some respects , rush hour is the ultimate e...",1


# Data pre-processing

In [33]:
# Data preprocessing
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Exploring the words in the dataset

from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def tokenizer (doc):
  return re.split("\W+", doc)   # return a list of tokens without punctuations

def stopword_remover (bow):
  filtered_bow = [w for w in bow if not w.lower() in stopwords]
  return filtered_bow

def top_freq_w (freq_dic, top_n, stopword_removing = ''):
  sorted_dic = {k:v for k, v in sorted(freq_dic.items(), key = lambda item: item[1], reverse=True)}
  if stopword_removing is False:
    return {k:v for k, v in list(sorted_dic.items())[:top_n]}
  elif stopword_removing is True:
    filtered_dic = {k: v for k, v in sorted_dic.items() if k not in stopwords}
    return {k:v for k, v in list(filtered_dic.items())[:top_n]}
  

def preprocessor (text):
  ## removing punctuations and characters
  text = re.sub(r'[^\w\s]', '', text)
  # stripping
  text = ' '.join([w.strip() for w in text.split()])
  ## lowcasing
  text = text.lower()
  # ## removing stopword
  text = stopword_remover (text.split())
  # ##stemmming
  text = [stemmer.stem(w) for w in text]
  # ## lematization
  text = [lemmatizer.lemmatize(w) for w in text]
  return ' '.join([w for w in text])

data['doc'] = data['doc'].apply(lambda x:  preprocessor (x) )
data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,doc,label
0,long kiss goodnight r meryl streep tri fail ev...,1
1,seem two reaction dark citi either love unimpr...,1
2,note may consid portion follow text spoiler fo...,1
3,swear seen edg fact remind bear river wild var...,1
4,respect rush hour ultim exercis clich filmmak ...,1
...,...,...
1995,sandra bullock high heel wield chainsaw yup go...,-1
1996,senseless prime exampl happen tri push onejok ...,-1
1997,mr bean bumbl secur guard england sent la help...,-1
1998,thing wors watch bad movi realiz film lot pote...,-1


In [34]:
# Spliting the dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split (data['doc'], data['label'] , train_size = 0.8, random_state = 42, shuffle = True, stratify=data['label'])
print ('Shapes of X_train, y_train: ', X_train.shape, y_train.shape)
print ('Shapes of X_test, y_test: ', X_test.shape, y_test.shape)

Shapes of X_train, y_train:  (1600,) (1600,)
Shapes of X_test, y_test:  (400,) (400,)


In [35]:
len(X_train.to_list())

1600

# Feature Representation
As requested, we are going to extract 10000 words from the training set for representing the documents

In [36]:
## As required, we will use 10000 most frequent word from training, excluding stopwords

# Getting terms and their frequency 
word_freq = {}
for doc in X_train.to_list():
  for token in tokenizer (doc):
    word_freq[token] = word_freq.get(token,0)+1

# Getting 10000 terms with highest frequency, excluding stopwords
vocabulary = top_freq_w(word_freq, 10000, stopword_removing = True)


In [37]:
# binary representing documents based on occurrance of features in documents
def doc_representor (doc):
  doc_vec = []
  token_list = tokenizer (doc)
  for feature in vocabulary.keys():
    if feature in token_list:
      doc_vec.append(1)
    else:
       doc_vec.append(0)
  return doc_vec

X_train = X_train.apply(lambda x: doc_representor(x))
X_test = X_test.apply(lambda x: doc_representor(x))


In [38]:
# Visualize the data after representing
print('feature representation of documents in TRAINING set')
X_train = X_train.apply(pd.Series)
X_train.columns = vocabulary.keys()
print(X_train.shape)
X_train

feature representation of documents in TRAINING set
(1600, 10000)


Unnamed: 0,film,movi,one,like,charact,get,make,time,scene,even,good,play,stori,see,would,much,also,go,way,two,end,seem,look,take,first,come,well,work,thing,realli,perform,plot,know,year,littl,peopl,life,love,could,never,...,epstein,drawer,pelt,unison,kristi,fallaci,su,sank,sherilyn,fenn,boull,wellintent,paw,screentim,miscalcul,mca,mammoth,eraserhead,harkonnen,fremen,haphazard,carlo,eno,dissatisfact,stung,radha,cholodenko,receptionist,indecipher,clarkson,handinhand,paymer,firth,mountainsid,underwear,rigor,enrag,peanut,audibl,commenc
478,1,1,1,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,1,1,0,0,1,0,1,1,1,1,0,1,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
488,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,0,0,1,1,1,0,1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1499,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1,1,0,0,0,0,0,1,1,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1605,0,0,1,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
511,1,1,1,0,0,1,0,1,0,0,1,1,0,0,0,1,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,1,1,1,1,1,1,0,1,1,1,0,0,1,0,0,1,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,1,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1452,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
248,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
303,1,0,1,1,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
# Visualize the data after representing
print('feature representation of documents in TEST set')
X_test = X_test.apply(pd.Series)
X_test.columns = vocabulary.keys()
print(X_test.shape)
X_test

feature representation of documents in TEST set
(400, 10000)


Unnamed: 0,film,movi,one,like,charact,get,make,time,scene,even,good,play,stori,see,would,much,also,go,way,two,end,seem,look,take,first,come,well,work,thing,realli,perform,plot,know,year,littl,peopl,life,love,could,never,...,epstein,drawer,pelt,unison,kristi,fallaci,su,sank,sherilyn,fenn,boull,wellintent,paw,screentim,miscalcul,mca,mammoth,eraserhead,harkonnen,fremen,haphazard,carlo,eno,dissatisfact,stung,radha,cholodenko,receptionist,indecipher,clarkson,handinhand,paymer,firth,mountainsid,underwear,rigor,enrag,peanut,audibl,commenc
1748,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,0,0,1,0,1,1,1,0,1,0,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1020,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,0,1,0,1,1,1,1,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88,1,1,1,0,1,1,1,1,0,1,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1466,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,0,1,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1491,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,1,0,1,1,1,1,1,1,1,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1821,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0,1,0,0,1,0,1,1,1,0,1,0,1,1,0,0,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1099,0,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1654,1,1,1,0,1,1,1,1,1,1,0,1,1,0,0,1,0,1,1,0,0,0,1,1,0,1,1,1,0,0,0,0,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Perceptron Model

In [41]:
# Writing functions

def training (X, Y):
  n_iter = 10
  weight = np.zeros(X.shape[1])
  bias = 0
 # print(X[1:])
  for i in range(n_iter):
    loss = 0
    for x, y in zip(X.values,Y):
      a = sum(np.dot(weight, X.values.T) + bias)
      ya = np.dot(y, a)
      if ya <=0:
        weight += np.dot(x, y)
        bias +=  y
      loss += np.max([0, ya*(-1)])
    print('averaged loss in iteration {}: {}'.format(i, np.average(loss))
  return weight, bias

def average_loss(ya):
  

def predict (X, weight, bias):
 a = np.dot(weight, X.values.T) + bias
 y_class = [1 if i >= 0 else -1 for i in a]
 return y_class

def printing_eval_scores (y_true, y_pred, report=''):
  accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
  precision = sklearn.metrics.precision_score(y_true, y_pred)
  recall = sklearn.metrics.recall_score(y_true, y_pred)
  f1 = sklearn.metrics.f1_score(y_true, y_pred)
  print('accuracy score: {:.3f}'.format(accuracy))
  print('precision score: {:.3f}'.format(precision))
  print('recall score: {:.3f}'.format(recall))
  print('F1 score: {:.3f}'.format(f1))
  if report is True:
    print(classification_report(y_true, y_pred))
  else:
    pass
  return accuracy, precision, recall, f1

weight, bias = training (X_train, y_train)
y_pred = predict (X_test, weight, bias)


## on training set
print('Model performance on training set:')
printing_eval_scores (y_train, predict(X_train,weight, bias), report = False)

## on test set
print('\nModel performance on test set:')
printing_eval_scores (y_test, y_predict, report = True)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
54327.0
54327.0
-2448.0
47119.0
-16464.0
-16464.0
38062.0
-37044.0
-37044.0
-37044.0
12581.0
12581.0
12581.0
-45572.0
5136.0
-60447.0
20711.0
20711.0
20711.0
-21145.0
-21145.0
-21145.0
-21145.0
-21145.0
-21145.0
-21145.0
-21145.0
-21145.0
54251.0
54251.0
-11018.0
39823.0
-20124.0
-20124.0
-20124.0
52090.0
52090.0
52090.0
-21541.0
-21541.0
47487.0
-16882.0
69388.0
-3717.0
-3717.0
60756.0
60756.0
60756.0
-20765.0
50181.0
-14206.0
-14206.0
27779.0
27779.0
-30279.0
-30279.0
67037.0
24994.0
-48259.0
-10329.0
84825.0
84825.0
84825.0
84825.0
84825.0
84825.0
84825.0
40827.0
767.0
-100343.0
-38692.0
13657.0
-46825.0
29448.0
29448.0
-41559.0
-41559.0
59188.0
59188.0
25330.0
-35586.0
38351.0
38351.0
38351.0
38351.0
38351.0
38351.0
38351.0
-2490.0
56101.0
-802.0
-802.0
27558.0
27558.0
27558.0
27558.0
-25072.0
23923.0
-53676.0
-53676.0
-53676.0
-53676.0
23266.0
23266.0
-34243.0
-34243.0
38600.0
38600.0
-19949.0
68137.0
68137.0
68137.0

accuracy score: 0.833
precision score: 0.845
recall score: 0.815
F1 score: 0.830
              precision    recall  f1-score   support

          -1       0.82      0.85      0.84       200
           1       0.84      0.81      0.83       200

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400



(0.8325, 0.844559585492228, 0.815, 0.8295165394402036)