<a href="https://colab.research.google.com/github/HuyenNguyenHelen/LING-5412/blob/main/Midterm_Perceptron.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing libraries that will be used 
import numpy as np
import tarfile
import glob
import re
import pandas as pd
#from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading the data

In [2]:
!wget http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

--2021-10-04 18:31:17--  http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.36
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.36|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3127238 (3.0M) [application/x-gzip]
Saving to: ‘review_polarity.tar.gz.1’


2021-10-04 18:31:17 (15.5 MB/s) - ‘review_polarity.tar.gz.1’ saved [3127238/3127238]



In [3]:
# Untar the dataset
my_tar = tarfile.open('/content/review_polarity.tar.gz')
my_tar.extractall('/content/') 
my_tar.close()


In [4]:
# Exploring the data sizes

paths_pos = glob.glob('/content/txt_sentoken/pos/*.txt')
paths_neg = glob.glob('/content/txt_sentoken/neg/*.txt')
pos_neg_paths = paths_pos + paths_neg

n_pos = len(paths_pos)
n_neg = len(paths_neg)

print('the number of positive instances: {} \nthe number of positive instances: {}'.format(n_pos, n_neg))

the number of positive instances: 1000 
the number of positive instances: 1000


In [5]:
# Reformating the dataset into csv for convenience 

def to_df (folder):
  data_dic = {}
  data_dic['doc'], data_dic['label'] = [], []
  for file in folder:
    fo = open(file)
    doc = fo.read()
    data_dic['doc'].append(doc)
    if 'pos' in file:
      data_dic['label'].append(1)
    elif 'neg' in file:
      data_dic['label'].append(0)
    else:
      print('error', file)
  df = pd.DataFrame.from_dict(data_dic)
  return df
    
data = to_df(pos_neg_paths)

data.head()

Unnamed: 0,doc,label
0,the truman show ( paramount pictures ) running...,1
1,the sweet hereafter could serve as a textbook ...,1
2,america has finally gotten what it's needed fo...,1
3,"bill condon's "" gods and monsters "" is a fasci...",1
4,"eight years after its release , disney has dec...",1


# Data pre-processing

In [6]:
# Data preprocessing
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Exploring the words in the dataset

from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def tokenizer (doc):
  return re.split("\W+", doc)   # return a list of tokens without punctuations

def stopword_remover (bow):
  filtered_bow = [w for w in bow if not w.lower() in stopwords]
  return filtered_bow

def top_freq_w (freq_dic, top_n, stopword_removing = ''):
  sorted_dic = {k:v for k, v in sorted(freq_dic.items(), key = lambda item: item[1], reverse=True)}
  if stopword_removing is False:
    return {k:v for k, v in list(sorted_dic.items())[:top_n]}
  elif stopword_removing is True:
    filtered_dic = {k: v for k, v in sorted_dic.items() if k not in stopwords}
    return {k:v for k, v in list(filtered_dic.items())[:top_n]}
  

def preprocessor (text):
  ## removing punctuations and characters
  text = re.sub(r'[^\w\s]', '', text)
  # stripping
  text = ' '.join([w.strip() for w in text.split()])
  ## lowcasing
  text = text.lower()
  # ## removing stopword
  text = stopword_remover (text.split())
  # ##stemmming
  text = [stemmer.stem(w) for w in text]
  # ## lematization
  text = [lemmatizer.lemmatize(w) for w in text]
  return ' '.join([w for w in text])

data['doc'] = data['doc'].apply(lambda x:  preprocessor (x) )
data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,doc,label
0,truman show paramount pictur run time 1 hour 4...,1
1,sweet hereaft could serv textbook exampl diffe...,1
2,america final gotten need year compassion sinc...,1
3,bill condon god monster fascin look last day l...,1
4,eight year releas disney decid rereleas littl ...,1
...,...,...
1995,well stellar effect movi that realli found wat...,0
1996,spawn may somewhat older film fact probabl vid...,0
1997,watch movi vow subtract half star review filmm...,0
1998,alexand duma three musket one oftenadapt liter...,0


In [7]:
# Spliting the dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split (data['doc'], data['label'] , train_size = 0.8, random_state = 42, shuffle = True, stratify=data['label'])
print ('Shapes of X_train, y_train: ', X_train.shape, y_train.shape)
print ('Shapes of X_test, y_test: ', X_test.shape, y_test.shape)

Shapes of X_train, y_train:  (1600,) (1600,)
Shapes of X_test, y_test:  (400,) (400,)


In [8]:
len(X_train.to_list())

1600

# Feature Representation
As requested, we are going to extract 10000 words from the training set for representing the documents

In [9]:
## As required, we will use 10000 most frequent word from training, excluding stopwords

# Getting terms and their frequency 
word_freq = {}
for doc in X_train.to_list():
  for token in tokenizer (doc):
    word_freq[token] = word_freq.get(token,0)+1

# Getting 10000 terms with highest frequency, excluding stopwords
vocabulary = top_freq_w(word_freq, 10000, stopword_removing = True)


In [10]:
# binary representing documents based on occurrance of features in documents
def doc_representor (doc):
  doc_vec = []
  token_list = tokenizer (doc)
  for feature in vocabulary.keys():
    if feature in token_list:
      doc_vec.append(1)
    else:
       doc_vec.append(0)
  return doc_vec

X_train = X_train.apply(lambda x: doc_representor(x))
X_test = X_test.apply(lambda x: doc_representor(x))


In [11]:
# Visualize the data after representing
print('feature representation of documents in TRAINING set')
X_train = X_train.apply(pd.Series)
X_train.columns = vocabulary.keys()
print(X_train.shape)
X_train

feature representation of documents in TRAINING set
(1600, 10000)


Unnamed: 0,film,movi,one,like,charact,get,make,time,scene,even,stori,play,good,see,would,much,go,also,way,look,end,take,seem,two,well,come,first,work,thing,realli,year,know,plot,littl,perform,peopl,life,love,could,bad,...,hijink,jnr,ardent,giosu,junctur,latifah,sergio,allegi,vista,banner,tcheki,kjv,consol,halperin,morbid,miscalcul,glib,unengag,coattail,insinu,seren,allud,hierarchi,faceti,munch,bandag,kicker,headon,pipelin,dilut,milit,errand,earthi,hassl,machinegun,1939,gamut,bloom,nielson,writingdirect
478,0,0,1,0,1,0,1,0,1,1,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
488,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1499,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,0,0,1,1,0,1,1,1,0,0,1,1,1,0,1,1,1,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1605,0,1,1,1,0,1,0,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
511,1,1,1,0,1,0,1,1,1,1,0,0,1,0,1,0,0,1,0,0,0,1,0,1,1,1,0,0,0,1,1,0,0,0,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1452,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,1,0,0,0,1,1,0,1,0,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
248,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,0,1,1,0,0,1,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
303,1,1,1,1,0,0,1,1,1,0,1,0,1,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Visualize the data after representing
print('feature representation of documents in TEST set')
X_test = X_test.apply(pd.Series)
X_test.columns = vocabulary.keys()
print(X_test.shape)
X_test

feature representation of documents in TEST set
(400, 10000)


Unnamed: 0,film,movi,one,like,charact,get,make,time,scene,even,stori,play,good,see,would,much,go,also,way,look,end,take,seem,two,well,come,first,work,thing,realli,year,know,plot,littl,perform,peopl,life,love,could,bad,...,hijink,jnr,ardent,giosu,junctur,latifah,sergio,allegi,vista,banner,tcheki,kjv,consol,halperin,morbid,miscalcul,glib,unengag,coattail,insinu,seren,allud,hierarchi,faceti,munch,bandag,kicker,headon,pipelin,dilut,milit,errand,earthi,hassl,machinegun,1939,gamut,bloom,nielson,writingdirect
1748,1,1,1,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,1,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1020,1,1,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,1,0,1,0,0,0,1,1,1,1,0,0,0,0,1,0,0,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
88,0,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1466,1,0,1,1,1,1,0,1,1,1,0,1,0,1,1,1,1,0,0,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1491,1,1,1,1,1,0,1,0,0,0,1,0,0,0,1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,1,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1821,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,0,1,1,1,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1099,1,1,1,1,1,1,1,1,0,1,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1654,1,1,1,0,1,1,1,1,1,1,0,0,1,1,1,0,0,0,1,0,1,0,1,1,1,0,1,0,1,1,0,1,0,1,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
