<a href="https://colab.research.google.com/github/HuyenNguyenHelen/LING-5412/blob/main/Assignment2_LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing libraries that will be used 
import numpy as np
import tarfile
import glob
import re
import pandas as pd
#from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
from sklearn.model_selection import train_test_split


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading the data

In [2]:
# Untar the dataset
my_tar = tarfile.open('/content/review_polarity.tar.gz')
my_tar.extractall('/content/') 
my_tar.close()


In [3]:
# Exploring the data sizes

paths_pos = glob.glob('/content/txt_sentoken/pos/*.txt')
paths_neg = glob.glob('/content/txt_sentoken/neg/*.txt')
pos_neg_paths = paths_pos + paths_neg

n_pos = len(paths_pos)
n_neg = len(paths_neg)

print('the number of positive instances: {} \nthe number of positive instances: {}'.format(n_pos, n_neg))

the number of positive instances: 1000 
the number of positive instances: 1000


# Exploring the data

In [4]:
# Exploring the words in the dataset

from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def tokenizer (doc):
  #doc = doc.lower() # Lowercase documents
  return re.split("\W+", doc)   # return a list of tokens without punctuations

# def BOW (doc):
#   bow = set()
#   for token in tokenizer (doc):
#     bow.add(token)
#   return list(bow)

#def word_counter (doc):

def stopword_remover (bow):
  filtered_bow = [w for w in bow if not w.lower() in stopwords]
  return filtered_bow

def top_freq_w (freq_dic, top_n, stopword_removing = ''):
  sorted_dic = {k:v for k, v in sorted(freq_dic.items(), key = lambda item: item[1], reverse=True)}
  if stopword_removing is False:
    return {k:v for k, v in list(sorted_dic.items())[:top_n]}
  elif stopword_removing is True:
    filtered_dic = {k: v for k, v in sorted_dic.items() if k not in stopwords}
    return {k:v for k, v in list(filtered_dic.items())[:top_n]}
  


word_freq = {}
for path in pos_neg_paths:
  fo = open(path)
  doc = fo.read()
  for token in tokenizer (doc):
    word_freq[token] = word_freq.get(token,0)+1

top_100_w = top_freq_w(word_freq, 100, stopword_removing = False) 

print('the number of unique words in the dataset: ', len(word_freq.keys()))
print ('top 100 most frequent words:\n', top_100_w )
print('\nthe number of words in the top 100 which are stopwords: ', len([w for w in top_100_w.keys() if w in stopwords]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
the number of unique words in the dataset:  39697
top 100 most frequent words:
 {'the': 76529, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, 'is': 25195, 'in': 21822, 's': 18513, 'it': 16107, 'that': 15924, 'as': 11378, 'with': 10792, 'for': 9961, 'his': 9587, 'this': 9578, 'film': 9517, 'i': 8889, 'he': 8864, 'but': 8634, 'on': 7385, 'are': 6949, 't': 6410, 'by': 6261, 'be': 6174, 'one': 5852, 'movie': 5771, 'an': 5744, 'who': 5692, 'not': 5577, 'you': 5316, 'from': 4999, 'at': 4986, 'was': 4940, 'have': 4901, 'they': 4825, 'has': 4719, 'her': 4522, 'all': 4373, 'there': 3770, 'like': 3690, 'so': 3683, 'out': 3637, 'about': 3523, 'up': 3405, 'more': 3347, 'what': 3322, 'when': 3258, 'which': 3161, 'or': 3148, 'she': 3141, 'their': 3122, 'some': 2985, 'just': 2905, 'can': 2882, 'if': 2799, 'we': 2775, 'him': 2633, 'into': 2623, 'even': 2565, 'only': 2495, 'than':

In [5]:
# Reformating the dataset into csv for convenience 
def to_df (folder):
  data_dic = {}
  data_dic['doc'], data_dic['label'] = [], []
  for file in folder:
    fo = open(file)
    doc = fo.read()
    data_dic['doc'].append(doc)
    if 'pos' in file:
      data_dic['label'].append(1)
    elif 'neg' in file:
      data_dic['label'].append(0)
    else:
      print('error', file)
  df = pd.DataFrame.from_dict(data_dic)
  return df
    
data = to_df(pos_neg_paths)

data.head()

Unnamed: 0,doc,label
0,is evil dead ii a bad movie ? \nit's full of t...,1
1,logical time travel movies are a near-impossib...,1
2,robocop is an intelligent science fiction thri...,1
3,historical epic as a genre was almost banished...,1
4,"mike myers , you certainly did throw us a ? fr...",1


# Data preprocessing

In [6]:
# Data preprocessing
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def preprocessor (text):
  ## removing punctuations and characters
  text = re.sub(r'[^\w\s]', '', text)
  # stripping
  text = ' '.join([w.strip() for w in text.split()])
  # print(text)
  ## lowcasing
  text = text.lower()
  # ## removing stopword
  text = stopword_remover (text.split())
  # ##stemmming
  text = [stemmer.stem(w) for w in text]
  # ## lematization
  text = [lemmatizer.lemmatize(w) for w in text]
  return ' '.join([w for w in text])

data['doc'] = data['doc'].apply(lambda x:  preprocessor (x) )
data 

Unnamed: 0,doc,label
0,evil dead ii bad movi full terribl act pointle...,1
1,logic time travel movi nearimposs consid skept...,1
2,robocop intellig scienc fiction thriller socia...,1
3,histor epic genr almost banish hollywood earli...,1
4,mike myer certainli throw u frickin bone call ...,1
...,...,...
1995,slight romant comedi feminist bent one edg tur...,0
1996,perhap best rememb recent depart news anchor s...,0
1997,corruptor big silli mess action movi complet p...,0
1998,here rariti child film attempt tackl weighti s...,0


# Developing a Logistic Regression Model

In [7]:
# Preparing vocabulary
## As required, we will use 1000 most frequent word, excluding stopwords
### Preprocessing data for building vocabulary
cleaned_word_freq = {}
for path in pos_neg_paths:
  fo = open(path)
  doc = fo.read()
  cleaned_doc = preprocessor(doc)
  for token in tokenizer (cleaned_doc):
    cleaned_word_freq[token] = cleaned_word_freq.get(token,0)+1

vocabulary = top_freq_w(cleaned_word_freq, 1000, stopword_removing = True) 


In [9]:
# Feature engineering
# def feature_extractor (doc):
#   doc_vec = []
#   for feature in vocabulary.keys():
#     feature_count = 0
#     if feature in tokenizer (doc):
#       feature_count+=1
#     else:
#       feature_count+=0
#     doc_vec.append(feature_count) 
#   return doc_vec

def feature_extractor (doc):
  doc_vec = []
  token_list = tokenizer (doc)
  for feature in vocabulary.keys():
    # feature_count=0
    feature_count = token_list.count(feature)
    doc_vec.append(feature_count) 
  return doc_vec

X = data['doc'].apply(lambda x: feature_extractor(x))
y = data['label']

In [10]:

X = X.apply(pd.Series)
X.columns = vocabulary.keys()
X

Unnamed: 0,film,movi,one,like,charact,get,make,time,scene,even,good,play,stori,see,would,much,also,go,way,seem,end,look,two,take,first,come,well,work,thing,year,realli,plot,know,perform,littl,life,peopl,love,could,bad,...,remark,consist,nick,manner,front,cinematographi,command,menac,adam,spent,agre,standard,fairli,ground,budget,disturb,suddenli,grant,pair,appropri,90,connect,virtual,fantasi,godzilla,brown,trip,brief,cute,store,cameo,greatest,fascin,cultur,key,count,foot,satir,addit,bug
0,3,7,1,4,1,5,3,1,1,3,0,0,0,2,0,0,0,0,0,1,2,0,0,1,1,2,2,0,1,1,0,3,0,0,0,0,3,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,12,3,7,7,2,8,3,5,3,2,3,1,2,2,2,1,2,1,2,0,1,0,2,1,0,3,0,4,3,0,2,4,0,1,2,4,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,7,4,4,0,1,1,2,0,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0
3,9,0,2,3,3,0,5,3,2,3,4,7,0,0,2,1,6,1,0,0,1,2,0,2,0,0,1,0,2,2,0,1,0,0,0,4,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8,1,7,1,4,1,2,4,4,2,3,4,0,0,0,1,2,4,1,1,0,1,0,4,3,1,1,0,1,3,1,2,0,1,0,0,0,2,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,3,1,2,2,1,0,0,0,0,0,0,0,3,0,1,1,0,0,1,1,2,1,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,2,3,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0
1996,0,2,2,2,0,3,0,1,2,0,0,0,1,0,0,1,1,3,0,0,0,0,5,1,0,1,0,5,1,1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1997,4,5,5,5,3,1,6,0,6,1,5,1,2,0,1,1,4,0,2,2,1,0,0,1,1,0,0,0,0,0,0,5,1,1,0,1,1,0,0,1,...,0,0,5,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1998,4,3,1,2,0,2,1,0,1,0,0,1,1,2,2,1,0,0,0,2,0,0,0,1,3,0,1,0,1,2,0,0,0,0,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [11]:
# Spliting the dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split (X[vocabulary.keys()], y , train_size = 0.8, random_state = 42, shuffle = True, stratify=data['label'])
print ('Shapes of X_train, y_train: ', X_train.shape, y_train.shape)
print ('Shapes of X_test, y_test: ', X_test.shape, y_test.shape)

Shapes of X_train, y_train:  (1600, 1000) (1600,)
Shapes of X_test, y_test:  (400, 1000) (400,)


In [None]:
class LogisticRegression ():
  def __init__ (lr = '', n_iter = 10):
    lr = lr
    n_iter = n_iter
    weight = None
    bias = None
  def fit (X, y):
    
