<a href="https://colab.research.google.com/github/HuyenNguyenHelen/LING-5412/blob/main/Assignment2_LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing libraries that will be used 
import numpy as np
import tarfile
import glob
import re
import pandas as pd
#from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading the data

In [2]:
# Untar the dataset
my_tar = tarfile.open('/content/review_polarity.tar.gz')
my_tar.extractall('/content/') 
my_tar.close()


In [3]:
# Exploring the data sizes

paths_pos = glob.glob('/content/txt_sentoken/pos/*.txt')
paths_neg = glob.glob('/content/txt_sentoken/neg/*.txt')
pos_neg_paths = paths_pos + paths_neg

n_pos = len(paths_pos)
n_neg = len(paths_neg)

print('the number of positive instances: {} \nthe number of positive instances: {}'.format(n_pos, n_neg))

the number of positive instances: 1000 
the number of positive instances: 1000


# Exploring the data

In [4]:
# Exploring the words in the dataset

from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def tokenizer (doc):
  #doc = doc.lower() # Lowercase documents
  return re.split("\W+", doc)   # return a list of tokens without punctuations

# def BOW (doc):
#   bow = set()
#   for token in tokenizer (doc):
#     bow.add(token)
#   return list(bow)

#def word_counter (doc):

def stopword_remover (bow):
  filtered_bow = [w for w in bow if not w.lower() in stopwords]
  return filtered_bow

def top_freq_w (freq_dic, top_n, stopword_removing = ''):
  sorted_dic = {k:v for k, v in sorted(freq_dic.items(), key = lambda item: item[1], reverse=True)}
  if stopword_removing is False:
    return {k:v for k, v in list(sorted_dic.items())[:top_n]}
  elif stopword_removing is True:
    filtered_dic = {k: v for k, v in sorted_dic.items() if k not in stopwords}
    return {k:v for k, v in list(filtered_dic.items())[:top_n]}
  


word_freq = {}
for path in pos_neg_paths:
  fo = open(path)
  doc = fo.read()
  for token in tokenizer (doc):
    word_freq[token] = word_freq.get(token,0)+1

top_100_w = top_freq_w(word_freq, 100, stopword_removing = False) 

print('the number of unique words in the dataset: ', len(word_freq.keys()))
print ('top 100 most frequent words:\n', top_100_w )
print('\nthe number of words in the top 100 which are stopwords: ', len([w for w in top_100_w.keys() if w in stopwords]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
the number of unique words in the dataset:  39697
top 100 most frequent words:
 {'the': 76529, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, 'is': 25195, 'in': 21822, 's': 18513, 'it': 16107, 'that': 15924, 'as': 11378, 'with': 10792, 'for': 9961, 'his': 9587, 'this': 9578, 'film': 9517, 'i': 8889, 'he': 8864, 'but': 8634, 'on': 7385, 'are': 6949, 't': 6410, 'by': 6261, 'be': 6174, 'one': 5852, 'movie': 5771, 'an': 5744, 'who': 5692, 'not': 5577, 'you': 5316, 'from': 4999, 'at': 4986, 'was': 4940, 'have': 4901, 'they': 4825, 'has': 4719, 'her': 4522, 'all': 4373, 'there': 3770, 'like': 3690, 'so': 3683, 'out': 3637, 'about': 3523, 'up': 3405, 'more': 3347, 'what': 3322, 'when': 3258, 'which': 3161, 'or': 3148, 'she': 3141, 'their': 3122, 'some': 2985, 'just': 2905, 'can': 2882, 'if': 2799, 'we': 2775, 'him': 2633, 'into': 2623, 'even': 2565, 'only': 2495, 'than':

In [5]:
# Reformating the dataset into csv for convenience 
def to_df (folder):
  data_dic = {}
  data_dic['doc'], data_dic['label'] = [], []
  for file in folder:
    fo = open(file)
    doc = fo.read()
    data_dic['doc'].append(doc)
    if 'pos' in file:
      data_dic['label'].append(1)
    elif 'neg' in file:
      data_dic['label'].append(0)
    else:
      print('error', file)
  df = pd.DataFrame.from_dict(data_dic)
  return df
    
data = to_df(pos_neg_paths)

data.head()

Unnamed: 0,doc,label
0,"ingredients : london gal , fate , true love , ...",1
1,"quiz show , an almost perfectly accurate true ...",1
2,after a stylistic detour with mrs . \nparker a...,1
3,"all great things come to an end , and the dot-...",1
4,melvin udall is a heartless man . \nhe spends ...,1


# Data preprocessing

In [6]:
# Data preprocessing
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
def preprocessor (text):
  ## removing punctuations and characters
  text = re.sub(r'[^\w\s]', '', text)
  # stripping
  text = ' '.join([w.strip() for w in text.split()])
  # print(text)
  ## lowcasing
  text = text.lower()
  # ## removing stopword
  text = stopword_remover (text.split())
  # ##stemmming
  text = [stemmer.stem(w) for w in text]
  # ## lematization
  text = [lemmatizer.lemmatize(w) for w in text]
  return ' '.join([w for w in text])

data['doc'] = data['doc'].apply(lambda x:  preprocessor (x) )
data 

Unnamed: 0,doc,label
0,ingredi london gal fate true love run joke mon...,1
1,quiz show almost perfectli accur true stori ba...,1
2,stylist detour mr parker viciou circl despit u...,1
3,great thing come end dotcom era embodi perfect...,1
4,melvin udal heartless man spend day insid spac...,1
...,...,...
1995,often similar littl boy lost park right ventur...,0
1996,13th warrior reek badli melodrama poor act car...,0
1997,accord hitchcock variou filmmak isol motel din...,0
1998,warn follow review contain spoiler cast gari s...,0


# Developing a Logistic Regression Model

In [7]:
# Preparing vocabulary
## As required, we will use 1000 most frequent word, excluding stopwords
### Preprocessing data for building vocabulary
cleaned_word_freq = {}
for path in pos_neg_paths:
  fo = open(path)
  doc = fo.read()
  cleaned_doc = preprocessor(doc)
  for token in tokenizer (cleaned_doc):
    cleaned_word_freq[token] = cleaned_word_freq.get(token,0)+1

vocabulary = top_freq_w(cleaned_word_freq, 1000, stopword_removing = True) 


In [8]:
# Feature engineering
# def feature_extractor (doc):
#   doc_vec = []
#   for feature in vocabulary.keys():
#     feature_count = 0
#     if feature in tokenizer (doc):
#       feature_count+=1
#     else:
#       feature_count+=0
#     doc_vec.append(feature_count) 
#   return doc_vec

def feature_extractor (doc):
  doc_vec = []
  token_list = tokenizer (doc)
  for feature in vocabulary.keys():
    # feature_count=0
    feature_count = token_list.count(feature)
    doc_vec.append(feature_count) 
  return doc_vec

X = data['doc'].apply(lambda x: feature_extractor(x))
y = data['label']

In [9]:

X = X.apply(pd.Series)
X.columns = vocabulary.keys()
X

Unnamed: 0,film,movi,one,like,charact,get,make,time,scene,even,good,play,stori,see,would,much,also,go,way,seem,look,end,two,take,first,come,well,work,thing,year,realli,plot,know,perform,littl,life,peopl,love,could,bad,...,rush,realist,scare,manner,command,standard,menac,spent,adam,agre,cinematographi,front,ground,budget,fairli,pair,virtual,suddenli,fantasi,connect,disturb,90,appropri,godzilla,brown,grant,cultur,greatest,store,trip,key,fascin,cute,brief,cameo,count,foot,addit,satir,bug
0,3,0,4,1,2,2,0,0,2,0,0,1,7,0,0,0,2,1,2,2,1,0,7,4,0,0,0,1,0,1,1,0,2,1,1,6,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,1,0,0,1,0,1,0,1,2,0,2,1,2,0,1,1,0,1,2,0,2,0,0,1,1,0,2,0,0,0,0,1,0,0,3,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4,4,3,2,7,0,4,0,0,0,0,1,1,1,0,3,0,0,0,2,0,1,3,1,0,0,0,1,0,0,0,1,1,1,1,0,1,7,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,1,0,1,2,0,0,2,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,2,0,2,0,0,1,1,0,0,0,1,4,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,2,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,10,1,5,0,2,0,2,2,1,0,0,2,0,0,1,0,1,0,0,5,0,2,2,1,2,0,0,1,1,2,0,2,1,1,2,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1996,6,1,2,2,1,0,2,1,1,1,0,0,0,0,2,0,0,0,0,2,1,2,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1997,3,0,2,0,0,2,0,1,0,0,0,4,0,0,1,0,0,0,0,0,0,2,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1998,14,4,8,5,6,3,1,4,3,2,0,0,0,2,3,0,2,1,1,1,0,2,1,1,2,2,0,1,0,0,0,1,1,1,1,0,2,0,2,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


In [10]:
# Spliting the dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split (X[vocabulary.keys()], y , train_size = 0.8, random_state = 42, shuffle = True, stratify=data['label'])
print ('Shapes of X_train, y_train: ', X_train.shape, y_train.shape)
print ('Shapes of X_test, y_test: ', X_test.shape, y_test.shape)

Shapes of X_train, y_train:  (1600, 1000) (1600,)
Shapes of X_test, y_test:  (400, 1000) (400,)


In [11]:
X_train.values

array([[10,  1,  1, ...,  0,  0,  0],
       [ 6,  0,  4, ...,  0,  0,  0],
       [ 0,  3,  4, ...,  0,  0,  0],
       ...,
       [ 5,  5,  2, ...,  0,  0,  0],
       [ 5,  2,  3, ...,  0,  0,  0],
       [ 4,  3,  6, ...,  0,  0,  0]])

In [12]:
# class LogisticRegression ():
#   def __init__ (lr = '', n_iter = 10):
lr = 0.1
n_iter = 10
weight = None
bias = None
def computing_gradient (X, Y):
  weight = np.zeros(X.shape[1])
  bias = 0
  for iter in range(n_iter):
    for x,y in zip (X.values,Y):
      z = np.dot(weight, x) + bias
      y_pred = sigmoid(z)
      #loss = -(y*log(y_pred)+(1-y)*log(1-y_pred))
      d_weight = np.dot((y_pred - y), x)
      d_bias = (y_pred - y)
      weight -= lr*d_weight
      bias -= lr*d_bias
  return  weight, bias

def predict(X, weight, bias):
    z = np.dot(weight, X.values.T) + bias
    y_pred = sigmoid(z)
    y_class = [1 if i > 0.5 else 0 for i in y_pred]
    return y_class

def sigmoid (z):
  p=1/(1+np.exp(-z))
  return p

# Printing model performance 
def printing_eval_scores (y_true, y_pred):
  print('accuracy score: {}'.format(sklearn.metrics.accuracy_score(y_true, y_pred)))
  print('precision score: {}'.format(sklearn.metrics.precision_score(y_true, y_pred)))
  print('recall score: {}'.format(sklearn.metrics.recall_score(y_true, y_pred)))
  print('F1 score: {}'.format(sklearn.metrics.f1_score(y_true, y_pred)))
  print(classification_report(y_true, y_pred))


weight, bias = computing_gradient (X_train, y_train)
y_predict = predict(X_test,weight, bias)

# Model performing
## on training set
print('Model performance on training set:')
printing_eval_scores (y_train, predict(X_train,weight, bias))

## on test set
print('\n===========================')
print('Model performance on test set:')
printing_eval_scores (y_test, y_predict)


    


Model performance on training set:
accuracy score: 0.965625
precision score: 0.9408284023668639
recall score: 0.99375
F1 score: 0.966565349544073
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       800
           1       0.94      0.99      0.97       800

    accuracy                           0.97      1600
   macro avg       0.97      0.97      0.97      1600
weighted avg       0.97      0.97      0.97      1600


Model performance on test set:
accuracy score: 0.7925
precision score: 0.76
recall score: 0.855
F1 score: 0.8047058823529413
              precision    recall  f1-score   support

           0       0.83      0.73      0.78       200
           1       0.76      0.85      0.80       200

    accuracy                           0.79       400
   macro avg       0.80      0.79      0.79       400
weighted avg       0.80      0.79      0.79       400



#Minibatch training

In [13]:


# class LogisticRegression ():
#   def __init__ (lr = '', n_iter = 10):
lr = 0.1
n_iter = 10
weight = None
bias = None
def computing_MiniBatch_gradient (X, Y, batch_size):
  n_instances, n_features = X.shape
  weight = np.zeros(n_features)
  bias = 0
  for iter in range(n_iter):
    i=0 
    while i< round(n_instances/batch_size):
      m = batch_size * i
      n = m + batch_size
      sum_w = 0
      sum_b = 0
      for x,y in zip (X[m:n].values,Y[m:n]):
        z = np.dot(weight, x) + bias
        y_pred = sigmoid(z)
        #loss = -(y*log(y_pred)+(1-y)*log(1-y_pred))
        sum_w += np.dot((y_pred - y), x)
        sum_b += (y_pred - y)
      d_weight = (1/batch_size)*sum_w
      d_bias = (1/batch_size)*sum_b
      weight -= lr*d_weight
      bias -= lr*d_bias
      i+=1
  return  weight, bias


weight, bias = computing_MiniBatch_gradient (X_train, y_train, batch_size = 32)
y_predict = predict(X_test,weight, bias)

# Model performing
## on training set
print('Model performance on training set:')
printing_eval_scores (y_train, predict(X_train,weight, bias))

## on test set
print('\n===========================')
print('Model performance on test set:')
printing_eval_scores (y_test, y_predict)


    

Model performance on training set:
accuracy score: 0.945
precision score: 0.9129930394431555
recall score: 0.98375
F1 score: 0.9470517448856799
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       800
           1       0.91      0.98      0.95       800

    accuracy                           0.94      1600
   macro avg       0.95      0.95      0.94      1600
weighted avg       0.95      0.94      0.94      1600


Model performance on test set:
accuracy score: 0.83
precision score: 0.7844827586206896
recall score: 0.91
F1 score: 0.8425925925925926
              precision    recall  f1-score   support

           0       0.89      0.75      0.82       200
           1       0.78      0.91      0.84       200

    accuracy                           0.83       400
   macro avg       0.84      0.83      0.83       400
weighted avg       0.84      0.83      0.83       400



# L2 Regularization

In [14]:


# class LogisticRegression ():
#   def __init__ (lr = '', n_iter = 10):
lr = 0.1
n_iter = 10
alpha = 0.01
weight = None
bias = None
def MiniBatch_gradient_L2 (X, Y, batch_size):
  n_instances, n_features = X.shape
  weight = np.zeros(n_features)
  bias = 0
  for iter in range(n_iter):
    i=0 
    while i< round(n_instances/batch_size):
      m = batch_size * i
      n = m + batch_size
      sum_w = 0
      sum_b = 0
      for x,y in zip (X[m:n].values,Y[m:n]):
        z = np.dot(weight, x) + bias
        y_pred = sigmoid(z)
        w_i = np.dot((y_pred - y), x)
        b_i = y_pred - y
        sum_w +=  w_i + (alpha * w_i) # L2
        sum_b += b_i
      d_weight = (1/batch_size)*sum_w
      d_bias = (1/batch_size)*sum_b
      weight -= lr*d_weight
      bias -= lr*d_bias
      i+=1
  return  weight, bias


weight, bias = MiniBatch_gradient_L2 (X_train, y_train, batch_size = 32)
y_predict = predict(X_test,weight, bias)

# Model performing
## on training set
print('Model performance on training set:')
printing_eval_scores (y_train, predict(X_train,weight, bias))

## on test set
print('\n===========================')
print('Model performance on test set:')
printing_eval_scores (y_test, y_predict)


    

Model performance on training set:
accuracy score: 0.945625
precision score: 0.9140534262485482
recall score: 0.98375
F1 score: 0.9476219145093318
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       800
           1       0.91      0.98      0.95       800

    accuracy                           0.95      1600
   macro avg       0.95      0.95      0.95      1600
weighted avg       0.95      0.95      0.95      1600


Model performance on test set:
accuracy score: 0.83
precision score: 0.7844827586206896
recall score: 0.91
F1 score: 0.8425925925925926
              precision    recall  f1-score   support

           0       0.89      0.75      0.82       200
           1       0.78      0.91      0.84       200

    accuracy                           0.83       400
   macro avg       0.84      0.83      0.83       400
weighted avg       0.84      0.83      0.83       400



In [15]:
for i in range len(weight):
  print 

array([ 5.58907413e-02,  3.80273237e-04,  9.50347879e-02, -4.40981040e-02,
       -1.53685172e-03, -7.48106616e-03, -4.26559917e-03,  5.75626981e-02,
       -2.87654406e-02, -1.86068559e-01,  2.42226336e-01,  9.15305973e-02,
       -3.79939309e-02,  2.48135752e-01, -2.66829862e-01, -7.82549980e-02,
        2.58367160e-01,  3.88781660e-02,  5.61460748e-02, -1.13192114e-01,
       -2.80581094e-01,  3.03664293e-02, -4.60233738e-02,  2.23658146e-01,
        2.25305257e-01, -1.41277377e-01,  2.27884659e-01,  3.02122010e-02,
        1.07844591e-01,  1.61570892e-01,  7.46093366e-03, -2.65159080e-01,
        6.03463870e-02,  2.77356641e-01,  8.28307660e-03,  1.77902968e-01,
        1.82325550e-01,  6.86963264e-02, -3.00127403e-01, -5.78826354e-01,
        1.23308668e-02, -4.31633579e-02, -1.72382071e-01, -7.98117365e-02,
        1.20582836e-01, -4.69150608e-02,  4.00167608e-02, -1.06761182e-01,
        1.24303976e-01,  6.01631582e-02, -8.67966179e-02, -1.36044635e-02,
       -1.24155764e-01, -