<a href="https://colab.research.google.com/github/HuyenNguyenHelen/LING-5412/blob/main/Assignment2_LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing libraries that will be used 
import numpy as np
import tarfile
import glob
import re
import pandas as pd
#from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Loading the data

In [2]:
# Untar the dataset
my_tar = tarfile.open('/content/review_polarity.tar.gz')
my_tar.extractall('/content/') 
my_tar.close()


In [3]:
# Exploring the data sizes

paths_pos = glob.glob('/content/txt_sentoken/pos/*.txt')
paths_neg = glob.glob('/content/txt_sentoken/neg/*.txt')
pos_neg_paths = paths_pos + paths_neg

n_pos = len(paths_pos)
n_neg = len(paths_neg)

print('the number of positive instances: {} \nthe number of positive instances: {}'.format(n_pos, n_neg))

the number of positive instances: 1000 
the number of positive instances: 1000


# Exploring the data

In [4]:
# Exploring the words in the dataset

from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

def tokenizer (doc):
  return re.split("\W+", doc)   # return a list of tokens without punctuations

def stopword_remover (bow):
  filtered_bow = [w for w in bow if not w.lower() in stopwords]
  return filtered_bow

def top_freq_w (freq_dic, top_n, stopword_removing = ''):
  sorted_dic = {k:v for k, v in sorted(freq_dic.items(), key = lambda item: item[1], reverse=True)}
  if stopword_removing is False:
    return {k:v for k, v in list(sorted_dic.items())[:top_n]}
  elif stopword_removing is True:
    filtered_dic = {k: v for k, v in sorted_dic.items() if k not in stopwords}
    return {k:v for k, v in list(filtered_dic.items())[:top_n]}
  

word_freq = {}
for path in pos_neg_paths:
  fo = open(path)
  doc = fo.read()
  for token in tokenizer (doc):
    word_freq[token] = word_freq.get(token,0)+1

top_100_w = top_freq_w(word_freq, 100, stopword_removing = False) 

print('\nthe number of unique words in the dataset: ', len(word_freq.keys()))
print ('\ntop 100 most frequent words:\n', top_100_w )
print('\nthe number of words in the top 100 which are stopwords: ', len([w for w in top_100_w.keys() if w in stopwords]))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

the number of unique words in the dataset:  39697

top 100 most frequent words:
 {'the': 76529, 'a': 38106, 'and': 35576, 'of': 34123, 'to': 31937, 'is': 25195, 'in': 21822, 's': 18513, 'it': 16107, 'that': 15924, 'as': 11378, 'with': 10792, 'for': 9961, 'his': 9587, 'this': 9578, 'film': 9517, 'i': 8889, 'he': 8864, 'but': 8634, 'on': 7385, 'are': 6949, 't': 6410, 'by': 6261, 'be': 6174, 'one': 5852, 'movie': 5771, 'an': 5744, 'who': 5692, 'not': 5577, 'you': 5316, 'from': 4999, 'at': 4986, 'was': 4940, 'have': 4901, 'they': 4825, 'has': 4719, 'her': 4522, 'all': 4373, 'there': 3770, 'like': 3690, 'so': 3683, 'out': 3637, 'about': 3523, 'up': 3405, 'more': 3347, 'what': 3322, 'when': 3258, 'which': 3161, 'or': 3148, 'she': 3141, 'their': 3122, 'some': 2985, 'just': 2905, 'can': 2882, 'if': 2799, 'we': 2775, 'him': 2633, 'into': 2623, 'even': 2565, 'only': 2495, 'than

In [5]:
# Reformating the dataset into csv for convenience 

def to_df (folder):
  data_dic = {}
  data_dic['doc'], data_dic['label'] = [], []
  for file in folder:
    fo = open(file)
    doc = fo.read()
    data_dic['doc'].append(doc)
    if 'pos' in file:
      data_dic['label'].append(1)
    elif 'neg' in file:
      data_dic['label'].append(0)
    else:
      print('error', file)
  df = pd.DataFrame.from_dict(data_dic)
  return df
    
data = to_df(pos_neg_paths)

data.head()

Unnamed: 0,doc,label
0,"in the wake of the smashing success of "" rumbl...",1
1,"ingredients : little orphan boy , rural grandp...",1
2,after watching the first ten minutes of this j...,1
3,"allen , star of many a brian depalma movie in ...",1
4,"in "" the sweet hereafter , "" writer/director a...",1


# Data preprocessing

In [6]:
# Data preprocessing
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def preprocessor (text):
  ## removing punctuations and characters
  text = re.sub(r'[^\w\s]', '', text)
  # stripping
  text = ' '.join([w.strip() for w in text.split()])
  ## lowcasing
  text = text.lower()
  # ## removing stopword
  text = stopword_remover (text.split())
  # ##stemmming
  text = [stemmer.stem(w) for w in text]
  # ## lematization
  text = [lemmatizer.lemmatize(w) for w in text]
  return ' '.join([w for w in text])

data['doc'] = data['doc'].apply(lambda x:  preprocessor (x) )
data 

Unnamed: 0,doc,label
0,wake smash success rumbl bronx look like jacki...,1
1,ingredi littl orphan boy rural grandpar mounta...,1
2,watch first ten minut japanes film never eat b...,1
3,allen star mani brian depalma movi earli eight...,1
4,sweet hereaft writerdirector atom egoyan take ...,1
...,...,...
1995,last carri movi discount carri columbu carri m...,0
1996,one indic bad film hype rememb film case box h...,0
1997,film mean well pushi promot belabor point sent...,0
1998,start littl mermaid recent lion king walt disn...,0


# Building a Logistic Regression Model

### Preparing vocabulary/feature extraction

In [7]:

## As required, we will use 1000 most frequent word, excluding stopwords

cleaned_word_freq = {}
for path in pos_neg_paths:
  fo = open(path)
  doc = fo.read()
  # Cleaning documents before extracting features
  cleaned_doc = preprocessor(doc)
  # Getting terms and their frequency 
  for token in tokenizer (cleaned_doc):
    cleaned_word_freq[token] = cleaned_word_freq.get(token,0)+1

# Getting 1000 terms with highest frequency, excluding stopwords
vocabulary = top_freq_w(cleaned_word_freq, 1000, stopword_removing = True) 


### Representing documents based on extracted features


In [8]:
# binary representing documents based on occurrance of features in documents
def doc_representor (doc):
  doc_vec = []
  token_list = tokenizer (doc)
  for feature in vocabulary.keys():
    if feature in token_list:
      doc_vec.append(1)
    else:
       doc_vec.append(0)
  return doc_vec

X = data['doc'].apply(lambda x: doc_representor(x))
y = data['label']

In [9]:
# Visualize the data after representing
X = X.apply(pd.Series)
X.columns = vocabulary.keys()
print(X.shape)
X

(2000, 1000)


Unnamed: 0,film,movi,one,like,charact,get,make,time,scene,even,good,play,stori,see,would,much,also,go,way,seem,look,end,two,take,first,come,well,work,thing,year,realli,plot,know,perform,littl,life,peopl,love,could,bad,...,lover,island,scare,spent,cinematographi,agre,manner,command,standard,menac,adam,front,fairli,budget,ground,brown,appropri,pair,disturb,virtual,connect,suddenli,grant,fantasi,90,godzilla,cultur,cameo,count,store,brief,cute,key,fascin,greatest,trip,bug,foot,addit,satir
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,...,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,1,0,0,1,1,0,0,1,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,1,1,0,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,0,0,1,1,1,0,0,0,0,1,1,1,0,1,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
3,1,1,1,1,0,1,1,1,1,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,0,1,1,0,0,0,0,1,0,1,1,0,1,1,0,0,1,1,0,0,0,1,0,1,1,0,1,1,1,1,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,1,0,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1996,1,1,1,0,1,1,0,0,1,1,1,1,0,0,1,0,1,1,1,0,1,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
1997,1,1,1,0,1,1,1,1,1,0,0,0,1,1,1,0,0,1,1,1,0,1,1,0,1,0,1,0,1,0,0,0,0,1,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1998,1,1,1,1,1,0,1,1,0,1,1,0,1,1,1,1,0,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [10]:
# Spliting the dataset for training and testing
X_train, X_test, y_train, y_test = train_test_split (X[vocabulary.keys()], y , train_size = 0.8, random_state = 42, shuffle = True, stratify=data['label'])
print ('Shapes of X_train, y_train: ', X_train.shape, y_train.shape)
print ('Shapes of X_test, y_test: ', X_test.shape, y_test.shape)

Shapes of X_train, y_train:  (1600, 1000) (1600,)
Shapes of X_test, y_test:  (400, 1000) (400,)


### Training

In [11]:
# Writing functions

def sigmoid (z):
  p=1/(1+np.exp(-z))
  return p


def predict(X, weight, bias):
    z = np.dot(weight, X.values.T) + bias
    y_pred = sigmoid(z)
    y_class = [1 if i >= 0.5 else 0 for i in y_pred]
    return y_class

def printing_eval_scores (y_true, y_pred, report=''):
  accuracy = sklearn.metrics.accuracy_score(y_true, y_pred)
  precision = sklearn.metrics.precision_score(y_true, y_pred)
  recall = sklearn.metrics.recall_score(y_true, y_pred)
  f1 = sklearn.metrics.f1_score(y_true, y_pred)
  print('accuracy score: {:.3f}'.format(accuracy))
  print('precision score: {:.3f}'.format(precision))
  print('recall score: {:.3f}'.format(recall))
  print('F1 score: {:.3f}'.format(f1))
  if report is True:
    print(classification_report(y_true, y_pred))
  else:
    pass
  return accuracy, precision, recall, f1
  


def computing_gradient (X, Y, weight, bias):
  for x,y in zip (X.values, Y):
    z = np.dot(weight, x) + bias
    y_pred = sigmoid(z)
    #loss = -(y*log(y_pred)+(1-y)*log(1-y_pred))
    d_weight = np.dot((y_pred - y), x)
    d_bias = (y_pred - y)
    weight -= lr*d_weight
    bias -= lr*d_bias
  return weight, bias

def computing_MiniBatch_gradient (X, Y, weight, bias, batch_size):
  n_instances, n_features = X.shape
  i=0 
  while i<= round(n_instances/batch_size):
    m = batch_size * i
    n = m + batch_size
    sum_w = 0
    sum_b = 0
    for x,y in zip (X[m:n].values,Y[m:n]):
      z = np.dot(weight, x) + bias
      y_pred = sigmoid(z)
      #loss = -(y*log(y_pred)+(1-y)*log(1-y_pred))
      sum_w += np.dot((y_pred - y), x)
      sum_b += (y_pred - y)
    d_weight = (1/batch_size)*sum_w
    d_bias = (1/batch_size)*sum_b
    weight -= lr*d_weight
    bias -= lr*d_bias
    i+=1
  return  weight, bias

def MiniBatch_gradient_L2 (X, Y, weight, bias,alpha, batch_size):
  n_instances, n_features = X.shape
  i=0 
  while i<= round(n_instances/batch_size):
    m = batch_size * i
    n = m + batch_size
    sum_w = 0
    sum_b = 0
    for x,y in zip (X[m:n].values,Y[m:n]):
      z = np.dot(weight, x) + bias
      y_pred = sigmoid(z)
      w_i = np.dot((y_pred - y), x)
      b_i = y_pred - y
      sum_w +=  w_i  + (alpha * weight) # L2
      sum_b += b_i
    d_weight = (1/batch_size)*sum_w 
    d_bias = (1/batch_size)*sum_b
    weight -= lr*d_weight
    bias -= lr*d_bias
    i+=1
  return  weight, bias


#### Initial LR model
updating weight and bias with Gradient descent every instance

In [12]:
# Training the model, updating weight and bias every instances

print('Training LR.......................................................')
lr = 0.1
n_iter = 10
weight = np.zeros(X.shape[1])
bias = 0
for iter in range(n_iter):
  print('\n====================iteration %s=======================' % str(iter+1))
  weight, bias = computing_gradient (X_train, y_train, weight, bias)
  y_predict = predict(X_test,weight, bias)

  ## on training set
  print('Model performance on training set:')
  printing_eval_scores (y_train, predict(X_train,weight, bias), report = False)

  ## on test set
  print('\nModel performance on test set:')
  printing_eval_scores (y_test, y_predict, report = True)




    

Training LR.......................................................

Model performance on training set:
accuracy score: 0.902
precision score: 0.893
recall score: 0.915
F1 score: 0.904

Model performance on test set:
accuracy score: 0.762
precision score: 0.733
recall score: 0.825
F1 score: 0.776
              precision    recall  f1-score   support

           0       0.80      0.70      0.75       200
           1       0.73      0.82      0.78       200

    accuracy                           0.76       400
   macro avg       0.77      0.76      0.76       400
weighted avg       0.77      0.76      0.76       400


Model performance on training set:
accuracy score: 0.932
precision score: 0.959
recall score: 0.902
F1 score: 0.930

Model performance on test set:
accuracy score: 0.757
precision score: 0.775
recall score: 0.725
F1 score: 0.749
              precision    recall  f1-score   support

           0       0.74      0.79      0.77       200
           1       0.78      0.72    

#### Minibatch training
updating weight and bias with Gradient descent every batch-size = 32

In [13]:
lr = 0.1
n_iter = 10
weight = np.zeros(X.shape[1])
bias = 0
for iter in range(n_iter):
  print('\n====================iteration %s=======================' % str(iter+1))
  weight, bias = computing_MiniBatch_gradient (X_train, y_train, weight, bias,  batch_size = 32)
  y_predict = predict(X_test,weight, bias)
  
  # Model performing
  ## on training set
  print('Model performance on training set:')
  printing_eval_scores (y_train, predict(X_train,weight, bias), report = False)

  ## on test set
  print('\nModel performance on test set:')
  printing_eval_scores (y_test, y_predict, report = True)

    


Model performance on training set:
accuracy score: 0.858
precision score: 0.812
recall score: 0.931
F1 score: 0.868

Model performance on test set:
accuracy score: 0.770
precision score: 0.709
recall score: 0.915
F1 score: 0.799
              precision    recall  f1-score   support

           0       0.88      0.62      0.73       200
           1       0.71      0.92      0.80       200

    accuracy                           0.77       400
   macro avg       0.79      0.77      0.77       400
weighted avg       0.79      0.77      0.77       400


Model performance on training set:
accuracy score: 0.879
precision score: 0.849
recall score: 0.922
F1 score: 0.884

Model performance on test set:
accuracy score: 0.795
precision score: 0.746
recall score: 0.895
F1 score: 0.814
              precision    recall  f1-score   support

           0       0.87      0.69      0.77       200
           1       0.75      0.90      0.81       200

    accuracy                           0.80      

#### L2 Regularization
Implementing L2 regulation
Training on minibatch size = 32

In [14]:

lr = 0.1
n_iter = 10
alpha = 0.01
weight = np.zeros(X.shape[1])
bias = 0
for iter in range(n_iter):
  print('\n====================iteration %s=======================' % str(iter+1))
  weight, bias = MiniBatch_gradient_L2 (X_train, y_train, alpha = alpha,  weight=weight, bias=bias, batch_size = 32)
  y_predict = predict(X_test,weight, bias)

  # Model performing
  ## on training set
  print('Model performance on training set:')
  printing_eval_scores (y_train, predict(X_train,weight, bias), report = False)

  ## on test set
  print('\nModel performance on test set:')
  printing_eval_scores (y_test, y_predict, report = True)


    


Model performance on training set:
accuracy score: 0.854
precision score: 0.807
recall score: 0.931
F1 score: 0.865

Model performance on test set:
accuracy score: 0.770
precision score: 0.709
recall score: 0.915
F1 score: 0.799
              precision    recall  f1-score   support

           0       0.88      0.62      0.73       200
           1       0.71      0.92      0.80       200

    accuracy                           0.77       400
   macro avg       0.79      0.77      0.77       400
weighted avg       0.79      0.77      0.77       400


Model performance on training set:
accuracy score: 0.879
precision score: 0.848
recall score: 0.924
F1 score: 0.885

Model performance on test set:
accuracy score: 0.790
precision score: 0.740
recall score: 0.895
F1 score: 0.810
              precision    recall  f1-score   support

           0       0.87      0.69      0.77       200
           1       0.74      0.90      0.81       200

    accuracy                           0.79      

## Observing importance features

In [15]:
# Getting weights of features that have been trained 
## store in a dictionary
feature_weights = {}
for i in range(len(weight)):
  feature_weights[list(vocabulary.keys())[i]] = weight[i]

# Sorting the dictionary in descending order
sorted_feature_weights = {k:v for k, v in sorted(feature_weights.items(), key = lambda item: item[1], reverse=True)}

# Print the weights learned for each class
print('50 most important features of POSITIVE class (in descending order): ')
for k, v in list(sorted_feature_weights.items())[:50]:
  print ('{}: {:.5f}'. format(k,v))

print('\n==============================================')
print('50 most important features of NEGATIVE class (in descending order): ')
for k, v in list(sorted_feature_weights.items())[:-50:-1]: 
  print ('{}: {:.5f}'. format(k,v))

50 most important features of POSITIVE class (in descending order): 
hilari: 0.34366
also: 0.31770
enjoy: 0.31536
sometim: 0.29092
great: 0.28758
job: 0.27214
definit: 0.26284
memor: 0.26126
detail: 0.25713
american: 0.24504
town: 0.23757
flaw: 0.23496
thank: 0.23475
overal: 0.23364
perfect: 0.23317
excel: 0.23114
differ: 0.22875
peopl: 0.22551
life: 0.22498
perfectli: 0.22478
extrem: 0.22446
especi: 0.22370
quit: 0.21631
intens: 0.21435
equal: 0.21363
simpl: 0.21202
brilliant: 0.20991
mani: 0.20946
support: 0.20518
although: 0.20270
entertain: 0.20231
view: 0.20026
oscar: 0.19661
fun: 0.19367
portray: 0.19331
follow: 0.19329
surpris: 0.19282
perform: 0.19209
fiction: 0.18957
normal: 0.18723
best: 0.18224
seen: 0.17853
throughout: 0.17761
deserv: 0.17689
prove: 0.17525
ben: 0.17513
delight: 0.17475
deliv: 0.17467
david: 0.17462
rais: 0.16749

50 most important features of NEGATIVE class (in descending order): 
bad: -0.55933
worst: -0.46728
bore: -0.45559
wast: -0.45004
noth: -0.40442
p

# Checking Sklearn logistic regression model

In [16]:
# Sklearn Logistic Regression Model
from sklearn.linear_model import LogisticRegression
sk_lr = LogisticRegression(solver='lbfgs', max_iter=150).fit(X_train, y_train )
y_predict = sk_lr.predict(X_test)

# Model performing
## on training set
print('Model performance on training set:')
printing_eval_scores (y_train, sk_lr.predict(X_train))

## on test set
print('\n===========================')
print('Model performance on test set:')
printing_eval_scores (y_test, y_predict, report = True)

Model performance on training set:
accuracy score: 1.000
precision score: 1.000
recall score: 1.000
F1 score: 1.000

Model performance on test set:
accuracy score: 0.787
precision score: 0.778
recall score: 0.805
F1 score: 0.791
              precision    recall  f1-score   support

           0       0.80      0.77      0.78       200
           1       0.78      0.81      0.79       200

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400



(0.7875, 0.7777777777777778, 0.805, 0.7911547911547911)