# Implementation of evaluation approaches that based on LSA to evaluate the quality of summary by comparing the summary to its full text from an angle of main topics. 

In [1]:
#import libraries 
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt') # one time execution
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
!pip install stem
!pip install stemming

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stem
  Downloading stem-1.8.1.tar.gz (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: stem
  Building wheel for stem (setup.py) ... [?25l[?25hdone
  Created wheel for stem: filename=stem-1.8.1-py3-none-any.whl size=436323 sha256=ae0befc238da7002a64e52c19d58107d26da6ca8e4ea1163dd1717cdc77057dd
  Stored in directory: /root/.cache/pip/wheels/62/e0/c4/41321ddf38be53fbc18a4d739edaef12f79a29e69920d3270d
Successfully built stem
Installing collected packages: stem
Successfully installed stem-1.8.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stemming
  Downloading stemming-1.0.1.zip (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Bu

In [3]:
from stemming.porter2 import stem

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
data = pd.read_csv('/content/drive/MyDrive/Tweets.csv') 
data = data[['text','airline_sentiment', 'airline']]

# Original negative tweets document of Virgin America airline 

-------------------------------------
- We can feed any full document of an existing summary 

In [6]:
#Negative_tweets_related_to_Delta
neg_tweets = data [(data ['airline_sentiment'] == "negative") & (data['airline'] == "Virgin America")]
#neg_tweets.head ()

# Document Cleaning 

In [7]:
def clean_train_data(x):
    processed_feature = x
    # remove all single characters
    processed_feature = re.sub(r'\W', ' ', processed_feature)
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    processed_feature = re.sub ("VirginAmerica"," " ,processed_feature)  
    # Converting to Lowercase
    processed_feature = processed_feature.lower()
    return processed_feature

In [8]:
#Remove stopwords 
stop = set (stopwords.words ("english"))
def remove_stopwords (text): 
  text = [word.lower () for word in text.split() if word.lower() not in stop]
  return " ".join(text)

In [9]:
#remove noisy features 
neg_tweets['text'] = neg_tweets.text.apply(lambda x : clean_train_data(x))
neg_tweets.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_tweets['text'] = neg_tweets.text.apply(lambda x : clean_train_data(x))


Unnamed: 0,text,airline_sentiment,airline
3,it really aggressive to blast obnoxious ent...,negative,Virgin America
4,and it a really big bad thing about it,negative,Virgin America
5,seriously would pay 30 flight for seats tha...,negative,Virgin America
15,sfo pdx schedule is still mia,negative,Virgin America
17,flew from nyc to sfo last week and couldn f...,negative,Virgin America


In [10]:
#remove stopword 
negative_tweets = neg_tweets['text'].map(remove_stopwords)
#negative_tweets

In [11]:
#convert document to list 
negative_tweets = negative_tweets.tolist()
type(negative_tweets)

list

In [12]:
#stem words document 
full_document = [" ".join([stem(word) for word in sentence.split(" ")]) for sentence in negative_tweets]
full_document[1]

'realli big bad thing'

# Term-Frequency Matrix

In [13]:
#Create term-frquency matrix to the document 
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
vector_matrix = count_vectorizer.fit_transform(full_document)
vector_matrix

<181x778 sparse matrix of type '<class 'numpy.int64'>'
	with 1763 stored elements in Compressed Sparse Row format>

In [14]:
tokens = count_vectorizer.get_feature_names()
#tokens



In [15]:
full_document= vector_matrix.toarray()
full_document.shape

(181, 778)

In [16]:
full__document = full_document.transpose()
full__document.shape

(778, 181)

# Summary of negative tweets of VA airline 

- We extract sentences summary (generated by LSA summarizer) which are subset of its full document by their indexes in the full document matrix 
- we can feed any summary in the same manner(by index of each sentence in its full document)

In [46]:
summ_negative = np.array(( full_document[61], full_document[37], full_document[91], full_document[28], full_document[82], full_document[81], full_document[24],
                          full_document[87], full_document[86], full_document[53], full_document[36], full_document[173], full_document[35], 
                          full_document[154], full_document[75], full_document[6], full_document[42], full_document[103], full_document[130], full_document[54])) 

In [47]:
summary = summ_negative.transpose()
summary.shape

(778, 20)

### Apllying SVD on Term-frequency matrix of the orginal document 

In [48]:
from numpy import array
from scipy.linalg import svd
U_neg, s_neg, VT_neg = svd(full__document) #svd matrix
print("U_neg is equal to") # U give the weights that would match each of the word belonging to the topic.
print(U_neg)
print("S_neg is ") # gives weightage of 1st topic and 2nd topic. 
print(s_neg)
print("VT_neg is ") # VT gives weightage of the documents belonging to each of the topic.
print(VT_neg)

U_neg is equal to
[[-1.45798577e-03  5.12685294e-03 -1.70499597e-03 ...  5.58426481e-03
   6.14178342e-03 -8.01360319e-04]
 [-1.85481544e-02  4.51662135e-02  1.50755472e-02 ...  2.63952129e-02
  -1.00007444e-02  3.86949052e-05]
 [-1.45619326e-02 -9.67353149e-03 -7.27425547e-03 ...  1.30283653e-03
  -4.27843231e-05 -1.38698641e-03]
 ...
 [-2.78025632e-03  1.10512655e-02  2.26443221e-03 ...  7.48733971e-01
   2.33793861e-03  6.31785090e-03]
 [-8.27794004e-03 -8.96160473e-03  5.03895273e-05 ...  6.00760913e-04
   8.52171210e-01  1.79340884e-03]
 [-1.38711233e-02 -8.46102251e-03  7.08400677e-04 ...  7.79803036e-03
   1.16566369e-03  9.24147562e-01]]
S_neg is 
[12.34270171  7.59148325  6.99325782  6.51834605  6.16488108  5.76832687
  5.63278256  5.59829036  5.3931091   5.34910645  5.2000273   5.19034169
  5.07604701  4.93390268  4.89962606  4.7775803   4.69834018  4.68948528
  4.65862751  4.60247796  4.54659323  4.50441807  4.49387865  4.39179813
  4.33516748  4.30087048  4.29393641  4.2518

### Applying SVD on term-frequency matrix of the summary

In [49]:
from numpy import array
from scipy.linalg import svd
U_neg1, s_neg1, VT_neg1 = svd(summary) #svd matrix
print("U_neg1 is equal to") # U give the weights that would match each of the word belonging to the topic.
print(U_neg1)
print("S_neg1 is ") # gives weightage of 1st topic and 2nd topic. 
print(s_neg1)
print("VT_neg1 is ") # VT gives weightage of the documents belonging to each of the topic.
print(VT_neg1)

U_neg1 is equal to
[[ 1.51405371e-16  9.72193250e-19 -3.20324269e-17 ...  0.00000000e+00
   0.00000000e+00  5.63785130e-18]
 [-2.69338970e-17 -1.89481267e-18 -5.03516099e-17 ...  0.00000000e+00
   0.00000000e+00 -3.25260652e-19]
 [-2.50292923e-02  1.88008477e-02  1.27839790e-02 ...  0.00000000e+00
   0.00000000e+00  1.37763658e-03]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  1.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 [-2.41446121e-02 -3.71000048e-03  1.32735830e-02 ...  0.00000000e+00
   0.00000000e+00  9.42566946e-01]]
S_neg1 is 
[9.5664712  4.98031534 4.68842146 4.44120941 4.2017177  4.12699012
 3.43726894 3.31056905 3.29933267 3.07816031 2.9814135  2.91631443
 2.73103439 2.61024026 2.45290284 2.40129711 2.34400727 2.28382725
 2.18384276 1.79544817]
VT_neg1 is 
[[-3.53951490e-01 -3.30971818e-01 -3.03852557e-01 -2.52627709e-01
  -2.43224496e-01 -2.46964585e-

In [50]:
from numpy import zeros
from numpy import diag
# create m x n Sigma matrix
Sigma_neg = zeros((full__document.shape[0], full__document.shape[1])) #original
# populate Sigma with n x n diagonal matrix
Sigma_neg [:full__document.shape[1], :full__document.shape[1]] = diag(s_neg)
# reconstruct matrix
B1 = U_neg.dot(Sigma_neg.dot(VT_neg))
Sigma_neg

array([[12.34270171,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  7.59148325,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  6.99325782, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [51]:
# create m x n Sigma matrix
Sigma_neg1 = zeros((summary.shape[0], summary.shape[1])) #summary
# populate Sigma with n x n diagonal matrix
Sigma_neg1[:summary.shape[1], :summary.shape[1]] = diag(s_neg1)
# reconstruct matrix
B2 = U_neg1.dot(Sigma_neg1.dot(VT_neg1))
#print(B2)
#B1.shape 

In [52]:
Sigma1_sq = np.square(Sigma_neg)
Sigma2_sq = np.square(Sigma_neg1)

In [53]:
Original_U_neg = np.dot(U_neg, Sigma1_sq) #np.square
ou_neg = np.sqrt(np.square(Original_U_neg))
Original_U_neg.shape

(778, 181)

In [54]:
Summary_U_neg = np.dot(U_neg1, Sigma2_sq)
su_neg = np.sqrt(np.square(Summary_U_neg))
Summary_U_neg.shape

(778, 20)

In [55]:
O_T_neg =Original_U_neg.transpose()
O_T_neg1 = ou_neg.transpose()
O_T_neg.shape

(181, 778)

In [56]:
S_T_neg = Summary_U_neg.transpose()
S_T_neg1 = su_neg.transpose()
S_T_neg.shape

(20, 778)

# Top Topics Similarity 

In [57]:
topic1_neg = cosine_similarity ([O_T_neg1[0]], [S_T_neg1[0]])
topic2_neg = cosine_similarity ([O_T_neg1[1]], [S_T_neg1[1]])
topic3_neg = cosine_similarity ([O_T_neg1[2]], [S_T_neg1[2]])
topic4_neg = cosine_similarity ([O_T_neg1[3]], [S_T_neg1[3]])
topic5_neg = cosine_similarity ([O_T_neg1[4]], [S_T_neg1[4]])

In [58]:
print("Similarity of topic1") 
print(topic1_neg)
print("Similarity of topic2")
print(topic2_neg)
print("Similarity of topic3") 
print(topic3_neg) 
print("Similarity of topic4") 
print(topic4_neg)
print("Similarity of topic5")
print(topic5_neg)

Similarity of topic1
[[0.94715807]]
Similarity of topic2
[[0.29705655]]
Similarity of topic3
[[0.12914065]]
Similarity of topic4
[[0.35161918]]
Similarity of topic5
[[0.3849106]]


# Term Significance Similarity 

In [30]:
av_neg = np.sqrt(np.sum(np.square(Original_U_neg), axis=1)) #0 vertical #1 horizonital 
av_neg = av_neg.reshape(1, -1)
av_neg

array([[  3.16227766,   8.24621125,   5.47722558,   4.        ,
          4.47213595,   3.        ,   3.87298335,   3.87298335,
          3.16227766,   3.16227766,   3.46410162,   3.87298335,
          3.60555128,   5.47722558,   3.16227766,   4.        ,
          4.35889894,   4.24264069,   3.46410162,   3.        ,
          3.16227766,   7.81024968,   4.35889894,   3.46410162,
          3.        ,   3.46410162,   3.46410162,   6.8556546 ,
          5.29150262,   2.64575131,   6.        ,   8.71779789,
          3.74165739,   4.        ,   3.16227766,   3.60555128,
          3.60555128,   3.46410162,   3.87298335,   4.24264069,
          3.87298335,   3.46410162,   4.12310563,   3.31662479,
          3.74165739,   5.91607978,   3.31662479,   3.16227766,
          3.74165739,   6.164414  ,   4.12310563,   4.12310563,
          9.11043358,   8.30662386,   3.87298335,   4.47213595,
          4.35889894,   4.        ,   3.31662479,   4.        ,
          2.64575131,   5.38516481,   3.

In [31]:
av1_neg = np.sqrt(np.sum(np.square(Summary_U_neg), axis=1)) # square-root (sqrt)  - square value (square)
av1_neg = av1_neg.reshape(1, -1)
av1_neg

array([[1.39925639e-14, 3.43251410e-15, 3.46410162e+00, 1.32670450e-15,
        7.79823744e-16, 8.02979326e-16, 1.07566810e-15, 1.13308411e-15,
        1.07401575e-15, 5.75900908e-16, 4.42272704e-16, 2.28140723e-16,
        5.42152291e-17, 1.60763437e-16, 2.35828976e-16, 4.94234993e-16,
        4.63097676e-16, 5.57709098e-17, 3.33600670e-17, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        3.74165739e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 4.12310563e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.472135

In [32]:
simi_neg = cosine_similarity (av_neg, av1_neg)
simi_neg

array([[0.77261848]])