##### Author: Krishna Prasanna Bhamidipati (kbhamid2)

In [1]:
#Load the required libraries
import os
import numpy as np
import pandas as pd
import re,string
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras 
from numpy import loadtxt
from xgboost import XGBClassifier
from sentence_transformers import SentenceTransformer

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
!pip install -U sentence-transformers

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### LOADING AND UNDERSTANDING DATASET
In the below two cells, training and test datasets have been loaded. No additional data has been used for this project (apart from the word-embedding text files which are loaded along with word-embedding pre-trained models). 
Basic commands have been run to understand the size of the data, the columns and data-types there are. 

In [3]:
# Path to the NGSIM data
from google.colab import drive
drive.mount('/content/drive/')

%cd /content/drive/My Drive/Colab Notebooks/CSC791 NATURAL LANGUAGE PROCESSING/

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks/CSC791 NATURAL LANGUAGE PROCESSING


In [4]:
#Import the datasets
train_data = pd.read_excel('P1_labeled_dataset.xlsx')
test_data = pd.read_excel('P1_test_dataset.xlsx')

In [5]:
#Running basic sanity checks:
print("Train data shape ", train_data.shape)
print("Test data shape ", test_data.shape, '\n')

print("Train data columns and datatypes: ")
print(train_data.dtypes)

train_data.head(10)

Train data shape  (4829, 6)
Test data shape  (1000, 6) 

Train data columns and datatypes: 
sentId             int64
sentence          object
comparison         int64
preferred         object
mentioned_apps    object
current_app       object
dtype: object


Unnamed: 0,sentId,sentence,comparison,preferred,mentioned_apps,current_app
0,0,Might need to switch to Spotify...,1,O,['Spotify'],Pandora
1,1,Instagram is a copy cat btw.,1,T,['instagram'],Snapchat
2,2,Way better than pandora.,1,T,['Pandora'],Spotify
3,3,one of two (chase) banking apps i’ve never had...,2,T,['Chase Mobile®'],Citi Mobile®
4,4,"It's awesome, I got it recommended by a friend...",2,T,['Temple Run 2'],Subway Surfers
5,5,Way better compared to sound cloud and pandora...,1,T,['Pandora'],Spotify
6,6,"No problems with crashing, and more expansive ...",1,T,"['NYTimes – Breaking Politics, National & Worl...",The Washington Post Classic
7,7,it's better than pandora,2,T,['Pandora'],Spotify
8,8,Compare Bank of America mobile app,1,N,['Bank of America - Mobile Banking'],Wells Fargo Mobile
9,9,"the update, occurring on December 17, 2018, wi...",2,T,['instagram'],Tumblr


### PREPROCESSING --- SENTENCE CLEANING AND TRANSFORMATION
In the three cells below we perform following steps:
1. A new column called cleaned_sentence is created which is devoid of punctuation. It contains only lower-case words without any leading/trailing whitespaces. 
2. Cleaned sentences are tokenized

    <font color = 'Orange'> Note: I have tried nltk's word_tokenize method,but it does not have a built-in capability to remove punctuations. Hence I first substituted for all punctuations with a '' character using regex sub method. Then perform simple string split function to split the sentence into tokens. </font> 

3. The preferred column consits of categorical labels which the model will not allow. They are mapped to numerical labels. 

##### Train data

In [6]:
#Tokenize labeled data:-- Take each row and generate corresponding tokens 
#Make words in the sentence lower case and also remove white spaces. 
#CreateD an additional column called as 'clean sentence' - the word embedding techniques are implemented on this. 
train_data['cleaned_sentence'] = train_data['sentence'].apply(lambda r: re.sub(r'[^\w\s]', '', str(r).lower().strip()))
train_data['tokenized_sent']  = train_data['cleaned_sentence'].apply(lambda r: r.split())                                      

mapping_dict = {'T':1,'O':2,'N':3}
train_data['preferred'] = train_data['preferred'].map(mapping_dict)


train_data.head()

Unnamed: 0,sentId,sentence,comparison,preferred,mentioned_apps,current_app,cleaned_sentence,tokenized_sent
0,0,Might need to switch to Spotify...,1,2,['Spotify'],Pandora,might need to switch to spotify,"[might, need, to, switch, to, spotify]"
1,1,Instagram is a copy cat btw.,1,1,['instagram'],Snapchat,instagram is a copy cat btw,"[instagram, is, a, copy, cat, btw]"
2,2,Way better than pandora.,1,1,['Pandora'],Spotify,way better than pandora,"[way, better, than, pandora]"
3,3,one of two (chase) banking apps i’ve never had...,2,1,['Chase Mobile®'],Citi Mobile®,one of two chase banking apps ive never had an...,"[one, of, two, chase, banking, apps, ive, neve..."
4,4,"It's awesome, I got it recommended by a friend...",2,1,['Temple Run 2'],Subway Surfers,its awesome i got it recommended by a friend a...,"[its, awesome, i, got, it, recommended, by, a,..."


##### Test Data

In [7]:
#Tokenize labeled data:-- Take each row and generate corresponding tokens 
#Make words in the sentence lower case and also remove white spaces. 
#CreateD an additional column called as 'clean sentence' - the word embedding techniques are implemented on this. 
test_data['cleaned_sentence'] = test_data['sentence'].apply(lambda r: re.sub(r'[^\w\s]', '', str(r).lower().strip()))
test_data['tokenized_sent']  = test_data['cleaned_sentence'].apply(lambda r: r.split())                                      

mapping_dict = {'T':1,'O':2,'N':3}
reverse_mapping_dict = {1:'T',2:'O',3:'N'}
test_data['preferred'] = test_data['preferred'].map(mapping_dict)

In [8]:
print("*" * 100)
print("BASELINE MODEL STARTS HERE")
print("*" * 100)

****************************************************************************************************
BASELINE MODEL STARTS HERE
****************************************************************************************************


# <FONT COLOR = "RED"> BASELINE MODEL </FONT>

### GENERATING WORD AND SENTENCE EMBEDDINGS

1. The embedding models used here in building the base-line models are spaCY and TF-IDF vectorization techniques. 
2. These embedding models are different from one another -- while spaCy is a pre-trained model with one of the fastest execution speeds, TF-IDF word embeddings are generated by exploiting the text-corpus we have provided to it (it is not a pre-trained model). 
3. Other embedding techniques like word2vec, glove were also tested in baseline model but they performed poorly for selected choice of evaluation metrics. 

<font color = "Orange"> Note: FOR OUR BASELINE MODEL, WE USE ONLY THE `CLEANED_SENTENCE` AND `TOKENIZED_SENT` COLUMNS. *** I.E. LEMMATIZED TOKENS AND THOSE TREATED FOR STOP-WORDS HAVE NOT BEEN USED FOR THIS TASK. ***
    HOWEVER, THE WORDS ARE ALL LOWER-CASE AND PUNCTUATION HAS BEEN REMOVED WHERE UNNECESSARY. </font>

In [9]:
# LOADING AND INITIALIZING WORD-EMBEDDING MODELS.
# Load the spacy model
# The spacy model is applied directly on the sentence itself.
# It automatically computes the mean of individual words' embeddings in a sentence to get the corresponding sentence embedding.
nlp = spacy.load('en_core_web_sm')

## Tf-Idf vectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2))

#### NOTE ON SPACY AND TFIDF EMBEDDINGS


spaCy embeddings model applies on the entire sentence. i.e. it generates word-embeddings for individual words in a sentence *IMPLICITLY*, and it automatically generates the sentence embedding by taking an average of all its word-embeddings. 
Similarly, TF-IDF words on the whole text-corpus, and hence it computes the sentence embedding directly. (Each element in a single sentence embeddind corresponds to TF-IDF of the word that the element corresponds to. *IN OTHER WORDS, THE WORD-EMBEDDING SIZE IN TF-IDF IS A SCALAR FLOAT VALUE.*

##### Train Data

In [10]:
#Generate sentence embeddings
train_data['spacy_embeddings'] = train_data['cleaned_sentence'].apply(lambda s: nlp(s).vector)

text_corpus = train_data["cleaned_sentence"]
tfidf_train_matrix = tfidf.fit_transform(text_corpus)
tfidf_train_vocabulary = tfidf.vocabulary_

##### Test Data

In [11]:
test_data['spacy_embeddings'] = test_data['cleaned_sentence'].apply(lambda s: nlp(s).vector)
tfidf_test_matrix = tfidf.transform(test_data["cleaned_sentence"])

### DATASET PREPARATION FOR MODEL
1. This step involves creating a train dataset which consists of only the sentence embeddings generated from above steps. 
2. Two label series are generated - One for preference classification and other for comparison classification. These two are fed as y-labels to our baseline classification algorithms in-turn and accuracy is computed for each. 

<font color = "Orange"> Note: NO TRAIN AND TEST SPLIT HAS BEEN PERFORMED BASED ON MOODLE DISCUSSION. THE ENTIRE DATA HAS BEEN USED FOR TRAINING </font> 

##### Train Data

In [12]:
#Creating individual training datasets
train_spacy_sentences = np.asarray(train_data['spacy_embeddings'].tolist())
train_tfidf_sentences = tfidf_train_matrix.copy()
train_pref_labels = np.asarray(train_data['preferred'].tolist())
train_comp_labels = np.asarray(train_data['comparison'].tolist())

##### Test Data

In [13]:
#Creating individual training datasets
test_spacy_sentences = np.asarray(test_data['spacy_embeddings'].tolist())
test_tfidf_sentences = tfidf_test_matrix.copy()
test_pref_labels = np.asarray(test_data['preferred'].tolist())
test_comp_labels = np.asarray(test_data['comparison'].tolist())

### MODEL SELECTION
The models selected for these tasks are Random Forest classifier and multi-nomial Logistic Regression classifier. These models have been selected because they support multi-class classification which is desired for our tasks.

### EVALUATION METRICS
1. Accuracy score - Since we have multi-class classification, accuracy is a good evaluation metric to gauge how our models are performing. 
2. Confusion matrix - Confusion matrix is being printed out to understand our classification model better and access which classes are being most misclassified. 

### MODEL BUILDING
There will be 8 models run in total -- 4 models for each label such that - 2 embeddings are covered, two classification algorithms are covered. For each task, following steps have been implemented:
1. `random_forest_classification` and `multi_logistic_regression_classificatin` functions train on training data and predict for test data. Each of these functions are called 4 times, for two different set of labels, for two different embedding techniques
2. For both tasks accuracy scores from models are computed,confusion matrices are printed out and other evaluation metrics are displayed.

#### Random Forest Classification

In [14]:
# Fit Random Forest Classification to the training set, predict the class labels for test data.
def random_forest_classification(X_train,y_train,X_test,y_test,task, embedding):
    classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
    classifier.fit(X_train, y_train)

    # Predict the test set results
    y_pred = classifier.predict(X_test)
    if (task == 'Preferences'):
        y_test = np.vectorize(reverse_mapping_dict.get)(y_test)
        y_pred = np.vectorize(reverse_mapping_dict.get)(y_pred)
   
    # Create Confusion Matrix
    print(f'Random Forest Confusion Matrix for {task} label classification for {embedding} embedding:')
    print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))
    
    #Print accuracy scores
    print('\n')
    rf_acc_score=accuracy_score(y_test,y_pred)
    print(f'Random Forest Accuracy for {task} label classification for {embedding} embedding: {rf_acc_score}')

    #Print other evaluation scores
    print('\n')
    print(classification_report(y_test, y_pred))
          
    return

#### Multinomial Logistic Regression Classification

In [15]:
# Fit Multinomial Logistic Regression Classification to the training set, predict the class labels for test data.
def multi_logistic_regression_classification(X_train,y_train,X_test,y_test,task,embedding):
    classifier = LogisticRegression (multi_class="multinomial",solver='newton-cg')
    classifier.fit(X_train, y_train)

    # Predict the test set results
    y_pred = classifier.predict(X_test)
    y_pred_probs = classifier.predict_proba(X_test)
    if (task == 'Preferences'):
        y_test = np.vectorize(reverse_mapping_dict.get)(y_test)
        y_pred = np.vectorize(reverse_mapping_dict.get)(y_pred)
    
    # Create Confusion Matrix
    print(f'Logistic Regression Confusion Matrix for {task} label classification for {embedding} embedding:')
    print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))
    
    #Print accuracy scores
    nb_acc_score=accuracy_score(y_test,y_pred)
    print(f'Logistic Regression for {task} label classification for {embedding} embedding: {nb_acc_score}')
    
    #Print other evaluation scores
    print(classification_report(y_test, y_pred))

    return

### MODEL IMPLEMENTATION
There are four calls to each classification algorithm - since there are 2 embedding types and 2 labels. 

##### Random Forest Classification - Model implementation

In [16]:
#Spacy embedding - preference labels
random_forest_classification(train_spacy_sentences,train_pref_labels,test_spacy_sentences,test_pref_labels,'Preferences','Spacy')

Random Forest Confusion Matrix for Preferences label classification for Spacy embedding:
Predicted    N    O    T
Actual                  
N          150   73   67
O           53  214   76
T           33   49  285


Random Forest Accuracy for Preferences label classification for Spacy embedding: 0.649


              precision    recall  f1-score   support

           N       0.64      0.52      0.57       290
           O       0.64      0.62      0.63       343
           T       0.67      0.78      0.72       367

    accuracy                           0.65      1000
   macro avg       0.65      0.64      0.64      1000
weighted avg       0.65      0.65      0.64      1000



In [17]:
#Spacy embedding - comparison labels
random_forest_classification(train_spacy_sentences,train_comp_labels,test_spacy_sentences,test_comp_labels,'Comparison','Spacy')

Random Forest Confusion Matrix for Comparison label classification for Spacy embedding:
Predicted   0    1    2
Actual                 
0          65   46   37
1          18  360   83
2          13  113  265


Random Forest Accuracy for Comparison label classification for Spacy embedding: 0.69


              precision    recall  f1-score   support

           0       0.68      0.44      0.53       148
           1       0.69      0.78      0.73       461
           2       0.69      0.68      0.68       391

    accuracy                           0.69      1000
   macro avg       0.69      0.63      0.65      1000
weighted avg       0.69      0.69      0.68      1000



In [18]:
#TF-IDF embedding - preference labels
random_forest_classification(train_tfidf_sentences,train_pref_labels,test_tfidf_sentences,test_pref_labels,'Preferences','TF-IDF')

Random Forest Confusion Matrix for Preferences label classification for TF-IDF embedding:
Predicted    N    O    T
Actual                  
N          177   61   52
O           50  248   45
T           37   39  291


Random Forest Accuracy for Preferences label classification for TF-IDF embedding: 0.716


              precision    recall  f1-score   support

           N       0.67      0.61      0.64       290
           O       0.71      0.72      0.72       343
           T       0.75      0.79      0.77       367

    accuracy                           0.72      1000
   macro avg       0.71      0.71      0.71      1000
weighted avg       0.71      0.72      0.71      1000



In [19]:
#TF-IDF embedding - comparison labels
random_forest_classification(train_tfidf_sentences,train_comp_labels,test_tfidf_sentences,test_comp_labels,'Comparison','TF-IDF')

Random Forest Confusion Matrix for Comparison label classification for TF-IDF embedding:
Predicted   0    1    2
Actual                 
0          69   54   25
1          11  418   32
2           3   94  294


Random Forest Accuracy for Comparison label classification for TF-IDF embedding: 0.781


              precision    recall  f1-score   support

           0       0.83      0.47      0.60       148
           1       0.74      0.91      0.81       461
           2       0.84      0.75      0.79       391

    accuracy                           0.78      1000
   macro avg       0.80      0.71      0.73      1000
weighted avg       0.79      0.78      0.77      1000



##### Logistic Regression Classification - Model implementation

In [20]:
#Spacy embedding - preference labels
multi_logistic_regression_classification(train_spacy_sentences,train_pref_labels,test_spacy_sentences,test_pref_labels,'Preferences','Spacy')

Logistic Regression Confusion Matrix for Preferences label classification for Spacy embedding:
Predicted    N    O    T
Actual                  
N          139   78   73
O           70  180   93
T           57   62  248
Logistic Regression for Preferences label classification for Spacy embedding: 0.567
              precision    recall  f1-score   support

           N       0.52      0.48      0.50       290
           O       0.56      0.52      0.54       343
           T       0.60      0.68      0.64       367

    accuracy                           0.57      1000
   macro avg       0.56      0.56      0.56      1000
weighted avg       0.56      0.57      0.56      1000



In [21]:
#Spacy embedding - comparison labels
multi_logistic_regression_classification(train_spacy_sentences,train_comp_labels,test_spacy_sentences,test_comp_labels,'Comparison','Spacy')

Logistic Regression Confusion Matrix for Comparison label classification for Spacy embedding:
Predicted   0    1    2
Actual                 
0          21   59   68
1          25  321  115
2          12  112  267
Logistic Regression for Comparison label classification for Spacy embedding: 0.609
              precision    recall  f1-score   support

           0       0.36      0.14      0.20       148
           1       0.65      0.70      0.67       461
           2       0.59      0.68      0.63       391

    accuracy                           0.61      1000
   macro avg       0.54      0.51      0.50      1000
weighted avg       0.59      0.61      0.59      1000



In [22]:
#TF-IDF embedding - preference labels
multi_logistic_regression_classification(train_tfidf_sentences,train_pref_labels,test_tfidf_sentences,test_pref_labels,'Preferences','TF-IDF')

Logistic Regression Confusion Matrix for Preferences label classification for TF-IDF embedding:
Predicted    N    O    T
Actual                  
N          189   69   32
O           41  267   35
T           31   39  297
Logistic Regression for Preferences label classification for TF-IDF embedding: 0.753
              precision    recall  f1-score   support

           N       0.72      0.65      0.69       290
           O       0.71      0.78      0.74       343
           T       0.82      0.81      0.81       367

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.75      0.75      0.75      1000



In [23]:
#TF-IDF embedding - comparison labels
multi_logistic_regression_classification(train_tfidf_sentences,train_comp_labels,test_tfidf_sentences,test_comp_labels,'Comparison','TF-IDF')

Logistic Regression Confusion Matrix for Comparison label classification for TF-IDF embedding:
Predicted   0    1    2
Actual                 
0          49   47   52
1           9  379   73
2           5   63  323
Logistic Regression for Comparison label classification for TF-IDF embedding: 0.751
              precision    recall  f1-score   support

           0       0.78      0.33      0.46       148
           1       0.78      0.82      0.80       461
           2       0.72      0.83      0.77       391

    accuracy                           0.75      1000
   macro avg       0.76      0.66      0.68      1000
weighted avg       0.75      0.75      0.74      1000



In [24]:
print("*" * 100)
print("BASELINE MODEL ENDS HERE")
print("*" * 100)

****************************************************************************************************
BASELINE MODEL ENDS HERE
****************************************************************************************************


In [25]:
print("*" * 100)
print("PROPOSED MODEL STARTS HERE")
print("*" * 100)

****************************************************************************************************
PROPOSED MODEL STARTS HERE
****************************************************************************************************


# <FONT COLOR = "RED">PROPOSED MODEL </FONT>

### PREPROCESSING - STOP WORDS REMOVAL AND LEMMATIZATION

For our baseline model we generated a `cleaned_sentence` column in our dataset. This sentence had no punctuations, whitespaces and capital letters. We perform 

In [26]:
#Remove leading and trailing white spaces
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()

#Remove stop words
def remove_stopwords(sentence):
    return ' '.join([word for word in sentence.split() if word not in stopwords])

#Perform lemmatization to get root words. 
def get_lemmatized_word(sentence):
    return ' '.join([lemmatizer.lemmatize(word) for word in sentence.split()])

##### Train Data

In [27]:
# #Clean the sentence further by removing stopwords and getting lemmatized word
train_data['extra_cleaned_sentence'] = train_data['cleaned_sentence'].apply(lambda s: remove_stopwords(s)).apply(lambda s: get_lemmatized_word(s))

##### Test Data

In [28]:
# #Clean tokens by removing stopwords and getting lemmatized word
test_data['extra_cleaned_sentence'] = test_data['cleaned_sentence'].apply(lambda s: remove_stopwords(s)).apply(lambda s: get_lemmatized_word(s))

### WORD EMBEDDINGS -- UNIVERSAL SENTENCE ENCODER
In this module Google's universal sentence encoder is used to generate sentence embeddings. Sentence Transformers were used to generate the embeddings as well but the model performance was comparitively better when using this technique

In [29]:
#Load the encoder
universal_sentence_encoder  = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

#Generate sentence embeddings using universal sentence encoder.
train_US_embeddings = universal_sentence_encoder(train_data['cleaned_sentence'].values.tolist())
test_US_embeddings = universal_sentence_encoder(test_data['cleaned_sentence'].values.tolist())

INFO:absl:Using /tmp/tfhub_modules to cache modules.


### MODEL
1. XGBoost is a boosting based ensemble decision tree algorithm which as been used to model the data. 200 weak estimators have been used to train the model. Refer to the function xgboost_classification
2. The same evaluation metrics used in baseline model have also been used here. 


In [30]:
def xgboost_classification(X_train,y_train,X_test,y_test,task, embedding):
  classifier = XGBClassifier(n_estimators=200)
  classifier.fit(X_train, y_train)

  # Predict the test set results
  y_pred = classifier.predict(X_test)
  y_pred_probs = classifier.predict_proba(X_test)
  if (task == 'Preferences'):
      y_test = np.vectorize(reverse_mapping_dict.get)(y_test)
      y_pred = np.vectorize(reverse_mapping_dict.get)(y_pred)
  
  # Create Confusion Matrix
  print(f'XGBoost Confusion Matrix for {task} label classification for {embedding} embedding:')
  print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))
  
  #Print accuracy scores
  nb_acc_score=accuracy_score(y_test,y_pred)
  print(f'XGBoost for {task} label classification for {embedding} embedding: {nb_acc_score}')
  
  #Print other evaluation scores
  print(classification_report(y_test, y_pred))

  return

##### XGBoost Classification - Model implementation

In [31]:
xgboost_classification(train_US_embeddings.numpy(),train_pref_labels,test_US_embeddings.numpy(),test_pref_labels,'Preferred','Universal Sentence Encoder')

XGBoost Confusion Matrix for Preferred label classification for Universal Sentence Encoder embedding:
Predicted    1    2    3
Actual                  
1          289   47   31
2           49  241   53
3           47   47  196
XGBoost for Preferred label classification for Universal Sentence Encoder embedding: 0.726
              precision    recall  f1-score   support

           1       0.75      0.79      0.77       367
           2       0.72      0.70      0.71       343
           3       0.70      0.68      0.69       290

    accuracy                           0.73      1000
   macro avg       0.72      0.72      0.72      1000
weighted avg       0.73      0.73      0.73      1000



In [32]:
xgboost_classification(train_US_embeddings.numpy(),train_comp_labels,test_US_embeddings.numpy(),test_comp_labels,'Comparison','Universal Sentence Encoder')

XGBoost Confusion Matrix for Comparison label classification for Universal Sentence Encoder embedding:
Predicted   0    1    2
Actual                 
0          77   50   21
1          13  403   45
2           7   64  320
XGBoost for Comparison label classification for Universal Sentence Encoder embedding: 0.8
              precision    recall  f1-score   support

           0       0.79      0.52      0.63       148
           1       0.78      0.87      0.82       461
           2       0.83      0.82      0.82       391

    accuracy                           0.80      1000
   macro avg       0.80      0.74      0.76      1000
weighted avg       0.80      0.80      0.80      1000



In [33]:
print("*" * 100)
print("PROPOSED MODEL ENDS HERE")
print("*" * 100)

****************************************************************************************************
PROPOSED MODEL ENDS HERE
****************************************************************************************************
