# Text Summarizaiton Using Traditional Model

In [1]:
!pip install nltk



In [2]:
!pip install imblearn



In [3]:
import pandas as pd
import string
import re
import pickle
import zipfile

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
from multiprocessing import Pool

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kesha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kesha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kesha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kesha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
nltk.download('wordnet', download_dir='/root/nltk_data')

with zipfile.ZipFile('/root/nltk_data/corpora/wordnet.zip', 'r') as zip_ref:
    zip_ref.extractall('/root/nltk_data/corpora/')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
train_df = pd.read_csv("../Dataset/cnn_dailymail/train.csv")

In [8]:
train_df.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [9]:
len(train_df)

287113

In [10]:
train_df = train_df[:6000]

len(train_df)

6000

In [11]:
def preprocess_text(text):

    lower = text.lower()
    tokens = word_tokenize(lower)

    tokens = [word for word in tokens if word not in string.punctuation]
    tokens = [word for word in tokens if not re.match(r'\d+', word)]

    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    lemma = WordNetLemmatizer()
    lemma_tokens = [lemma.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemma_tokens)

In [12]:
%%time

sentences = []
labels = []

for _, row in train_df.iterrows():
  article = row['article']
  sumary = row['highlights']

  processed_sumary = set(preprocess_text(sent) for sent in sent_tokenize(sumary))
  sentence = sent_tokenize(article)

  for sent in sentence:
   processed_sentences = preprocess_text(sent)

   if processed_sentences in processed_sumary:
      sentences.append(sent)
      labels.append(1)
   else:
    sentences.append(sent)
    labels.append(0)

CPU times: total: 37.3 s
Wall time: 1min 13s


In [13]:
tfidf = TfidfVectorizer(max_features=500)
scaled = MinMaxScaler()

x = tfidf.fit_transform(sentences).toarray()
x_scaled = scaled.fit_transform(x)

### Reducing the dimensions

In [14]:
# %%time

# n_comp = 100

# lsa = TruncatedSVD(n_components=n_comp, random_state=42)
# reduced_x = lsa.fit_transform(x_scaled)

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, labels, test_size=0.3, random_state=42)

In [16]:
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

### Finding the best Parameters for effective predictions

In [17]:
def metrics_score(model, x_test, y_test):

  prediction = model.predict(x_test)
  true = y_test

  dic = {
      'Accuracy': accuracy_score(true, prediction),
      'Precision': precision_score(true, prediction),
      'Recall': recall_score(true, prediction),
      'F1 Score': f1_score(true, prediction)
  }

  return dic

In [18]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

In [19]:
%%time

svc_grid = GridSearchCV(estimator=SVC(class_weight='balanced', max_iter=15), param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

svc_grid.fit(x_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
CPU times: total: 7.28 s
Wall time: 1h 11min 46s




In [20]:
model = svc_grid.best_estimator_

print(f"Best Parameters: {svc_grid.best_params_}")
print(f"Best Cross Validation Score: {svc_grid.best_score_}")

Best Parameters: {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
Best Cross Validation Score: 0.7899050277634494


In [21]:
scores = metrics_score(model=model, x_test=x_test, y_test=y_test)

scores = pd.DataFrame(scores, index=[0])
scores

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
0,0.959629,0.00295,0.049383,0.005567


### Getting the Summary

In [32]:
def get_summary(text):
  sentences = [preprocess_text(sent) for sent in sent_tokenize(text)]
  print(sentences)
  print("\n")
    
  sent_vector = tfidf.transform(sentences).toarray()
  # print(sent_vector.shape)
  sent_vector = scaled.transform(sent_vector)
    
  # reduced_vector = lsa.transform(sent_vector)
  # print(reduced_vector.shape)
  
  predictions = model.predict(sent_vector)
  # print(predictions)

  relevant_sentences = [sent for sent, preds in zip(sentences, predictions) if preds == 1]
  # print(relevant_sentences)
  summary = " ".join(relevant_sentences)

  return summary

In [33]:
test_df = pd.read_csv("../Dataset/cnn_dailymail/validation.csv")

In [36]:
text = test_df['article'][8]

In [37]:
get_summary(text)

['ronda rousey recorded fastest-ever finish ufc title fight submitted cat zingano second los angeles', 'rousey expected face toughest examination reign bantamweight champion unbeaten zingano', 'avoided flying knee opening second rousey took opponent set work trying execute trademark armbar', 'scroll watch rousey beat zingano second', 'ronda rousey manoeuvre position submit cat zingano second fight', 'rousey attempt lock trademark arm bar finish defended bantamweight title', 'rousey console zingano stunning victory inside second staple center los angeles', 'rousey grapple zingano celebrating octagon record-breaking victory', 'ronda rousey bt cat zingano via sub', 'holly holm bt raquel pennington via sd', 'jake ellenberger bt josh koscheck via sub', 'alan jouban bt richard walsh via ko', 'tony ferguson bt gleison tibau via sub', 'roan carneiro bt mark munoz via sub', 'roman salazar bt norifumi yamamoto n/c', 'tim mean bt dhiego lima via tko', 'derrick lewis bt ruan potts via tko', 'valmi

'rousey attempt lock trademark arm bar finish defended bantamweight title rousey grapple zingano celebrating octagon record-breaking victory ronda rousey bt cat zingano via sub holly holm bt raquel pennington via sd jake ellenberger bt josh koscheck via sub alan jouban bt richard walsh via ko tony ferguson bt gleison tibau via sub roan carneiro bt mark munoz via sub roman salazar bt norifumi yamamoto n/c tim mean bt dhiego lima via tko derrick lewis bt ruan potts via tko valmir lazaro bt james krause via sd masio fullen bt alexander torres via sd rousey forced challenger tap middleweight champion chris weidman originally scheduled fight vitor belfort withdraw injured holm moved boxing career mixed martial art remains unbeaten'

### *Conclusion => Model is predicting the summary quite accurately*

# Saving the processor and model files

In [38]:
with open("tfidf.pkl", "wb") as vector_file:
    pickle.dump(tfidf, vector_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaled, scaler_file)

with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
    