In [17]:
import nltk

# Lab 1: Tokenization

In [47]:
DATA1 = '''Formula One (more commonly known as Formula 1 or F1) is the highest class of international racing for open-wheel single-seater formula racing cars sanctioned by the F�d�ration Internationale de l'Automobile (FIA). The FIA Formula One World Championship has been one of the premier forms of racing around the world since its inaugural season in 1950. The word formula in the name refers to the set of rules to which all participants' cars must conform. A Formula One season consists of a series of races, known as Grands Prix. Grands Prix take place in multiple countries and continents around the world on either purpose - built circuits or closed public roads.'''

In [48]:
# using nltk
print(nltk.word_tokenize(DATA1))
print(nltk.sent_tokenize(DATA1))
print(nltk.WhitespaceTokenizer().tokenize(DATA1))

['Formula', 'One', '(', 'more', 'commonly', 'known', 'as', 'Formula', '1', 'or', 'F1', ')', 'is', 'the', 'highest', 'class', 'of', 'international', 'racing', 'for', 'open-wheel', 'single-seater', 'formula', 'racing', 'cars', 'sanctioned', 'by', 'the', 'F�d�ration', 'Internationale', 'de', "l'Automobile", '(', 'FIA', ')', '.', 'The', 'FIA', 'Formula', 'One', 'World', 'Championship', 'has', 'been', 'one', 'of', 'the', 'premier', 'forms', 'of', 'racing', 'around', 'the', 'world', 'since', 'its', 'inaugural', 'season', 'in', '1950', '.', 'The', 'word', 'formula', 'in', 'the', 'name', 'refers', 'to', 'the', 'set', 'of', 'rules', 'to', 'which', 'all', 'participants', "'", 'cars', 'must', 'conform', '.', 'A', 'Formula', 'One', 'season', 'consists', 'of', 'a', 'series', 'of', 'races', ',', 'known', 'as', 'Grands', 'Prix', '.', 'Grands', 'Prix', 'take', 'place', 'in', 'multiple', 'countries', 'and', 'continents', 'around', 'the', 'world', 'on', 'either', 'purpose', '-', 'built', 'circuits', 'or

In [49]:
def word_tokenizer(data:str) -> list:
  tokens = []
  token = ''
  for chr in data:
    if (chr in [' ', '(', ')', '[', ']', '.']):
      if (len(token) > 0): tokens.append(token)
      token = ''
    else:
      token += chr
  return tokens

In [50]:
def sent_tokenizer(data:str, max_thresh = 3) -> list:
  tokens = []
  token = ''
  for chr in data:
    token += chr
    if (chr == '\n'):
      tokens.append(token)
      token = ''
    if (chr == '.'):
      if (len(word_tokenizer(token).pop()) > max_thresh):
        tokens.append(token)
        token = ''
  return tokens

In [51]:
print(word_tokenizer(DATA1))
print(sent_tokenizer(DATA1))

['Formula', 'One', 'more', 'commonly', 'known', 'as', 'Formula', '1', 'or', 'F1', 'is', 'the', 'highest', 'class', 'of', 'international', 'racing', 'for', 'open-wheel', 'single-seater', 'formula', 'racing', 'cars', 'sanctioned', 'by', 'the', 'F�d�ration', 'Internationale', 'de', "l'Automobile", 'FIA', 'The', 'FIA', 'Formula', 'One', 'World', 'Championship', 'has', 'been', 'one', 'of', 'the', 'premier', 'forms', 'of', 'racing', 'around', 'the', 'world', 'since', 'its', 'inaugural', 'season', 'in', '1950', 'The', 'word', 'formula', 'in', 'the', 'name', 'refers', 'to', 'the', 'set', 'of', 'rules', 'to', 'which', 'all', "participants'", 'cars', 'must', 'conform', 'A', 'Formula', 'One', 'season', 'consists', 'of', 'a', 'series', 'of', 'races,', 'known', 'as', 'Grands', 'Prix', 'Grands', 'Prix', 'take', 'place', 'in', 'multiple', 'countries', 'and', 'continents', 'around', 'the', 'world', 'on', 'either', 'purpose', '-', 'built', 'circuits', 'or', 'closed', 'public', 'roads']
["Formula One (m

# Lab 2: Porter Stemmer

In [91]:
def is_vowel(letter:str) -> bool:
  if (letter in ['a', 'e', 'i', 'o', 'u']):
    return True
  if (letter == 'y'):
    return True
  return False

def is_consonant(letter:str) -> bool:
  return not is_vowel(letter)

def x_V_x(stem:str) -> bool:
  for letter in stem:
    if (is_vowel(letter)):
      return True
  return False

def x_D(stem:str) -> bool:
  l1 = stem[-1]
  l2 = stem[-2]
  return is_consonant(l1) and is_consonant(l2)

def x_O(stem:str) -> bool:
  if (len(stem) < 3): return False

  c1 = stem[-3]
  v = stem[-2]
  c2 = stem[-1]

  if (c2 in ['w', 'x', 'y']):
    return False
  
  return is_consonant(c1) and is_vowel(v) and is_consonant(c2)

def get_form(stem:str) -> str:
  form = ' '

  for letter in stem:
    if (is_vowel(letter)):
      if (form[-1] != 'v'):
        form += 'v'
    else:
      if (form[-1] != 'c'):
        form += 'c'

  return form

def get_m(stem:str) -> int:
  form = get_form(stem)
  return form.count('vc')

In [92]:
def step1(stem:str) -> str:
  if (stem.endswith('sses')):
    return stem.replace('sses', 'ss')
  if (stem.endswith('ies')):
    return stem.replace('ies', 'i')
  if (stem.endswith('ss')):
    # return stem.replace('ss', 'ss')
    return stem
  if (stem.endswith('s')):
    # return stem.replace('s', '')
    return stem[0:-1]
  return stem

In [102]:
def step2b(stem:str) -> str:
  if (stem.endswith('at')):
    return stem + 'e'
  if (stem.endswith('bl')):
    return stem + 'e'
  if (x_O(stem) and not (stem.endswith(('l', 's', 'z')))):
    return stem[0:-1]
  if (get_m(stem) > 1 and x_O(stem)):
    return stem + 'e'
  return stem

def step2(stem:str) -> str:
  if (get_m(stem) > 1 and stem.endswith('eed')):
    return stem.replace('eed', 'ee')
  if (x_V_x(stem) and stem.endswith('ed')):
    # return step2b(stem.replace('ed', ''))
    return step2b(stem[0:-2])
  if (x_V_x(stem) and stem.endswith('ing')):
    # return step2b(stem.replace('ing', ''))
    return step2b(stem[0:-3])
  return stem

In [94]:
def step3(stem:str) -> str:
  if (x_V_x(stem) and stem.endswith('y')):
    stem[-1] = 'i'
    return stem
    # return stem.replace('y', 'i')
  return stem

In [95]:
def step4(stem:str) -> str:
  if (get_m(stem) > 0):
    if (stem.endswith('ational')):
      return stem.replace('ational', 'ate')
    if (stem.endswith('ization')):
      return stem.replace('ization', 'ize')
    if (stem.endswith('biliti')):
      return stem.replace('biliti', 'ble')
  return stem

In [96]:
def step5(stem:str) -> str:
  if (get_m(stem) > 0):
    if (stem.endswith('icate')):
      return stem.replace('icate', 'ic')
    if (stem.endswith('ful')):
      return stem[0:-3]
    if (stem.endswith('ness')):
      return stem[0:-4]
  return stem

In [97]:
def step6(stem:str) -> str:
  if (get_m(stem) > 0):
    if (stem.endswith('ance')):
      return stem.replace('ance', '')
    if (stem.endswith('ent')):
      return stem.replace('ent', '')
    if (stem.endswith('ive')):
      return stem.replace('ive', '')
    if (stem.endswith('ize')):
      return stem.replace('ize', '')
  return stem

In [98]:
def step7a(stem:str) -> str:
  if (get_m(stem) > 1 and stem.endswith('e')):
    return stem[0:-1]
  if (((get_m(stem) == 1) and not x_O(stem)) and stem.endswith('ness')):
    return stem.replace('ness', '')
    # return stem[0:-4]
  return stem
  
def step7b(stem:str) -> str:
  if ((get_m(stem) > 1) and x_D(stem) and stem.endswith('l')):
    return stem[0:-1]
  return stem

In [99]:
def PorterStemmer(word:str) -> str:
  stem = word
  stem = step1(stem)
  stem = step2(stem)
  stem = step3(stem)
  stem = step4(stem)
  stem = step5(stem)
  stem = step6(stem)
  stem = step7a(stem)
  stem = step7b(stem)
  return stem

In [103]:
for word in ['computers', 'singing', 'controlling', 'generalizations', 'elephants', 'doing']:
  print(PorterStemmer(word))

computer
sing
control
general
elephant
do


# Lab 2: Lemmatization

In [108]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = ['kites', 'babies', 'dogs', 'flying', 'smiling', 'driving', 'died', 'tried', 'feet']
for word in words:
  print(word + ": " + lemmatizer.lemmatize(word))

kites: kite
babies: baby
dogs: dog
flying: flying
smiling: smiling
driving: driving
died: died
tried: tried
feet: foot


In [21]:
def lemmatize(word):
  word = word.lower()

  # Handling irregular nouns
  irregular_nouns = {
    'men': 'man',
    'women': 'woman',
    'children': 'child',
    'teeth': 'tooth',
    'feet': 'foot',
    'mice': 'mouse'
  }
  if word in irregular_nouns:
    return irregular_nouns[word]
  
  # Handling regular plural nouns
  if word.endswith('ies'):
    if len(word) > 3:
      return word[:-3] + 'y'
  elif word.endswith('es'):
    if word[-3] in "s,x,z,o":
      return word[:-2]
    else:
      return word[:-1]
  elif word.endswith('s'):
    return word[:-1]
  
  # Handling past tense and past participle of regular verbs
  if word.endswith('ed'):
    if len(word) > 2:
      if word[-3] == word[-4]:  # e.g., stopped -> stop
        return word[:-3]
      return word[:-2]
    
  # Handling present participle of regular verbs
  if word.endswith('ing'):
    if len(word) > 3:
      if word[-4] == word[-5]:  # e.g., running -> run
        return word[:-4]
      return word[:-3]
  # Return the word if no rules are applied
  return word

In [22]:
words = ["running", "stopped", "children", "boxes", "teeth"]
lemmatized_words = [lemmatize(word) for word in words]
print(lemmatized_words)

['run', 'stop', 'child', 'box', 'tooth']


# Lab 3: HMM Viterbi

In [128]:
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     /home/jaynakum/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jaynakum/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [129]:
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
print(nltk_data[:2])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [130]:
from sklearn.model_selection import train_test_split
# split data into training and validation set in the ratio 80:20
train_set,test_set = train_test_split(nltk_data,train_size=0.80,test_size=0.20,random_state = 101)

# create list of train and test tagged words
train_tagged_words = [ tup for sent in train_set for tup in sent ]
test_tagged_words = [ tup for sent in test_set for tup in sent ]
print(len(train_tagged_words))
print(len(test_tagged_words))
train_tagged_words[:5]

80310
20366


[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

In [131]:
# check how many unique tags are present in training data
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)

# check total words in vocabulary
vocab = {word for word,tag in train_tagged_words}

12
{'PRT', 'X', 'ADV', 'ADJ', 'ADP', 'NOUN', 'NUM', 'CONJ', 'VERB', 'DET', '.', 'PRON'}


In [132]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
  tag_list = [pair for pair in train_bag if pair[1]==tag]
  count_tag = len(tag_list) # total number of times the passed tag occurred in train_bag
  w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
# now calculate the total number of times the passed word occurred as the passed tag.
  count_w_given_tag = len(w_given_tag_list)
  return (count_w_given_tag, count_tag)

In [133]:
# compute Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
  tags = [pair[1] for pair in train_bag]
  count_t1 = len([t for t in tags if t==t1])
  count_t2_t1 = 0
  for index in range(len(tags)-1):
    if tags[index]==t1 and tags[index+1] == t2:
      count_t2_t1 += 1
  return (count_t2_t1, count_t1)

In [134]:
import numpy as np
import pandas as pd

# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
  for j, t2 in enumerate(list(tags)):
    tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

# convert the matrix to a df for better readability
#the table is same as the transition table shown in section 3 of article
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,PRT,X,ADV,ADJ,ADP,NOUN,NUM,CONJ,VERB,DET,.,PRON
PRT,0.001174,0.012133,0.009393,0.082975,0.019569,0.250489,0.056751,0.002348,0.401174,0.10137,0.04501,0.017613
X,0.185086,0.075726,0.025754,0.017682,0.142226,0.061695,0.003075,0.010379,0.206419,0.05689,0.160869,0.0542
ADV,0.01474,0.022886,0.081458,0.130721,0.119472,0.032196,0.029868,0.006982,0.339022,0.071373,0.139255,0.012025
ADJ,0.011456,0.020971,0.005243,0.063301,0.080583,0.696893,0.021748,0.016893,0.011456,0.005243,0.066019,0.000194
ADP,0.001266,0.034548,0.014553,0.107062,0.016958,0.323589,0.063275,0.001012,0.008479,0.320931,0.038724,0.069603
NOUN,0.043935,0.028825,0.016895,0.012584,0.176827,0.262344,0.009144,0.042454,0.149134,0.013106,0.240094,0.004659
NUM,0.026062,0.202428,0.00357,0.035345,0.037487,0.35166,0.18422,0.014281,0.020707,0.00357,0.119243,0.001428
CONJ,0.004391,0.00933,0.05708,0.113611,0.055982,0.349067,0.040615,0.000549,0.150384,0.123491,0.035126,0.060373
VERB,0.030663,0.21593,0.083886,0.06639,0.092357,0.110589,0.022836,0.005433,0.167956,0.13361,0.034807,0.035543
DET,0.000287,0.045134,0.012074,0.206411,0.009918,0.635906,0.022855,0.000431,0.040247,0.006037,0.017393,0.003306


In [135]:
def Viterbi(words, train_bag = train_tagged_words):
  state = []
  T = list(set([pair[1] for pair in train_bag]))
  for key, word in enumerate(words):
    #initialise list of probability column for a given observation
    p = []
    for tag in T:
      if key == 0:
        transition_p = tags_df.loc['.', tag]
      else:
        transition_p = tags_df.loc[state[-1], tag]
      # compute emission and state probabilities
      emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
      state_probability = emission_p * transition_p
      p.append(state_probability)
    pmax = max(p)
    # getting state for which probability is maximum
    state_max = T[p.index(pmax)]
    state.append(state_max)
  return list(zip(words, state))

In [136]:
test_sent = "Will can see Marry"
pred_tags_withoutRules = Viterbi(test_sent.split())
print(pred_tags_withoutRules)

[('Will', 'PRT'), ('can', 'VERB'), ('see', 'VERB'), ('Marry', 'PRT')]


# Lab 4: Turney

In [13]:
import math
import re
import json
import nltk
nltk.download('averaged_perceptron_tagger',quiet=True)

True

In [14]:
def loadReviews(fileName):
  list_pos = []
  list_neg = []
  data = []
  with open(fileName, 'r') as f:
    for line in f:
      data.append(json.loads(line))
  for elem in data:
    if float(elem["overall"]) >= 3.0:
      list_pos.append(elem["reviewText"])
    else:
      list_neg.append(elem["reviewText"])
  return list_pos, list_neg

In [15]:
def make_datasets(fileName):
  all_positive_reviews, all_negative_reviews = loadReviews(fileName)
  dataset = {'train': {'neg': [], 'pos': []}, 'test': {'neg': [], 'pos': []}}
  dataset['train']['pos'] = (all_positive_reviews[:20000])
  dataset['train']['neg'] = (all_negative_reviews[:20000])
  dataset['test']['pos'] = (all_positive_reviews[-50:])
  dataset['test']['neg'] = (all_negative_reviews[-50:])
  return dataset

In [16]:
def find_pattern(postag):
  tag_pattern = []
  for k in range(len(postag) - 2):
    if (postag[k][1] == "JJ" 
        and (postag[k + 1][1] == "NN" or postag[k + 1][1] == "NNS")):
      tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

    elif ((postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS") 
          and postag[k + 1][1] == "JJ" 
          and postag[k + 2][1] != "NN" and postag[k + 2][1] != "NNS"):
      tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
      
    elif (postag[k][1] == "JJ" 
          and postag[k + 1][1] == "JJ" 
          and postag[k + 2][1] != "NN" and postag[k + 2][1] != "NNS"):
      tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
      
    elif ((postag[k][1] == "NN" or postag[k][1] == "NNS") 
          and postag[k + 1][1] == "JJ" 
          and postag[k + 2][1] != "NN" and postag[k + 2][1] != "NNS"):
      tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))

    elif ((postag[k][1] == "RB" or postag[k][1] == "RBR" or postag[k][1] == "RBS") 
          and (postag[k + 1][1] in ["VB", "VBD", "VBN", "VBG"])):
      tag_pattern.append("".join(postag[k][0]) + " " + "".join(postag[k + 1][0]))
      
  return tag_pattern

In [17]:
def near_operator(phrase, word, text):
  try:
    string = word + r'\W+(?:\w+\W+){0,400}?' + phrase + r'|' + phrase + r'\W+(?:\w+\W+){0,400}?' + word
    freq_phrase_near_word = (len(re.findall(string, text)))
    return freq_phrase_near_word
  except:
    return 0

In [18]:
class Turney(object):
  def __init__(self, dataset):
    self.datasets = dataset
    self.pos_phrases_hits = []
    self.neg_phrases_hits = []
    self.pos_hits = 0.01
    self.neg_hits = 0.01
    self.accuracy = 0

  def turney(self):
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for boolean, test_klass in enumerate(['pos', 'neg']):
      for i, data in enumerate(self.datasets['test'][test_klass]):
        print(str(i) + " out of " + str(len(self.datasets['test'][test_klass])) + " --> round " + str(boolean))
        phrases = find_pattern(nltk.pos_tag(nltk.word_tokenize(data)))
        if len(phrases) == 0:
          continue
        self.pos_phrases_hits = [0.01] * len(phrases)
        self.neg_phrases_hits = [0.01] * len(phrases)
        self.pos_hits = 0.01
        self.neg_hits = 0.01
        for train_klass in ['pos', 'neg']:
          for text in self.datasets['train'][train_klass]:
            for ind, phrase in enumerate(phrases):
              self.pos_phrases_hits[ind] += near_operator(phrase, "excellent", text)
              self.neg_phrases_hits[ind] += near_operator(phrase, "poor", text)
              self.pos_hits += text.count("excellent")
              self.neg_hits += text.count("poor")
        res = self.calculate_sentiment(boolean)
        # compute if correct prediction
        if res == 1 and boolean == 0:
          fp += 1
        elif res == 1 and boolean == 1:
          tp += 1
        elif res == 0 and boolean == 0:
          fn += 1
        elif res == 0 and boolean == 1:
          tn += 1
    print("Accuracy: " + str(self.accuracy / 100))
    print("True positive: " + str(tp))
    print("False positive: " + str(fp))
    print("True negative: " + str(tn))
    print("False negative: " + str(fn))
    print("Recall-positive: " + str(tp / (tp + fn)))
    print("Precision-positive: " + str(tp / (tp + fp)))
    print("Recall-negative: " + str(tn / (tn + fp)))
    print("Precision-negative: " + str(tn / (tn + fn)))

  def calculate_sentiment(self, is_negative=0):
    polarities = [0] * len(self.pos_phrases_hits)
    for i in range(len(self.pos_phrases_hits)):
      polarities[i] = math.log((self.pos_phrases_hits[i] * self.neg_hits) / (self.neg_phrases_hits[i] * self.pos_hits), 2)
    pmi = sum(polarities) / len(polarities)
    if (pmi > 0 and is_negative == 0) or (pmi < 0 and is_negative == 1):
      self.accuracy += 1
      return 1
    return 0

In [None]:
datasets = make_datasets('./datasets/Cell_Phones_and_Accessories_5.json')
turney = Turney(datasets)
turney.turney()

# Lab 5: Supervised Sentiment Analysis

In [32]:
import pandas as pd

DATA = pd.read_csv('./datasets/full-corpus.csv')
DATA.head()

Unnamed: 0,Topic,Sentiment,TweetId,TweetDate,TweetText
0,apple,positive,126415614616154112,Tue Oct 18 21:53:25 +0000 2011,Now all @Apple has to do is get swype on the i...
1,apple,positive,126404574230740992,Tue Oct 18 21:09:33 +0000 2011,@Apple will be adding more carrier support to ...
2,apple,positive,126402758403305474,Tue Oct 18 21:02:20 +0000 2011,Hilarious @youtube video - guy does a duet wit...
3,apple,positive,126397179614068736,Tue Oct 18 20:40:10 +0000 2011,@RIM you made it too easy for me to switch to ...
4,apple,positive,126395626979196928,Tue Oct 18 20:34:00 +0000 2011,I just realized that the reason I got into twi...


In [34]:
from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(DATA['TweetText'], DATA['Sentiment'], test_size=0.2, random_state=42)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)

In [36]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(train_vectors, train_labels)

In [37]:
predictions = classifier.predict(test_vectors)

In [38]:
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(test_labels, predictions)
report = classification_report(test_labels, predictions)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)

Accuracy: 0.78
Classification Report:
               precision    recall  f1-score   support

  irrelevant       0.93      0.85      0.89       335
    negative       0.63      0.67      0.65       115
     neutral       0.76      0.84      0.80       464
    positive       0.60      0.47      0.53       111

    accuracy                           0.78      1025
   macro avg       0.73      0.71      0.71      1025
weighted avg       0.78      0.78      0.78      1025



# Lab 6: Text Summerization

In [42]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

In [40]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jaynakum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jaynakum/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:
document = """Science and technology often facilitate one another; the latest discoveries in one will lead to new discoveries in the other. Along with innovations in engineering, medicine, and many other fields, this co-evolution can also be seen in physics. The continuing improvements in technology, in particular artificial intelligence (AI) and machine learning (ML), open doors for physics researchers to explore more precise and in-depth topics — leading to new discoveries and a deeper understanding of our world.
With roots in statistical mechanics, the mathematical foundation of AI development is shared with many branches of physics, making the two natural counterparts. Since “physics” is an extremely broad subject area and covers many different fields, each field may utilize AI differently.
This article will briefly explore the uses of AI in a few different fields of physics, namely particle physics, astrophysics, medical physics, and condensed matter physics.
Particle physics dives into the fundamental forces and constituent parts which make up matter and radiation in the universe. In particle physics, artificial intelligence has been implemented to solve many problems. Classification, regression, and anomaly detection are all abilities that AI has that solve problems in particle physics. Artificial intelligence in particle physics is so common that physicists Dr. Matthew Feickert and Dr. Benjamin Nachman have compiled an organized reference page of particle physics research papers and reviews utilizing machine learning.
At the Stanford Linear Accelerator Center, run by Stanford University, physicists are using techniques inspired by computer vision to enhance the tagging and processing of images of streams of particles, called jets, produced as a result of the radioactive decay of heavy particles. The same scientists continued to build on their research and used deep learning tools to identify a charge-carrying particle called a W boson. This particle is one of two responsible for the weak force, one of four fundamental forces which govern the interaction of matter in our universe. These processes utilized tools from facial recognition, Monte Carlo simulations, and many more techniques and tools based in artificial intelligence.
The European Organization for Nuclear Research, more commonly referred to by the acronym CERN, comes from the French name “Conseil Européen pour la Recherche Nucléaire” and is home to some of the newest and most cutting-edge research in the field of particle physics. One of the most famous machines at CERN is the Large Hadron Collider, also known as the LHC — the world’s largest particle accelerator. Every year, CERN stores over 30 petabytes of data from experiments done with the LHC, the equivalent of 250 years of HD video. With this amount of data, storage would not be possible without filtering techniques, which is one of CERN’s major uses for machine learning. These machine learning techniques can also assist in pattern recognition and determining physical conclusions, which led to the discovery of the Higgs Boson in 2012.
Astrophysics covers the physical properties and phenomena behind stellar and astronomical objects in the universe. Similarly to the field of particle physics, astrophysical observations and research produce large quantities of data. The incorporation of AI in astrophysical research is thus heavily centered around data sorting; we can use machine learning algorithms to filter, sort, classify, and identify patterns in data.
Research groups in Chile and the United States have begun using a new classification model which aims to identify and classify variable objects directly from images using a deep-learning tool called a recurrent convolutional neural network (RCNN). A team led by Dr. Carrasco-Davis used real-world datasets to train and test the RCNN classification model. This method eliminates certain required steps for previously used classification techniques with images, such as calculating difference images or light curves.
Astrophysicists in the Netherlands and Belgium have also trained neural networks for their research, but with a different purpose. Hendriks and Aerts have used deep neural networks to model the cores of intermediate- and high-mass stars during the hydrogen-burning phase. This asteroseismological modelling, which studies vibrations in the matter that makes up the stars, has improved from previous techniques in speed and detail.
In the field of medical physics, concepts in physics are applied to diagnosis, treatment, and prevention in health care. This area, like many others in physics, has embraced the development of artificial intelligence over recent years. Using AI in this field can not only improve research, but can also improve procedures and efficiency in healthcare as a whole.
At Brown University, researchers have implemented deep learning techniques in their research to enhance identification of blockages in large blood vessels that could lead to strokes. By training neural networks, the group found that the model was sufficient in detecting blockages in large blood vessels and provided results with near-perfect accuracy.
Additionally, many research groups are using AI to reduce errors in diagnosis by assisting medical professionals in their decision-making. In 2020, a research group introduced an AI system that could detect earlier stages of breast cancer by studying mammogram images. This detection is done the same way a radiologist would look at the images: the model looks for deviations in the mammogram image compared to images with no cancer. Using a computerized system helps to standardize the procedure and reduce the workload for healthcare professionals. Advancements like this are critical to medical physics and health care in general, as many research centers report low numbers of radiologists. AI assistance can reduce workload and increase the quality and quantity of care medical professionals provide.
The field of condensed matter physics explores matter’s large-scale and small-scale properties, most commonly in solid and liquid states. This broad field is highly dominated by quantum mechanics, given the molecular interactions’ scale. Similarly, many modern technologies are also governed by quantum mechanics, as they rely on the fundamental interactions between light and matter and how information is carried on a microscopic level. Naturally, artificial intelligence and condensed matter physics pair well when applied together.
Researchers worldwide have begun using AI to assist in the research of quantum materials, an overarching term for material with properties that cannot be explained with classical or semiclassical physics. Using information and data already known about matter and compounds, AI has been able to examine and project the properties of quantum materials. Using the common framework Density Functional Theory, physicists can produce simulations for materials to determine their properties. Machine learning helps to take multidimensional problems and make them more physically understandable.
Unlike some other areas of physics, the ties between condensed matter physics and AI can be seen much more clearly, as this field of physics has dramatically impacted the advancements of AI. The theories of condensed matter physics have been applied explicitly to machine learning algorithms through the theories which characterize physical systems on various scales. This theoretical framework, called renormalization group, has been useful for analyzing systems with more than one constituent, referred to as a many-body problem. Through these theories, as well as many others, AI has been trained with more sophisticated algorithms to have then the ability to solve more complex problems.
Overall, many advancements in physics would not be possible without the assistance of artificial intelligence. Complex problems require innovative and creative solutions, and the union of physics and artificial intelligence provides the necessary building blocks for uncovering powerful answers. The most difficult situations may still be yet to come, but with continuous breakthroughs in science and technology, researchers will have many tools and techniques at their fingertips."""

In [44]:
sentences = sent_tokenize(document)
term_freq = defaultdict(int)
for sentence in sentences:
  for word in word_tokenize(sentence):
    if word.lower() not in stopwords.words('english'):
      term_freq[word.lower()] += 1

In [51]:
sentence_scores = defaultdict(int)
for sentence in sentences:
  for word in word_tokenize(sentence):
    if word.lower() in term_freq.keys():
      sentence_scores[sentence] += term_freq[word.lower()]
sorted_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)

In [48]:
summary_length = int(len(sentences) * 0.3)
summary = " ".join(sorted_sentences[:summary_length])
print(summary)

This article will briefly explore the uses of AI in a few different fields of physics, namely particle physics, astrophysics, medical physics, and condensed matter physics. At the Stanford Linear Accelerator Center, run by Stanford University, physicists are using techniques inspired by computer vision to enhance the tagging and processing of images of streams of particles, called jets, produced as a result of the radioactive decay of heavy particles. In the field of medical physics, concepts in physics are applied to diagnosis, treatment, and prevention in health care. Unlike some other areas of physics, the ties between condensed matter physics and AI can be seen much more clearly, as this field of physics has dramatically impacted the advancements of AI. The incorporation of AI in astrophysical research is thus heavily centered around data sorting; we can use machine learning algorithms to filter, sort, classify, and identify patterns in data. Along with innovations in engineering, 

# Lab 7: Multi-document Text Summerization

In [2]:
DATA = ['''Unverified reports of '40 babies beheaded' in Israel-Hamas war inflame social media
No photo evidence had been made public as of Thursday morning corroborating claims that babies had been beheaded. Israel has published photos of dead infants after the terror attack.
Editor's note: This story includes graphic descriptions of violent acts that some readers may find disturbing.
A series of shocking reports have spread horrific claims of baby beheadings by Hamas militants across social and mainstream media in recent days, adding a particularly incendiary element to an already violent and bitter war. But the reports are still unconfirmed, and in some cases have been retracted.
The most high-profile claim came Wednesday night when President Joe Biden said that he had seen photographic evidence of terrorists beheading children. The White House later clarified that Biden was referring to news reports about beheadings, which have not included or referred to photographic evidence.
Photos have been published by Hamas showing beheaded soldiers and the X account belonging to Israeli Prime Minister Benjamin Netanyahu posted pictures on Thursday of babies killed and burned by Hamas. No photo evidence had been made public as of Thursday morning corroborating claims that babies had been beheaded.
Unverified information spreads quickly on social media, particularly around breaking news events, reaching even larger audiences when it is shared by mainstream news outlets, politicians and people with large followings. Follow-ups that retract or add context are less likely to be repeated or reach the same audience.
Biden's statement followed a series of news reports and comments from Israeli officials, most of which have since been softened or walked back. Easily debunked misinformation like fake press releases have circulated widely since the start of the war, but such stories often die down quickly once proven false. The claims about beheadings, difficult to verify, have continued to spread thanks in part to the lack of clarity.
Alexei Abrahams, a disinformation researcher at McGill University in Montreal, said that even without the allegations of beheaded babies, "just the facts themselves are horrifying enough to have the kind of effect you expect."
"It may turn out that the slaughter was done in a particularly barbaric way. But one way or another, this is an absolutely shocking, unprecedented event of violence," Abrahams said. "The general concern, of course, is that it's going to exacerbate what is already a very fraught situation."
On Wednesday, a spokesman for Israeli Prime Minister Benjamin Netanyahu told CNN that babies and toddlers were found with their "heads decapitated" in southern Israel after Hamas' attack. By Thursday morning, an Israeli official told CNN the government had not confirmed claims of the beheadings.
A senior State Department official said Thursday morning that the agency was not in a position to confirm the beheading claims.
Many of the reports appear to have originated from Israeli soldiers and people affiliated with the Israel Defense Force (IDF).
An IDF spokesperson told Business Insider on Tuesday that soldiers had found decapitated babies, but said Wednesday it would not investigate or provide further evidence regarding the claim. Late Wednesday, an IDF spokesperson said in a video on X that the IDF had "relative confidence" of the claims.
On Thursday, in a call with a group of international journalists, Colonel Golan Vach, the head of the IDF's national search and rescue unit, said that he had "found one baby with his head cut."
 Marc Owen Jones, an associate professor of Middle East studies at Hamad Bin Khalifa University in Qatar who studies misinformation, told NBC News that he found that the source of the "40 babies beheaded" allegations largely stemmed from a viral Israeli news broadcast clip that did not specifically refer to the allegation.
Nicole Zedeck, a correspondent for the privately owned Israeli news outlet i24NEWS, said in the video that Israeli soldiers told her they'd found "babies, their heads cut off." The video has been viewed more than 11 million times on X, according to its view counter. In another tweet, Zedeck wrote that soldiers told her they believe "40 babies/children were killed."
"Somehow those two bits of information were connected, the story became '40 babies were beheaded,' and in the British press today, about six or seven newspapers had it on their front pages," Jones said.
An IDF spokesperson, Doron Spielman, told NBC News on Tuesday that he could not confirm i24NEWS's report.
Yossi Landau, the head of operations for the southern region of Zaka, Israel's volunteer civilian emergency response organization, told CBS News that he saw the bodies of beheaded children and babies, parents and children who had been tortured and had their hands bound, and "a lot more that cannot be described for now, because it's very hard to describe."
By Wednesday, the claims, though still contentious, were going viral online — being used as evidence of Hamas' depravity. On Wednesday, the phrase "Did Hamas kill babies" saw the biggest increase in search interest on Google of anything related to the war.
"Stranger Things" star Noah Schnapp posted the shocking claim to his 25 million Instagram followers: "40 babies were beheaded and burned alive in front of their parents by Hamas." Sen. Ted Cruz, R-Texas, mentioned beheaded babies in a post on X, and Rep. Mike McCaul, R-Texas, echoed the allegations on CNN.
Jones found that the "40 babies beheaded" claim had over 44 million impressions on X, with over 300,000 likes and more than 100,000 reposts. The main accounts propagating the claims were i24NEWS and the official Israel account, Jones' data showed.
"Baby stories are very emotive. Historically, they're stories that can be used to rationalize a very brutal response," Jones said. "It's such a volatile information environment that such claims will inevitably be taken out of context, both deliberately and accidentally."'''
        ,
        '''The 'horrendous toll' on children caught in the Israel-Gaza conflict
Hundreds of children have been killed so far, with the true total still unclear.
The Israel-Hamas conflict is taking a "horrendous toll" on families, humanitarian organizations like UNICEF decried this week, amid reports of the slaughter and kidnapping of children and attacks on civilian infrastructure that have killed, injured or displaced the most vulnerable.
In the days since Hamas' surprise assault on Israel, images from both regions have shown crying children running through the street and cowering in bomb shelters after airstrikes. In Gaza, the bodies of dead children killed in shelling were covered in blankets and carried by their fathers in funeral processions. In the kibbutz of Be'eri, one of the largest in Israel, more than 100 bodies of Israeli citizens were discovered on Monday, with women, children and the elderly "brutally butchered," the Israel Defense Forces said. Israeli children have also been among those reported kidnapped by Hamas terrorists.
"Nothing justifies the killing, maiming or abducting of children -- grave rights violations which UNICEF wholeheartedly condemns. Yet less than 72 hours after the outbreak of horrific violence in Israel, reports indicate that grave rights violations against children are rampant," UNICEF Executive Director Catherine Russell said in a statement on Monday. "Many children have been killed or injured, while countless others have been exposed to the violence."
According to Palestinian authorities, 900 people have been killed in Gaza so far -- including 260 children and 230 women. The number of children killed in Israel is unclear; at least 900 people have died and 2,600 others have been injured, officials said, though did not specify how many were children. Prime Minister Benjamin Netanyahu said in a speech Monday that among the "atrocities" committed by Hamas, children have been "executed with the rest of their families."
In remarks at the White House Tuesday, President Joe Biden described "Hamas' bloodthirstiness" as reminiscent of ISIS rampages -- including "stomach-churning reports of babies being killed."
At least 100 civilians and soldiers have also been taken hostage by Hamas militants, Israeli officials said. Hamas leaders on Monday threatened to begin killing hostages one by one and filming the executions if their demands are not met.
Among those abducted were 12- and 16-year-old brothers, their mother told ABC News. The woman, who asked not to be identified for security reasons, said the boys were abducted Saturday by Hamas militants who burst into a safe room at their father's home on a kibbutz near the Gaza border.
"I want the world to demand the release of those innocent civilians. I want these children and women and babies back home, and I want my children back home," the mother said. "I can't take a shower without thinking of them being held hostage in some dirty pit somewhere. I can't eat, I can't sleep. I don't think human beings treat people like this. I'm sorry. I want the world to know, to demand those hostages to be returned to their homes."
In response to the assault, Israel on Monday carried out a "complete siege," cutting off power and blocking food and water from being delivered to the Gaza Strip -- where, according to the CIA, nearly 40% of the population of 2 million is under the age of 15.
UNICEF is "extremely alarmed" about those measures, spokesperson James Elder said at a press briefing Tuesday in Geneva.
"This will add another layer of suffering to the existing catastrophe faced by families in Gaza," Elder said. "Depriving children of access to food and essential services puts their lives at risk, as do attacks on civilian areas and infrastructure."
According to UNICEF, 80 of those who live in the Gaza Strip rely on some form of humanitarian assistance.
The conflict has led to "grave humanitarian consequences," Lynn Hastings, a humanitarian coordinator for the Gaza Strip for the United Nations Office for the Coordination of Humanitarian Affairs, while calling for all parties to abide by international humanitarian law.
"Civilians, especially children, medical facilities, humanitarian personnel health workers, and journalists must be protected," Hastings said in a statement Tuesday. "Captured civilians must be released immediately and unconditionally."
UNICEF has also called on all parties to protect children from harm, in accordance with international humanitarian law.
"I remind all parties that in this war, as in all wars, it is children who suffer first and suffer most," Russell said.'''
        ,
        '''What we actually know about the viral report of beheaded babies in Israel
One journalist from the Tel Aviv-based news channel i24 said a soldier told her they had "witnessed… bodies of babies with their heads cut off" at the Kfar Aza kibbutz near the Gaza border - but no Israeli officials have confirmed the claim.
Reports that Israeli soldiers discovered babies that had been beheaded in the Kfar Aza kibbutz are circulating on social and traditional media outlets around the world.
The Israel Defence Forces (IDF) invited foreign journalists to see the aftermath of a massacre by Hamas militants at the kibbutz on Tuesday.
Sky's chief correspondent Stuart Ramsay was among those to go and see "stretcher-bearers bringing out a small child" and a basketball court with "bodies lined up in black body bags".
But in her TV reports, one journalist from the Tel Aviv-based news channel i24 said a soldier had told her they had "witnessed… bodies of babies with their heads cut off".
In a statement to Sky News, the IDF said: "We cannot confirm any numbers. What happened in Kibbutz Kfar Aza is a massacre in which women, children and toddlers and elderly were brutally butchered in an ISIS way of action."
What happened at the kibbutz?
The Kfar Aza kibbutz is one of several self-contained Israeli settlements close to the Gaza border.
It is located between Netivot and Sderot - around three miles from the border in southern Israel.
Because of its proximity to Gaza and the unprecedented nature of last weekend's incursion, which saw Hamas militants breach the usually heavily guarded border on foot - it was one of the first sites they reached on Saturday.
Four days later, journalists got to see the destruction left behind.
Ramsay said the scene "can only be described as a massacre".
"The stories here are shocking - families being woken without warning to voices outside their houses, mums and dads hiding their children in cupboards, wine cellars and basements, husbands and wives becoming separated in the fight," he said.
He added it took 17 hours for help to arrive, as the IDF focused on urban areas first - leaving residents defenceless and numbers of dead high.
Why are there reports of 'babies being beheaded'?
Claims Hamas fighters beheaded babies have only been reported by one journalist - Nicole Zedek from i24 - and have not been verified by Sky News.
Ms Zedek was among the reporters invited to see what was left at the kibbutz on Tuesday.
In one live broadcast, which has since been viewed millions of times on X, formerly known as Twitter, she says: "Talking to some of the soldiers here, they say what they witnessed as they've been walking through these communities is bodies of babies with their heads cut off and families gunned down in their beds.
"We can see some of these soldiers right now, comforting each other."
She is also filmed speaking to the deputy commander of the IDF's unit 71, David Ben Zion, who describes Hamas fighters as "aggressive" and "very bad".
He says: "They cut off heads… of children, of women."
And in another live broadcast, Zedek describes "40 babies at least were taken out on gurneys" - which is where the widely shared 40 figure comes from.
In an interview with Sky's Mark Austin on Tuesday evening, Israeli economy minister Nir Barkat echoed a similar claim: "We've seen just now... we've heard of 40 young boys. Some of them were burned alive. Some were beheaded. Some were shot in the head."
CBS News in the US said on Wednesday that Yossi Landau, head of operations at Zaka, Israel's volunteer civilian emergency response organisation, confirmed to them he had "personally seen" adults, children and babies beheaded.
But when asked directly whether "40 babies were beheaded", an IDF spokesman said children were killed - but that reports of beheadings were "unconfirmed".
It was later reported by at least one major TV news network that the reports of babies being beheaded had been "confirmed" by a spokesperson for the Israeli prime minister's office.
This was subsequently attributed to Tal Heinrich, a freelance news anchor who appears to have been drafted in by Benjamin Netanyahu's office on 8 October to assist with media relations in the wake of Hamas's attacks a day earlier.
The only available public statement on the matter from Ms Heinrich at the time of writing was an interview she conducted with LBC on Wednesday, in which she was asked about the claims.
Ms Heinrich, who was quoted by LBC as a spokesperson for Mr Netanyahu's office, said: "Toddlers, babies, I can tell you some of them... yes, heads were cut off. This is what we are hearing from... soldiers on the ground who dealt with the bodies."
Replying to a later post on X linking to a story citing her comments, she said on Wednesday evening: "Please note: We said that these reports are based on testimonies of soldiers."
'Important to separate facts from speculation'
Ramsay interviewed two IDF majors - one of whom was a spokesman.
Ramsay said: "At no point did either he, or the other major I spoke to, ever mention that Hamas had beheaded or killed 40 babies or children. I believe that if it were the case, they would have told me and others there.
"There is no doubt that a horrific attack took place at Kfar Aza, and it needed to be reported, and we did see the bodies of the dead from the community in their houses, in the back of a truck, and on the basketball court.
"But it's important to separate the facts from speculation in a situation like this.
"To reiterate - the IDF had every opportunity to inform the world's media of any story that had become apparent as the military continue to clear up the kibbutz. The murder and beheading of 40 children was never mentioned to me or my team."
And another journalist, Oren Ziv, who works for independent news outlet 972 mag, was also present and given the opportunity to speak to "hundreds of soldiers on site".
In a post on X, he said of the baby claims: "During the tour we didn't see any evidence of this, and the army spokesperson or commanders also didn't mention any such incidents."
Footage shows how Hamas fighters broke into the kibbutz
Adding to the confusion, the White House was forced into a remarkable climbdown on Wednesday night after President Joe Biden appeared to confirm he had seen pictures of children being beheaded in the Hamas attack.
In a speech to a Jewish community gathering in Washington, which was televised live, he said: "I never really thought that I would see, have confirmed, pictures of terrorists beheading children, I never thought I would ever, anyway..."
However, after Sky News' US partner NBC approached the White House for further details on President Biden's remarks, two senior administration officials said Mr Biden was only referring to several media reports from Israel about beheaded children and had not seen images or had independent confirmation of child beheadings.
He had not in fact seen any images or had independent confirmation of child beheadings.
Digital investigations journalist Victoria Elms, who works on Sky's Data and Forensics unit, adds: "Social media has been awash with misinformation about the situation in Israel and Gaza since the war broke out.
"Videos from the Syrian conflict, excerpts from video games and TikToks made months ago have all been widely shared, falsely claiming to show events from the past few days."
She says misinformation is often shared "unintentionally", but "there are some who post and share false material with the intent to deceive others".
"This is especially dangerous during times of conflict, where it may be even harder than usual to independently verify information or footage," she says.
"As the conflict draws on, we would urge users to be vigilant when consuming online content related to the war.'''
        ,
        '''Israel releases horrific images of slain children after Hamas attack
JERUSALEM / TEL AVIV / BRUSSELS - CONTENT WARNING: This story contains graphic details that may not be suitable for all audiences. Reader discretion is advised.
Israel's government showed U.S. Secretary of State Antony Blinken and NATO defence ministers graphic images of dead children and civilians on Thursday, saying they were killed by Palestinian group Hamas as it builds support for its response.
Prime Minister Benjamin Netanyahu's office also released on social media a picture of a dead infant in a pool of blood and the charred body of a child, part of an apparent effort to stoke global anger against the Gaza militants over Saturday's attack.
Blinken, who flew into Tel Aviv earlier on Thursday, told reporters he was shown photographs and videos of a baby riddled with bullets, soldiers beheaded and young people burned alive in their cars or hideaways.
"It's simply depravity in the worst imaginable way," Blinken told a news briefing. "Images are worth a thousand words. These images may be worth a million."
Netanyahu has vowed to annihilate Hamas following its deadly assault on unsuspecting Israeli communities on Saturday, which killed more than 1,300 people, the deadliest attack on Israel since it was founded in 1948.
The Israeli airforce has launched intense bombing raids on Gaza over the past five days and is massing tens of thousands of troops along the border ahead of a possible ground invasion.
Gaza authorities said more than 1,400 Palestinians, mainly civilians, including children, have already been killed and more than 6,000 wounded. A land invasion in the densely populated territory could send the toll much higher.
Israeli Defence Minister Yoav Gallant played a video to his counterparts at NATO's Brussels headquarters that he said showed horrific scenes from the surprise Hamas attack.
"Children were tied up and shot. Yes, I repeat, children, tied up and shot," he told fellow ministers by video link according to a text of his address sent to Reuters.
'HORRIFIC PICTURES'
In a message on the social media site 'X', Netanyahu's office released what it said were "horrifying photos of babies murdered and burned by the Hamas monsters."
It added: "Hamas is inhuman. Hamas is ISIS," comparing the Palestinian group to the Islamic State, which was notorious for its brutality and gory execution videos.
The images of the dead infants were included in the video played to NATO. It was not released to the public, but was later seen by Reuters in Jerusalem. Reuters could not independently verify the material.
"They were horrific pictures of the attacks and the victims of the attacks," NATO Secretary General Jens Stoltenberg told reporters, saying it "confirmed the brutality of the attacks."
The White House said it had no reason to doubt the authenticity of the images.
Hamas has denied its militants harmed civilians, accusing Israel and the West of spreading false reports to incite violence against Palestinians.
Deputy Hamas chief, Saleh Al-Arouri, said the group's fighters had only aimed to attack the Israeli military and had been surprised by the swift collapse of army units. "The plan was to target the army's Gaza team and fight occupation soldiers only," Arouri said in quotes published by Hamas.
The video shown to NATO, apparently taken from a mix of social media published by Hamas and unidentified phone videos, showed the bodies of scores of dead civilians, as well as the body of an Israeli soldier in uniform with his head missing.
There were no images to suggest militants had beheaded babies -- a particularly explosive accusation that first emerged in Israel's media and initially confirmed by Israeli officials.
U.S. President Joseph Biden had suggested on Wednesday that he had seen images of children beheaded by militants. The White House later clarified that U.S. officials had not seen any evidence of this.
Netanyahu has not repeated a claim by his office earlier this week that Hamas had indeed cut off the heads of children, nor did Gallant repeat that accusation to NATO ministers.
But medics, international human rights organizations and journalists have documented that militants killed women, children and the elderly as well as young men and soldiers in their rampage.
Foreign reporters shown sites targeted by Hamas, witnessed ruins of burnt-out houses and streets scattered with dead residents and militants.
NATO officials said they did not expect the alliance to be directly involved in the conflict. But multiple NATO states, above all the United States, have offered Israel military aid.
U.S. Defense Secretary Lloyd Austin said after the NATO meeting on Thursday that Washington was not placing any conditions on its security assistance to Israel and expected Israel's professional military to 'do the right things.\''''
        ,
        ''''I would see and have confirmed pictures of terrorists beheading children,' Joe Biden decries Hamas atrocity in Israel
Reports suggest up to 40 babies slaughtered by Hamas near Gaza Strip. US President Joe Biden expresses horror at beheading of children by terrorists in Israel.
Biden spoke to Jewish leaders at the White House on Wednesday and said, "I never really thought that I would see and have confirmed pictures of terrorists beheading children. I never thought I'd ever — anyway." He did not take any questions from the reporters, but said he was working to bring home the Americans who were captured by Hamas and taken to Gaza.
"I haven't given up hope of bringing these folks home," he said.
"If I told you, I wouldn't be able to get them home."
According to reports, up to 40 babies were slaughtered in their homes near the Gaza Strip, which is controlled by Hamas.
While a senior White House national security aide stated they hadn't viewed the mentioned images, another White House official cited remarks made by Tal Heinrich, a spokesperson for Israeli Prime Minister Benjamin Netanyahu.
On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with "decapitated" bodies in the community of Kfar Aza.
An Israeli Defense Forces spokesperson also told The Intercept that they could not confirm it officially, but they believed the report.
Yossi Landau, a representative from Israel's volunteer civilian emergency response organization, Zaka, shared with CBS News that he witnessed the gruesome sight of children and infants who had been decapitated.
"I saw a lot more that cannot be described for now, because it's very hard to describe," he said.
Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial source to report the allegations of child beheadings on Tuesday. In a radio interview on Wednesday, she expressed her dismay at the initial public skepticism.
She stated, "I mean, babies' heads cut off. That's what they encountered when they came there. So as horrible as it is and and I wish that I it wasn't true."
The IDF also shared a disturbing image online on Wednesday that showed a blood-stained mattress of an Israeli child with blood spatter on the wall and the footboard.
Since Hamas' shocking attack on Israel, the US President has refrained from taking press questions in various settings. He is encountering mounting pressure from members of both parties in Congress to reconsider his decision to release $6 billion to Iran, amid reports that the funds may have been connected to the planning of the attack.'''
        ,
        '''At least 40 babies killed, beheaded in Israeli kibbutz outside Gaza Strip, reports say
KFAR AZA, Israel (TND) — Dozens of babies were reportedly found dead, including some that had been beheaded, in an Israeli kibbutz Tuesday after the terrorist organization Hamas stormed the community.
Several journalists were let in to the Kfar Aza kibbutz, located just outside the Gaza Strip, to see the aftermath of the attacks by Hamas. At least 70 residents of the kibbutz were killed by Hamas terrorists, according to Indian news website OpIndia.
Nicole Zedeck, a correspondent for Israeli television channel i24NEWS, described the scene as "truly horrific."
"No one could expect that it would be like this, the horrors that I'm hearing from these soldiers," Zedeck said. "As I mentioned earlier, about 40 babies, at least, were taken out on gurneys ... you continue to see just cribs overturned, strollers left behind, all of these doors left wide open."
Several of the infants were also beheaded by Hamas terrorists, according to OpIndia.
Zedeck went on to say that an official death count at the kibbutz is still unknown because soldiers are "still collecting dead bodies."
A kibbutz is a small Israeli agricultural community. Kibbutz are dotted throughout Israel, primarily in the Negev Desert.
Israel Defense Forces Major General Itai Veruv described the scene in Kfar Aza as a "massacre" Tuesday, calling it unlike something Israel has witnessed in "recent history."
"It's not a war, it's not a battlefield, it's a massacre," Veruv told The Times of Israel. "You see the babies, their mothers and their fathers, in their bedrooms, and in their protected rooms, and how the terrorists killed them ... It's something that I never saw in my life."
The murders at Kfar Aza represent just a fraction of the death and destruction caused by Hamas terrorists. Videos reviewed Monday afternoon show that at least four Israeli citizens were killed shortly after being taken hostage by the terrorist group.
An Israeli family of five was reportedly killed by Hamas terrorists during the invasion. The family, which included three children under 7, was discovered dead after the terrorist group infiltrated their bunker.
American families have begun to plead with the Biden administration for assistance finding their missing loved ones in Israel. At least 11 U.S. citizens were determined to have been killed in Israel as of Monday.'''
        ,
        ''''I would see and have confirmed pictures of terrorists beheading children,' Joe Biden decries Hamas atrocity in Israel
Reports suggest up to 40 babies slaughtered by Hamas near Gaza Strip. US President Joe Biden expresses horror at beheading of children by terrorists in Israel.
Biden spoke to Jewish leaders at the White House on Wednesday and said, "I never really thought that I would see and have confirmed pictures of terrorists beheading children. I never thought I'd ever — anyway." He did not take any questions from the reporters, but said he was working to bring home the Americans who were captured by Hamas and taken to Gaza.
"I haven't given up hope of bringing these folks home," he said.
"If I told you, I wouldn't be able to get them home."
According to reports, up to 40 babies were slaughtered in their homes near the Gaza Strip, which is controlled by Hamas.
While a senior White House national security aide stated they hadn't viewed the mentioned images, another White House official cited remarks made by Tal Heinrich, a spokesperson for Israeli Prime Minister Benjamin Netanyahu.
On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with "decapitated" bodies in the community of Kfar Aza.
An Israeli Defense Forces spokesperson also told The Intercept that they could not confirm it officially, but they believed the report.
Yossi Landau, a representative from Israel's volunteer civilian emergency response organization, Zaka, shared with CBS News that he witnessed the gruesome sight of children and infants who had been decapitated.
"I saw a lot more that cannot be described for now, because it's very hard to describe," he said.
Nicole Zedek, a television reporter for i24NEWS based in Tel Aviv, was the initial source to report the allegations of child beheadings on Tuesday. In a radio interview on Wednesday, she expressed her dismay at the initial public skepticism.
She stated, "I mean, babies' heads cut off. That's what they encountered when they came there. So as horrible as it is and and I wish that I it wasn't true."
The IDF also shared a disturbing image online on Wednesday that showed a blood-stained mattress of an Israeli child with blood spatter on the wall and the footboard.
Since Hamas' shocking attack on Israel, the US President has refrained from taking press questions in various settings. He is encountering mounting pressure from members of both parties in Congress to reconsider his decision to release $6 billion to Iran, amid reports that the funds may have been connected to the planning of the attack.'''
        ]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
from sklearn.metrics.pairwise import linear_kernel

In [13]:
tfidf_matrix = tfidf_vectorizer.fit_transform(DATA)

In [10]:
# Extract the most representative sentence from each article
summaries = []
for idx, article in enumerate(DATA):
    # Tokenize the article into sentences
    sentences = article.split('. ')

    # Compute the TF-IDF vectors for the sentences
    sentence_vectors = tfidf_vectorizer.transform(sentences)

    # Compute cosine similarities between the article and its sentences
    cosine_similarities_sentences = linear_kernel(tfidf_matrix[idx:idx+1], sentence_vectors).flatten()

    # Find the sentence with the highest cosine similarity
    top_sentence_idx = cosine_similarities_sentences.argsort()[-1]
    top_sentence = sentences[top_sentence_idx]

    summaries.append(top_sentence)
merged_summary = '. '.join(summaries)

In [14]:
tfidf_vector_merged_summary = tfidf_vectorizer.transform([merged_summary])

In [11]:
sentences_merged_summary = merged_summary.split('. ')

sentence_vectors_merged_summary = tfidf_vectorizer.transform(sentences_merged_summary)

cosine_similarities_merged_summary = linear_kernel(tfidf_vector_merged_summary, sentence_vectors_merged_summary).flatten()

top_sentence_idx_merged = cosine_similarities_merged_summary.argsort()[-1]
top_sentence_merged = sentences_merged_summary[top_sentence_idx_merged]

print(top_sentence_merged)

I never thought I'd ever — anyway." He did not take any questions from the reporters, but said he was working to bring home the Americans who were captured by Hamas and taken to Gaza.
"I haven't given up hope of bringing these folks home," he said.
"If I told you, I wouldn't be able to get them home."
According to reports, up to 40 babies were slaughtered in their homes near the Gaza Strip, which is controlled by Hamas.
While a senior White House national security aide stated they hadn't viewed the mentioned images, another White House official cited remarks made by Tal Heinrich, a spokesperson for Israeli Prime Minister Benjamin Netanyahu.
On Wednesday, Heinrich quoted CNN that infants and young children had been discovered with "decapitated" bodies in the community of Kfar Aza.
An Israeli Defense Forces spokesperson also told The Intercept that they could not confirm it officially, but they believed the report.
Yossi Landau, a representative from Israel's volunteer civilian emergency

# Lab 8: Term Incidence Matrix

In [15]:
DOCS = ["I am a cow", "cow is what I am", "today is tuesday"]

In [24]:
words = set([word for doc in DOCS for word in nltk.word_tokenize(doc)])
words = list(words)
words.sort()
words

['I', 'a', 'am', 'cow', 'is', 'today', 'tuesday', 'what']

In [27]:
ti_matrix = {}
for doc in DOCS:
  row = []
  for word in words:
    if (word in nltk.word_tokenize(doc)):
      row.append(True)
    else:
      row.append(False)
  ti_matrix[doc] = row
ti_matrix

{'I am a cow': [True, True, True, True, False, False, False, False],
 'cow is what I am': [True, False, True, True, True, False, False, True],
 'today is tuesday': [False, False, False, False, True, True, True, False]}