<a href="https://colab.research.google.com/github/JuneC7020/NLP_Projects/blob/main/a35338mc_Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from collections import Counter

import nltk, re, pprint
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.cluster import GAAClusterer

In [2]:
from nltk import download

# Fetch the NLTK data packages used by this notebook.
# NOTE(review): the package-to-feature mapping below is the usual one for
# NLTK; confirm against the installed NLTK version.
download("stopwords")  # presumably backs nltk.corpus.stopwords
download("wordnet")  # presumably backs WordNetLemmatizer / nltk.corpus.wordnet
download("punkt")  # presumably backs word_tokenize
download("omw-1.4")  # presumably a companion resource for wordnet
download("averaged_perceptron_tagger")  # presumably backs pos_tag

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Connect Google Drive at /content/drive so the training-data file can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
import numpy as np
import pandas as pd # for reading csv files
import string

In [5]:
def wordnet_tagger(nltk_tag):
    """
    Convert an NLTK POS tag into the equivalent WordNet POS constant.

    The result is meant to be passed to the WordNet lemmatizer as its
    part-of-speech argument.

    Parameters
    ----------
    nltk_tag : str or None
        POS tag of a token (e.g. "JJ", "VBD", "NNS", "RB"), or None.

    Returns
    -------
    wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV
        When the tag starts with "J", "V", "N" or "R" respectively.
    None
        When the tag is None or starts with any other character.
    """
    # Identity check: `== None` would defer to the operand's __eq__.
    if nltk_tag is None:
        return None

    if nltk_tag.startswith("J"):
        return wordnet.ADJ
    if nltk_tag.startswith("V"):
        return wordnet.VERB
    if nltk_tag.startswith("N"):
        return wordnet.NOUN
    if nltk_tag.startswith("R"):
        return wordnet.ADV

    # Every other tag (determiners, punctuation, ...) has no WordNet counterpart.
    return None

In [25]:
# Translation table that deletes punctuation and other non-useful characters
# from a token (use with `str.translate`).
# The deletion set is `string.punctuation` plus the space and newline: the
# previous hand-typed triple-quoted literal also contained those two through
# its own indentation, so they are kept for identical behaviour. Building the
# set from `string.punctuation` avoids the invalid `\]` escape sequence that
# the literal relied on.
punc_to_empty_table = str.maketrans("", "", string.punctuation + " \n")

In [6]:
# Folder on the mounted Drive that holds the dataset files.
FILE_PATH = "/content/drive/MyDrive/COMP34711_NLP/NLP_CW/NLP_dataset"

# Load the training CSV from that folder into a DataFrame.
tr_data = pd.read_csv(
    FILE_PATH + '/Training-dataset.csv'
)

In [7]:
tr_data.head() # preview the first rows to check that the file was read as expected

Unnamed: 0,ID,title,plot_synopsis,comedy,cult,flashback,historical,murder,revenge,romantic,scifi,violence
0,8f5203de-b2f8-4c0c-b0c1-835ba92422e9,Si wang ta,"After a recent amount of challenges, Billy Lo ...",0,0,0,0,1,1,0,0,1
1,6416fe15-6f8a-41d4-8a78-3e8f120781c7,Shattered Vengeance,"In the crime-ridden city of Tremont, renowned ...",0,0,0,0,1,1,1,0,1
2,4979fe9a-0518-41cc-b85f-f364c91053ca,L'esorciccio,Lankester Merrin is a veteran Catholic priest ...,0,1,0,0,0,0,0,0,0
3,b672850b-a1d9-44ed-9cff-025ee8b61e6f,Serendipity Through Seasons,"""Serendipity Through Seasons"" is a heartwarmin...",0,0,0,0,0,0,1,0,0
4,b4d8e8cc-a53e-48f8-be6a-6432b928a56d,The Liability,"Young and naive 19-year-old slacker, Adam (Jac...",0,0,1,0,0,0,0,0,0


In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
tr_data.info()

In [9]:
# Columns used downstream: the movie IDs and the plot synopses.
tr_ids, tr_plots = tr_data["ID"], tr_data["plot_synopsis"]

In [26]:
### About the data inside the plot synopsis ###

# @
# @ is found only in two cases: an email, a name of something

# IMPORTANT!!
# ()
# There are a lot of brackets, mostly containing actors' names or a description of the story.
# Since the name of the actor is not related to the synopsis itself,
# it could be better to remove them or store them elsewhere

#;
# many uses of ; for further explanation

#'
# uses of 've, 'm, 's, n't
# they are all stopwords, and since sentiment is not the key point of the genre of the movies, it is better to remove them

#numbers
# not related to task so removing it

In [12]:
# English stop words only. Calling stopwords.words() with no argument returns
# the stop words of every language in the corpus, which would also filter out
# English content words that happen to be stop words in another language.
stop_words = set(stopwords.words("english"))

# Shared lemmatizer instance reused for every token.
w_lemmatizer = WordNetLemmatizer()

In [19]:
# Raw tokenization of every synopsis (no cleaning yet), one token list per plot.
tokens = [word_tokenize(synopsis) for synopsis in tr_plots]

In [21]:
# Inspect the raw token lists of the first 20 plots.
print(tokens[:20])



In [27]:
### Tokenization and Text Preprocessing ###

# Contraction fragments split off by the tokenizer (e.g. "it's" -> "it", "'s").
# They are dropped before punctuation stripping; otherwise "'s" would lose its
# apostrophe and survive as the bare token "s".
CONTRACTION_TOKENS = frozenset({"'s", "'m", "'re", "'ve", "'ll", "'d", "n't"})


def preprocess_plot(plot):
    """
    Turn one plot synopsis into a list of cleaned, lemmatized tokens.

    Steps: tokenize -> POS-tag -> lemmatize (with the POS tag when it maps to
    a WordNet tag) -> drop stop words and contraction fragments -> strip
    punctuation -> drop empty and purely numeric tokens.

    Parameters
    ----------
    plot : str
        Raw synopsis text. Any non-string value (e.g. a missing CSV cell)
        yields an empty token list instead of raising.

    Returns
    -------
    list of str
        Cleaned tokens in their original order, keeping their letter case.
    """
    if not isinstance(plot, str):
        return []

    cleaned_tokens = []
    for word, nltk_tag in pos_tag(word_tokenize(plot)):
        wn_tag = wordnet_tagger(nltk_tag)
        lemma = word if wn_tag is None else w_lemmatizer.lemmatize(word, wn_tag)

        # Compare in lowercase so capitalised stop words ("The", "He", "In",
        # ...) are removed as well, not only their lowercase forms.
        lowered = lemma.lower()
        if lowered in stop_words or lowered in CONTRACTION_TOKENS:
            continue

        token = lemma.translate(punc_to_empty_table)

        # Stripping punctuation can leave nothing, a bare number, or a stop
        # word; none of these are useful.
        if not token or token.isnumeric() or token.lower() in stop_words:
            continue

        cleaned_tokens.append(token)

    return cleaned_tokens


# One cleaned token list per synopsis, in the same order as tr_plots.
processed_plots = [preprocess_plot(plot) for plot in tr_plots]

In [28]:
# Count how often each word occurs across all processed plots.
# Counter does the tallying; dict() keeps w_counts a plain dict (keys in
# first-seen order, KeyError on unknown words), exactly as a manual tally would.
w_counts = dict(Counter(word for plot in processed_plots for word in plot))

# Example from an earlier run (exact values depend on the preprocessing):
# w_counts = {'After': 7896, 'recent': 376, 'amount': 362, 'challenge': 736, 'Billy': 1550, 'Lo': 128....}

In [34]:
# Order the [word, count] pairs from the most to the least frequent word.
sorted_w_counts = [[word, count] for word, count in w_counts.items()]
sorted_w_counts.sort(key=lambda pair: pair[1], reverse=True)

# Example from an earlier run:
# sorted_w_counts = [['s', 85099], ['The', 39875], ['He', 26012], ['find', 21139], ['kill', 17487], ['leave', 16712],..]

In [35]:
# Collect the 50 most frequent words together with their counts.
# Words that read the same backwards (palindromes, including single-character
# tokens) are skipped -- presumably because the words are reversed later to
# build pseudo-words, and a palindrome would be unchanged by that.
TOP_N_WORDS = 50

top_50_words_counts = []
for word, count in sorted_w_counts:
    # Stop at TOP_N_WORDS; iterating (rather than indexing) also ends cleanly
    # when the vocabulary holds fewer suitable words, instead of raising
    # IndexError.
    if len(top_50_words_counts) >= TOP_N_WORDS:
        break
    if word[::-1] != word:
        top_50_words_counts.append((word, count))

# Example from an earlier run:
# top_50_words_counts = ('give', 9572), ('time', 9310), ('In', 9286), ('When', 9205), ('return', 9058),...

In [38]:
# Keep only the words of the top-50 (word, count) pairs, dropping the counts.
top_50_words = [word for word, _count in top_50_words_counts]

# Example from an earlier run:
# top_50_words = ['The', 'He', 'find', 'kill', 'leave', 'She', 'back', ..

In [37]:
# Pseudo (made-up) words: each top-50 word spelled backwards.
reverse_top_50_words = [word[::-1] for word in top_50_words]

# Example from an earlier run:
#  reverse_top_50_words = ['ehT', 'eH', 'dnif', 'llik', 'evael', 'ehS', 'kcab', 'ekam', 'yehT' ..