<a href="https://colab.research.google.com/github/JehadOumer/IMDB-Reviews-Classification/blob/main/DataWrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import ast
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
import tensorflow_datasets as tfds
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the NLTK stopword corpus; NLTK reports it as up-to-date when it is already present.
nltk.download('stopwords')
# NOTE: this rebinds `stopwords` from the imported NLTK corpus reader to a plain
# set of English stopwords, so `stopwords.words(...)` cannot be called after this line.
stopwords = set(stopwords.words('english'))
# Load the spaCy pipeline named "en_core_web_sm"; it is passed to the
# preprocessing functions below as their `processor` argument.
processor= spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Fetching the dataset from TensorFlow Datasets and splitting it into training, validation, and testing sets

In [None]:
# Split layout, in order (training, validation, testing):
#   training   -> the whole 'train' split plus 'test' examples [0, 5000)
#   validation -> 'test' examples [5000, 15000)
#   testing    -> 'test' examples from 15000 to the end
# Labels are integers: 0 marks a negative review and 1 a positive review.
imdb_splits = ('train+test[0:5000]', 'test[5000:15000]', 'test[15000:]')
(training_data, validation_data, testing_data), ds_info = tfds.load(
    name="imdb_reviews",
    split=imdb_splits,
    as_supervised=True,
    with_info=True,
)

In [None]:
# Convert every split from a TFDS dataset into a pandas DataFrame.
# The tuple of source datasets is evaluated before any name is rebound, so each
# conversion still receives the original dataset object.
training_data, validation_data, testing_data = (
    tfds.as_dataframe(split_dataset, ds_info)
    for split_dataset in (training_data, validation_data, testing_data)
)

# Single frame holding every split (order: training, testing, validation);
# each split keeps its own index labels.
all_data = pd.concat([training_data, testing_data, validation_data])

## SpaCy for NER and Lemmatization

In [None]:

def spacy_processor(review, processor):
  """Lemmatize a review and collapse every named entity into its label.

  The output is built token by token from the entity annotations on the
  tokens themselves. Substituting entity text with a regex on the
  lemma-joined string misses any entity whose surface form changes under
  lemmatization or tokenization (e.g. "nineties" -> "ninety",
  "Walken's" -> "Walken 's"), and it also rewrites unrelated occurrences of
  the same text that were never tagged as entities.

  Args:
    review: Text to process.
    processor: Callable that takes the text and returns a spaCy-style doc,
      i.e. an iterable of tokens exposing `lemma_`, `ent_type_` and
      `ent_iob_`.

  Returns:
    A space-joined string in which every non-entity token is replaced by its
    lemma and every entity span (however many tokens long) is replaced by a
    single occurrence of its entity label, e.g. "PERSON" or "GPE".
  """
  doc = processor(review)

  pieces = []
  inside_entity = False
  for token in doc:
    label = token.ent_type_
    if not label:
      # Ordinary token: keep its lemma.
      pieces.append(token.lemma_)
      inside_entity = False
      continue
    # Emit the label once per entity span: at its first ("B") token, or
    # defensively when an "I" token is not preceded by any entity token.
    if token.ent_iob_ == 'B' or not inside_entity:
      pieces.append(label)
    inside_entity = True

  return ' '.join(pieces)


## Preprocessing function

In [None]:

def _decode_bytes_review(review):
    """Return the decoded text when `review` is bytes or the repr of a bytes literal, else None."""
    if isinstance(review, (bytes, bytearray)):
        return bytes(review).decode('utf-8', errors='replace')

    candidate = str(review).strip()
    if not candidate.startswith(("b'", 'b"')):
        return None
    try:
        # literal_eval only parses literals, so it is safe on arbitrary text.
        value = ast.literal_eval(candidate)
    except (ValueError, SyntaxError):
        return None
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    return None


def preprocess_review(review, processor):
    """Normalize a raw review into lowercase, space-separated alphanumeric tokens.

    Reviews are commonly passed in as `str(bytes_value)`, i.e. text of the
    form "b'...'" in which newlines and non-ASCII bytes appear as escape
    sequences (`\\n`, `\\xc3\\xa9`). Such input is parsed back into the real
    bytes and decoded as UTF-8, so escapes become the characters they stand
    for instead of leaking letters like "n" or "xc3xa9" into the output once
    backslashes are removed. Raw `bytes` input is accepted as well. Any other
    string goes through the original clean-up unchanged.

    Args:
        review: The review as `str` (plain text or the repr of a bytes
            literal) or as `bytes`.
        processor: spaCy pipeline forwarded to `spacy_processor`.

    Returns:
        The cleaned review: line breaks and `<br />` tags turned into
        spaces, backslashes dropped, text lemmatized and entity-labelled by
        `spacy_processor`, every non-alphanumeric character replaced by a
        space, runs of spaces collapsed, and the result lowercased and
        stripped.
    """
    decoded = _decode_bytes_review(review)
    temp = (review if decoded is None else decoded).strip()
    temp = temp.replace('\n', ' ')
    temp = temp.replace('<br />', ' ')
    temp = temp.replace('\\', '')
    if decoded is None:
        # Fallback for text that could not be parsed as a bytes literal:
        # strip a leading b'/b" marker and a trailing quote as before.
        temp = re.sub(r'^(b\'|b\")', '', temp)
        temp = re.sub(r'(\'|\")$', '', temp)
    temp = str(spacy_processor(temp, processor))
    temp = re.sub(r'[^a-zA-Z0-9]', ' ', temp)
    temp = re.sub(' +', ' ', temp)
    temp = temp.lower().strip()

    return temp



    

In [None]:
# Demonstration: print the first five raw reviews, each followed by its
# preprocessed form and a blank separator line.
for i in range(5):
  raw_review = training_data["text"][i]
  cleaned_review = preprocess_review(str(raw_review), processor)
  print(raw_review)
  print(cleaned_review)
  print("")


b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."
this be an absolutely terrible movie do not be lure in by person or person both be great actor but this must simply be pron bad role in history even pron great acting could not redeem this movie s ridiculous storyline this movie be an early ninety gpe propaganda piece the most pathetic 

### Saving preprocessed Dataset as .CSV files

In [None]:
# Preprocess the 'text' column of each split in place, then write that split to
# its CSV file. Splits are handled one after another in the order
# training -> validation -> testing.
split_outputs = (
    (training_data, '/processed_training_data.csv'),
    (validation_data, '/processed_validation_data.csv'),
    (testing_data, '/processed_testing_data.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame['text'] = split_frame['text'].map(lambda x: preprocess_review(str(x), processor))
    split_frame.to_csv(csv_path)

In [None]:
# Stack the three processed splits (training, validation, testing) into one
# frame and write it out as a single CSV; each split keeps its own index labels.
processed_splits = [training_data, validation_data, testing_data]
whole_processed_data = pd.concat(processed_splits)
whole_processed_data.to_csv('/whole_processed_data.csv')

Link to the processed datasets:
[Google Drive Folder](https://drive.google.com/drive/folders/1eXCogA-lD5z-rx-NJYMGEsUxZOLqr1av?usp=sharing)