In [2]:
import pandas as pd
from my_functions import *
from collections import Counter

import nltk
from nltk import FreqDist
from nltk.corpus import stopwords

# Load the raw IMDB reviews dataset.
df_train = pd.read_csv('D:\\Tensorflow\\imdb\\IMDB_Dataset.csv')

# Drop every row that contains a missing value.
df_train = df_train.dropna()

# Verify the drop worked. A bare `df_train.isnull().any()` in the middle of a
# cell is evaluated and thrown away (only a cell's last expression is
# displayed), so turn the check into an assertion that can actually fail.
assert not df_train.isnull().any().any(), "unexpected nulls after dropna()"

# Label encoding: 'negative' -> 0 and 'positive' -> 1.
mapping = {"negative": 0, "positive": 1}

# Fail loudly on any label outside the mapping instead of letting it slip
# through the encoding step unnoticed.
unexpected_labels = set(df_train['sentiment'].unique()) - set(mapping)
if unexpected_labels:
    raise ValueError(f"Unmapped sentiment labels: {unexpected_labels!r}")

# Encode with `map` rather than `replace`: replacing strings with ints relies
# on implicit object -> int downcasting, which pandas 2.2 deprecates
# (FutureWarning). `map` produces the integer column directly.
df_train['sentiment'] = df_train['sentiment'].map(mapping)

print("Train Dataset:", len(df_train))

# Create 'clean_review' as a lower-cased string copy of the raw review text;
# the original 'review' column is left untouched for later comparison.
df_train['clean_review'] = df_train['review'].astype('str').str.lower()
df_train

Train Dataset: 50000


Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,1,one of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,a wonderful little production. <br /><br />the...
2,I thought this was a wonderful way to spend ti...,1,i thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"petter mattei's ""love in the time of money"" is..."
...,...,...,...
49995,I thought this movie did a down right good job...,1,i thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...,0,i am a catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,0,i'm going to have to disagree with the previou...


In [4]:
# Text-cleaning stages, applied to every review in exactly this order.
# Each stage is a helper that takes one string and returns the cleaned string.
# NOTE(review): contractions are expanded *after* special characters are
# removed; if remove_special_characters strips apostrophes, expand_contractions
# may have nothing left to match — confirm against the helper implementations.
cleaning_steps = (
    delete_newlines_and_tabs,
    strip_html_tags,
    remove_links,
    remove_dates_and_numbers,
    remove_special_characters,
    reduce_character_repeatation,
    remove_accented_characters,
    remove_whitespace,
    expand_contractions,
    remove_stopwords,
    remove_dots_commas,
)

# Feed the output of each stage into the next one.
for clean_step in cleaning_steps:
    df_train['clean_review'] = df_train['clean_review'].apply(clean_step)


df_train



Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stunnin...
...,...,...,...
49995,I thought this movie did a down right good job...,1,thought movie right good job creative original...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,catholic taught parochial elementary schools n...
49998,I'm going to have to disagree with the previou...,0,going disagree previous comment side maltin on...


In [6]:
# Fetch the NLTK stopwords corpus (reported as up-to-date when already present).
# NOTE(review): this cell itself only tokenizes and counts words; the stopword
# removal step runs in an earlier cell — consider moving this download up.
nltk.download('stopwords')

# Count how often each token occurs across the whole cleaned corpus.
# NOTE(review): `word_tokenize` is not imported explicitly in this notebook;
# presumably it arrives via `from my_functions import *` — confirm.
tokenized_words = word_tokenize(' '.join(df_train['clean_review']))
fdist = FreqDist(tokenized_words)

# Tokens that occur this many times or fewer are treated as uncommon.
threshold = 6

# Collect the uncommon tokens in a set for fast membership tests.
uncommon_words = {word for word, freq in fdist.items() if freq <= threshold}


def _drop_uncommon_words(text):
    """Re-tokenize `text` and re-join it without the tokens in `uncommon_words`."""
    kept_tokens = [token for token in word_tokenize(text) if token not in uncommon_words]
    return ' '.join(kept_tokens)


# Strip the uncommon tokens from every cleaned review.
df_train['clean_review'] = df_train['clean_review'].apply(_drop_uncommon_words)

# Show the updated DataFrame
df_train

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nasra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching oz episode ho...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy jake thinks zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,mattei love time money visually stunning film ...
...,...,...,...
49995,I thought this movie did a down right good job...,1,thought movie right good job creative original...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,catholic taught elementary schools nuns taught...
49998,I'm going to have to disagree with the previou...,0,going disagree previous comment side maltin on...


In [10]:
# Print the first ten raw/clean review pairs for a visual sanity check.
# Rows are taken by position via head(): the frame went through dropna()
# without an index reset, so label lookups such as df_train['review'][i] can
# raise KeyError when one of the labels 0..9 was dropped. head() also copes
# with frames shorter than ten rows.
sample_rows = df_train.head(10)
for raw_text, clean_text in zip(sample_rows['review'], sample_rows['clean_review']):
    print('Raw Text:', raw_text)
    print()
    print('Clean Text:', clean_text)
    print('-----------')
    print()

Raw Text: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due 

In [11]:
# Replace the raw 'review' column with the cleaned text, keeping the name 'review'.
# The guard makes the cell safe to re-run: once 'clean_review' has been renamed,
# an unguarded second run would drop the *cleaned* 'review' column and lose it.
if 'clean_review' in df_train.columns:
    df_train = (
        df_train
        .drop(columns=['review'])
        .rename(columns={"clean_review": "review"})
    )
df_train

Unnamed: 0,sentiment,review
0,1,one reviewers mentioned watching oz episode ho...
1,1,wonderful little production filming technique ...
2,1,thought wonderful way spend time hot summer we...
3,0,basically family little boy jake thinks zombie...
4,1,mattei love time money visually stunning film ...
...,...,...
49995,1,thought movie right good job creative original...
49996,0,bad plot bad dialogue bad acting idiotic direc...
49997,0,catholic taught elementary schools nuns taught...
49998,0,going disagree previous comment side maltin on...


In [12]:
# Write the cleaned dataset to disk; index=False omits the row index from the file.
clean_csv_path = 'D:\\Tensorflow\\imdb\\imdb_train_clean.csv'
df_train.to_csv(clean_csv_path, index=False)