In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [2]:
ds = load_dataset('imdb')

In [3]:
ds_train = pd.DataFrame(ds['train'])
ds_test = pd.DataFrame(ds['test'])

In [4]:
ds_train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [5]:
ds_test.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [6]:
ds_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [7]:
ds_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [8]:
ds_train.describe()

Unnamed: 0,label
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [9]:
ds_test.describe()

Unnamed: 0,label
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [10]:
ds_train['label'].sum()

12500

In [11]:
ds_test['label'].sum()

12500

In [12]:
lengths_train = [len(text) for text in ds_train['text']]
average_train = float(sum(lengths_train)) / len(lengths_train)
max_train = max(lengths_train)
min_train = min(lengths_train)

print('Train Set:\nAverage length of review:', average_train, '\nMax length:', max_train, '\nMinimum length:', min_train)

Train Set:
Average length of review: 1325.06964 
Max length: 13704 
Minimum length: 52


In [13]:
lengths_test = [len(text) for text in ds_test['text']]
average_test = float(sum(lengths_test)) / len(lengths_test)
max_test = max(lengths_test)
min_test = min(lengths_test)

print('Test Set:\nAverage length of review:', average_test, '\nMax length:', max_test, '\nMinimum length:', min_test)

Test Set:
Average length of review: 1293.7924 
Max length: 12988 
Minimum length: 32


#### Removing Punctuation

In [14]:
def remove_punctuation(review):
    cleaned_text = review.translate(str.maketrans(' ',' ',string.punctuation))
    return cleaned_text

In [15]:
ds_train['text'] = ds_train['text'].apply(lambda x: remove_punctuation(x))
ds_test['text'] = ds_test['text'].apply(lambda x: remove_punctuation(x))

#### Tokenization

In [16]:
def tokenize(text):
    tokens = text.lower().split()
    return tokens

In [17]:
ds_train['text'] = ds_train['text'].apply(lambda x: tokenize(x))
ds_test['text'] = ds_test['text'].apply(lambda x: tokenize(x))

#### Stop Word Removal

In [18]:
stop_words = stopwords.words('english')

In [19]:
def remove_stopwords(text): 
    filtered_text = [word for word in text if word not in stop_words]
    return filtered_text

In [20]:
ds_train['text'] = ds_train['text'].apply(lambda x: remove_stopwords(x))
ds_test['text'] = ds_test['text'].apply(lambda x: remove_stopwords(x))

In [21]:
ds_train

Unnamed: 0,text,label
0,"[rented, curiousyellow, video, store, controve...",0
1,"[curious, yellow, risible, pretentious, steami...",0
2,"[avoid, making, type, film, future, film, inte...",0
3,"[film, probably, inspired, godards, masculin, ...",0
4,"[oh, brotherafter, hearing, ridiculous, film, ...",0
...,...,...
24995,"[hit, time, better, categorised, australian, c...",1
24996,"[love, movie, like, another, time, try, explai...",1
24997,"[film, sequel, barry, mckenzie, holds, two, gr...",1
24998,"[adventures, barry, mckenzie, started, life, s...",1


#### Lemmatization

In [22]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, 'v') for word in text]
    return lemmatized_words

In [23]:
ds_train['text'] = ds_train['text'].apply(lambda x: lemmatize_text(x))
ds_test['text'] = ds_test['text'].apply(lambda x: lemmatize_text(x))

In [24]:
ds_train['text'] = ds_train['text'].str.join(' ')
ds_test['text'] = ds_test['text'].str.join(' ')

In [25]:
vectorizer = TfidfVectorizer(strip_accents='unicode')

X_train = vectorizer.fit_transform(ds_train['text'])
X_test = vectorizer.transform(ds_test['text'])

y_train = ds_train['label']
y_test = ds_test['label']

In [26]:
model = xgb.XGBClassifier()

In [27]:
model.fit(X_train, y_train)

In [28]:
pred = model.predict(X_test)

In [29]:
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, pred)

In [30]:
print('Score:', score)

Score: 0.85316
