In [14]:
import pandas as pd       
from bs4 import BeautifulSoup  
import re
import nltk
nltk.download("stopwords") 
from nltk.corpus import stopwords
from tqdm import tqdm
tqdm.pandas()

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from IPython.display import FileLink

from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package stopwords to /home/docker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def describe(df):
    print(df.shape)
    display(df.head())
    
    
def train_test_split(data_df, train_percentage):
    nr_train = int(train_percentage * len(data_df))
    train_df = data_df[:nr_train]
    test_df = data_df[nr_train:]
    
    return train_df, test_df

def construct_features(reviews, vectorizer):
    return vectorizer.transform(reviews).toarray()

# Read Data

In [3]:
data_path = "/home/docker/apollo/datasets/nlp/imdb/labeledTrainData.tsv"

In [4]:
data_df = pd.read_csv(data_path, header=0, delimiter="\t", quoting=3)
describe(data_df)

(25000, 3)


Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


# Process Data

In [5]:
def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review,"lxml").get_text() 
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    words = letters_only.lower().split()                             
    stops = set(stopwords.words("english"))                  
    meaningful_words = [w for w in words if not w in stops]   
    return " ".join( meaningful_words )

In [6]:
data_df.loc[:,"cleaned_review"] = data_df.loc[:,"review"].progress_apply(review_to_words)
describe(data_df)

100%|██████████| 25000/25000 [00:16<00:00, 1560.53it/s]

(25000, 4)





Unnamed: 0,id,sentiment,review,cleaned_review
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff going moment mj started listening music ...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film starts manager nicholas bell giving welco...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assumed praised film greatest filmed oper...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbly trashy wondrously unpretentious explo...


# Split Train Test

In [7]:
train_df, test_df = train_test_split(data_df, 0.7)

train_reviews = train_df["cleaned_review"].tolist()
test_reviews = test_df["cleaned_review"].tolist()

print(train_df.shape)
print(test_df.shape)

(17500, 4)
(7500, 4)


# Construct Features

In [15]:
# vectorizer = CountVectorizer(analyzer = "word",   \
#                              max_features = 10000)

vectorizer = TfidfVectorizer(analyzer = "word",   \
                             max_features = 5000)


vectorizer = vectorizer.fit(train_reviews)

In [16]:
train_data_features = construct_features(train_reviews, vectorizer)
test_data_features = construct_features(test_reviews, vectorizer)

print(train_data_features.shape)
print(test_data_features.shape)


(17500, 5000)
(7500, 5000)


In [17]:
train_labels = train_df["sentiment"]
test_labels = test_df["sentiment"]

print(train_labels.shape)
print(test_labels.shape)

(17500,)
(7500,)


# Train Classifier

In [18]:
rf_classifier = RandomForestClassifier(n_estimators = 100) 
rf_classifier = rf_classifier.fit(train_data_features, train_labels)

In [19]:
test_preds = rf_classifier.predict(test_data_features)

In [20]:
score = roc_auc_score(test_labels, test_preds)
score

0.851233919950529

# Make a submission

In [21]:
pred_path  = "/home/docker/apollo/datasets/nlp/imdb/testData.tsv"

In [23]:
pred_df = pd.read_csv(pred_path, header=0, delimiter="\t", quoting=3 )
describe(pred_df)

(25000, 2)


Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [24]:
pred_df.loc[:,"cleaned_review"] = pred_df.loc[:,"review"].progress_apply(review_to_words)

100%|██████████| 25000/25000 [00:15<00:00, 1609.76it/s]


In [26]:
pred_reviews = pred_df["cleaned_review"].tolist()

In [27]:
pred_data_features = construct_features(pred_reviews, vectorizer)
pred_data_features.shape

(25000, 5000)

In [28]:
predictions = rf_classifier.predict(pred_data_features)
predictions.shape

(25000,)

In [29]:
output_path = "./bow_features=5k_tfidf.csv"
output_df = pd.DataFrame( data={"id":pred_df["id"], "sentiment":predictions} )
output_df.to_csv(output_path, index=False, quoting=3)

In [30]:
FileLink(output_path)