In [23]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [24]:
# read data
data = pd.read_parquet("test-00000-of-00001.parquet")

In [25]:
# see head of data
data.head()

Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [26]:
# As we can see there is no nan value in our dataset
data.isna().sum()

text     0
label    0
dtype: int64

In [27]:
# the dataset has 50000 rows and 2 columns
data.shape

(25000, 2)

In [28]:
# Surprisingly there are some duplicate comments
data.nunique()

text     24801
label        2
dtype: int64

In [29]:
uniq_data_rev = np.unique(data["label"],return_counts=True)
review = uniq_data_rev[0]
count = uniq_data_rev[1]

In [30]:
# Now we can see duplicate comments below:
review[count > 1]

array([0, 1], dtype=int64)

In [31]:
# let's remove (drop) duplicates rows
data.drop_duplicates(inplace=True)
data.reset_index(drop=True,inplace=True)

In [32]:
# data shape after drop duuplicates (now it has 49582 rows instead of 50000)
data.shape

(24801, 2)

In [33]:
# this library converts any word to its stem (مصدر)
stem = PorterStemmer()

In [34]:
# punctuation in english like . " ; ' and others should be removed from text
# here we got a list of english punctuations to not consider them
punctuations = list(string.punctuation)

In [35]:
#stop words like "and" should be removed from text because they won't help the algorithm to work better
# and they increase the number of features (they are useless features)
# here we got a list of english stopwords
stop_words = stopwords.words("english")

In [36]:
#let's make a function which do "text_clearing"
def text_cleaner(text):
    # make text lower case
    lowtext = text.lower()
    
    #clear html tags from text
    no_htmltag_text = BeautifulSoup(lowtext).getText()
    
    # word tokenize text
    tokens = word_tokenize(no_htmltag_text)
    
    # fine_tokens will contains words which are sutable to be a feature
    fine_tokens = list()
    for tok in tokens:
        if (not tok.isdigit()) and (tok not in punctuations) and (tok not in stop_words):
            stemed_tok= stem.stem(tok)
            fine_tokens.append(stemed_tok)
    cleaned_text = fine_tokens
    return cleaned_text

In [37]:
# just an example of how text_cleaner def works
text_cleaner("A wonderful little production. <br /><br />The...")

['wonder', 'littl', 'product', '...']

In [38]:
# apply the function on data (clearing text of review column of data)
data['text'] = data['text'].apply(text_cleaner)

  no_htmltag_text = BeautifulSoup(lowtext).getText()


In [40]:
data.head()

Unnamed: 0,text,label
0,"[love, sci-fi, will, put, lot, sci-fi, movies/...",0
1,"[worth, entertain, valu, rental, especi, like,...",0
2,"[total, averag, film, semi-alright, action, se...",0
3,"[star, rate, saturday, night, friday, night, f...",0
4,"[first, let, say, n't, enjoy, van, damm, movi,...",0


In [41]:
from gensim.models import Word2Vec

sentences = data["text"][:30]

model = Word2Vec(sentences, min_count=5)



In [44]:
model.wv.vocab

{'sci-fi': <gensim.models.keyedvectors.Vocab at 0x1c8b61463a0>,
 'put': <gensim.models.keyedvectors.Vocab at 0x1c8b61467f0>,
 'lot': <gensim.models.keyedvectors.Vocab at 0x1c8b6146220>,
 'tri': <gensim.models.keyedvectors.Vocab at 0x1c8b61463d0>,
 'like': <gensim.models.keyedvectors.Vocab at 0x1c8b6146490>,
 'realli': <gensim.models.keyedvectors.Vocab at 0x1c8b6146430>,
 'good': <gensim.models.keyedvectors.Vocab at 0x1c8b6146cd0>,
 'star': <gensim.models.keyedvectors.Vocab at 0x1c8b6146100>,
 'cheap': <gensim.models.keyedvectors.Vocab at 0x1c8b6146c70>,
 'dialogu': <gensim.models.keyedvectors.Vocab at 0x1c8b61467c0>,
 "n't": <gensim.models.keyedvectors.Vocab at 0x1c8b6146a90>,
 'charact': <gensim.models.keyedvectors.Vocab at 0x1c8b61464f0>,
 'think': <gensim.models.keyedvectors.Vocab at 0x1c8b6146640>,
 "'s": <gensim.models.keyedvectors.Vocab at 0x1c8b6146bb0>,
 'us': <gensim.models.keyedvectors.Vocab at 0x1c8b6146130>,
 'take': <gensim.models.keyedvectors.Vocab at 0x1c8b6146670>,
 'ma

In [45]:
#spliting data to train and test sample
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.333,random_state=42)