In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from bs4 import BeautifulSoup
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

In [34]:
df = pd.read_csv('NLPlabeledData.tsv',  delimiter="\t", quoting=3)

In [35]:
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [36]:
len(df)

25000

In [37]:
len(df['review'])

25000

In [38]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alpnn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
def process(review):
    review = BeautifulSoup(review).get_text()
    review = re.sub("[^a-zA-Z]", ' ', review)
    
    review = review.lower()
    review = review.split()
    
    swords = set(stopwords.words("english"))
    review = [w for w in review if w not in swords]
    
    return(" ".join(review))

In [40]:
train_x_tum = []

for r in range(len(df['review'])):
    if(r+1)%5000 == 0:
        print("No of reviews processed =",r+1)
    train_x_tum.append(process(df['review'][r]))

  review = BeautifulSoup(review).get_text()


No of reviews processed = 5000
No of reviews processed = 10000
No of reviews processed = 15000
No of reviews processed = 20000
No of reviews processed = 25000


In [41]:
x = train_x_tum
y = np.array(df["sentiment"])

train_x, test_x, y_train, y_test = train_test_split(x,y, test_size = 0.1)

In [42]:
vectorizer = CountVectorizer(max_features=3500)

train_x = vectorizer.fit_transform(train_x)

In [43]:
train_x

<22500x3500 sparse matrix of type '<class 'numpy.int64'>'
	with 1664129 stored elements in Compressed Sparse Row format>

In [44]:
train_x = train_x.toarray()
train_y = y_train

In [45]:
train_x.shape, train_y.shape

((22500, 3500), (22500,))

In [46]:
train_y

array([0, 1, 1, ..., 1, 1, 0], dtype=int64)

In [47]:
model = RandomForestClassifier(n_estimators= 100, random_state=42)

model.fit(train_x, train_y)

In [48]:
test_xx = vectorizer.transform(test_x)

In [49]:
test_xx

<2500x3500 sparse matrix of type '<class 'numpy.int64'>'
	with 186360 stored elements in Compressed Sparse Row format>

In [50]:
test_xx.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [51]:
test_xx.shape

(2500, 3500)

In [52]:
test_predict = model.predict(test_xx)

dogruluk = roc_auc_score(y_test, test_predict)

In [53]:
print("Doğruluk Oranı : %", dogruluk * 100)

Doğruluk Oranı : % 83.64669355354971


### max_features=3500 => 83.65
### max_features=5000 => 83.82