In [81]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer,\
WordNetLemmatizer


In [82]:
df1 = pd.read_csv('data/labeledTrainData.tsv', delimiter='\t')
df2 = pd.read_csv('data/imdb_master.csv', encoding='latin-1' )

In [83]:
df1.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [84]:
df1 = df1.drop(['id'],axis=1)

In [85]:
df2.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [86]:
df2 = df2.drop(['Unnamed: 0', 'type', 'file'], axis=1)

In [87]:
df2.columns = ['review', 'sentiment']

In [88]:
df2.sentiment.value_counts()

unsup    50000
pos      25000
neg      25000
Name: sentiment, dtype: int64

In [89]:
df2= df2[df2.sentiment != 'unsup']
df2['sentiment'] = df2['sentiment'].map({'pos':1, 'neg':0})

In [90]:
df2

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0
...,...,...
49995,"Seeing as the vote average was pretty low, and...",1
49996,"The plot had some wretched, unbelievable twist...",1
49997,I am amazed at how this movie(and most others ...,1
49998,A Christmas Together actually came before my t...,1


In [91]:
df = pd.concat([df1, df2]).reset_index(drop=True)

In [92]:
df.head()

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  75000 non-null  int64 
 1   review     75000 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.1+ MB


# 텍스트 전처리

In [94]:
stemmer = SnowballStemmer('english')

def clean_text(raw_review):
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    
    letters_only = re.sub('[^a-zA-z]', ' ', review_text)
    
    words = letters_only.lower().split()
    
    stops = set(stopwords.words('english'))
    
    meangingful_words = [w for w in words if not w in stops]
    
    stemming_words = [stemmer.stem(w) for w in meangingful_words]
    
    return (' '.join(stemming_words))

In [95]:
df['Processed_review'] = df['review'].apply(lambda x: clean_text(x))

In [96]:
df.Processed_review.apply(lambda x: len(x.split(' '))).mean()

119.05728

In [97]:
clean_train_reviews = df['Processed_review'].tolist()

In [98]:
# num_reviews = df['review'].size

# clean_train_reviews = []

# for i in range(num_reviews):
#     clean_train_reviews.append(clean_text(df['review'][i]))

In [99]:
len(clean_train_reviews)

75000

# 텍스트 vectorizing

In [100]:
from sklearn.feature_extraction.text import CountVectorizer


# Initialize the "CountVectorizer" object, 
# which is scikit-learn's bag of words tool.
vectorizer = CountVectorizer(analyzer='word',
                            tokenizer=None,
                            preprocessor=None,
                            stop_words=None,
                            max_features=5000)

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of
# strings.

In [101]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vect', vectorizer),])

In [102]:
%time train_data_features=pipeline.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an array
print(train_data_features.shape)

CPU times: user 5.21 s, sys: 225 ms, total: 5.44 s
Wall time: 5.44 s
(75000, 5000)


In [122]:
vocab = vectorizer.get_feature_names()

In [104]:
dist = np.sum(train_data_features, axis=0)

for tag, count in zip(vocab, dist):
    print(count, tag)

[[ 225  844  343 ... 3360  470  223]] aaron


# train feature 학습

In [105]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)

%time forest = forest.fit(train_data_features, df['sentiment'])

CPU times: user 8min 20s, sys: 1.59 s, total: 8min 21s
Wall time: 46.6 s


In [107]:
# from sklearn.model_selection import cross_val_score
# %time np.mean(cross_val_score(forest, train_data_features, \
#                               df['sentiment'], cv=10, scoring='roc_auc'))

# test data prediction

In [109]:
test = pd.read_csv('data/testData.tsv', delimiter='\t')

test.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [110]:
clean_test_reviews = test['review'].apply(lambda x: clean_text(x)).tolist()


In [112]:
len(clean_test_reviews)

25000

In [114]:
%time test_data_features = pipeline.transform(clean_test_reviews)


CPU times: user 1.86 s, sys: 22 ms, total: 1.89 s
Wall time: 1.89 s


In [115]:
test_data_features

<25000x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 2043716 stored elements in Compressed Sparse Row format>

In [116]:
test_data_features.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [117]:
result = forest.predict(test_data_features)

In [118]:
result

array([1, 0, 0, ..., 0, 1, 1])

# submission

In [119]:
output = pd.DataFrame(data={'id':test['id'], 'sentiment':result})
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,0
4,12128_7,1


In [120]:
output.shape

(25000, 2)

In [121]:
output.to_csv('data/prac2.csv', index=False, quoting=3)