In [11]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup


In [2]:
# Both splits are tab-separated files; `sep` is the canonical name of the
# `delimiter` option.
train = pd.read_csv('data/train.tsv', sep='\t')
test = pd.read_csv('data/test.tsv', sep='\t')

In [3]:
train.shape, test.shape

((156060, 4), (66292, 3))

In [4]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [10]:
# One label-based lookup (row label 446, column 'Phrase') instead of chained
# indexing, which first materialises the whole row as a Series.
train.loc[446, 'Phrase']

'is a sweet and modest and ultimately winning story'

# text preprocessing

In [15]:
stemmer = SnowballStemmer('english')

# Built once here; the previous version rebuilt this set inside clean_text on
# every call, i.e. once per row when used with Series.apply.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Anything that is not an ASCII letter. The range must end at upper-case 'Z':
# 'A-z' also spans the punctuation that sits between 'Z' and 'a' in ASCII
# ([ \ ] ^ _ `), so those characters would slip through as "letters".
_NON_LETTERS = re.compile(r'[^a-zA-Z]')


def clean_text(raw_reviews):
    """Normalise one raw phrase into a space-joined string of stemmed tokens.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lowercase and split on whitespace, drop English stop words,
    then Snowball-stem the remaining words.

    Parameters
    ----------
    raw_reviews : str
        A single phrase/review (the name is plural, but one string is expected).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    text = BeautifulSoup(raw_reviews, 'html.parser').get_text()
    letters_only = _NON_LETTERS.sub(' ', text)
    words = letters_only.lower().split()
    stems = [stemmer.stem(w) for w in words if w not in _ENGLISH_STOPWORDS]
    return ' '.join(stems)



In [16]:
%time train['Processed_phrase'] = train['Phrase'].apply(lambda x: \
                                    clean_text(x))


  ' Beautiful Soup.' % markup)


CPU times: user 36.8 s, sys: 5.4 s, total: 42.2 s
Wall time: 42.4 s


In [17]:
# Plain Python list of the cleaned training phrases, in the same row order as `train`.
clean_train_data = train['Processed_phrase'].tolist()

In [20]:
len(clean_train_data)

156060

# vectorizing

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Word-level bag of words capped at 5000 features. analyzer='word' and
# tokenizer / preprocessor / stop_words = None are the library defaults, so
# only the non-default option is spelled out.
vectorizer = CountVectorizer(max_features=5000)



In [22]:
# Single-step pipeline wrapping the vectorizer under the step name 'vect'.
pipeline = Pipeline(steps=[('vect', vectorizer)])

In [23]:
%time test_data_features = pipeline.fit_transform(clean_train_data)

CPU times: user 670 ms, sys: 13.4 ms, total: 683 ms
Wall time: 683 ms


In [24]:
test_data_features

<156060x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 567140 stored elements in Compressed Sparse Row format>

# fit

In [25]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=100,
                               n_jobs=-1,
                               random_state=42)

In [26]:
forest

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [27]:
forest = forest.fit(test_data_features, train['Sentiment'])

# test data prediction

In [28]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [29]:
# Clean the test phrases with clean_text, passed to apply directly.
test['Processed_phrase'] = test['Phrase'].apply(clean_text)

  ' Beautiful Soup.' % markup)


In [30]:
test['Processed_phrase']

0        intermitt pleas most routin effort
1        intermitt pleas most routin effort
2                                          
3        intermitt pleas most routin effort
4               intermitt pleas most routin
                        ...                
66287            long wind predict scenario
66288            long wind predict scenario
66289                             long wind
66290                             long wind
66291                      predict scenario
Name: Processed_phrase, Length: 66292, dtype: object

In [31]:
# Plain Python list of the cleaned test phrases, in the same row order as `test`.
clean_test_data = test['Processed_phrase'].tolist()

In [32]:
len(clean_test_data)

66292

In [33]:
# transform (not fit_transform): encode the test phrases with the vocabulary
# the pipeline already learned when it was fitted on the training phrases.
test_data_features = pipeline.transform(clean_test_data)

In [34]:
test_data_features

<66292x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 213791 stored elements in Compressed Sparse Row format>

In [35]:
# NOTE(review): this densifies the entire sparse matrix just to display it.
# Per the repr above (66292 x 5000, int64) that is roughly 2.65 GB; slicing
# first, e.g. test_data_features[:5].toarray(), would show a sample cheaply.
test_data_features.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [49]:
# One predicted Sentiment label per row of the test feature matrix.
prediction = forest.predict(test_data_features)

In [50]:
prediction

array([2, 2, 2, ..., 1, 1, 1])

In [42]:
# Use the provided sample submission as a template; its Sentiment column is
# overwritten with the model's predictions below.
submission = pd.read_csv('data/sampleSubmission.csv')

In [46]:
submission.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [44]:
# Positional assignment: assumes sampleSubmission.csv lists PhraseIds in the
# same order as test.tsv (the displayed heads agree on the first five) — verify.
# NOTE(review): the saved execution counts show this cell (In [44]) ran before
# the predict cell (In [49]); restart and run all before trusting the output.
submission['Sentiment'] = prediction

In [47]:
submission.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [51]:
# index=False keeps the DataFrame's row index out of the written file, so it
# contains only the submission's own columns.
submission.to_csv('data/sub1.csv', index=False)