In [100]:
import pandas as pd

In [101]:
from sklearn.feature_extraction.text import CountVectorizer

In [102]:
# Load the labeled training reviews from the tab-separated file.
# NOTE(review): the sample review printed further down still contains
# backslash-escaped quotes, so quoting/escaping options may need tuning if the
# raw text matters -- confirm against the file.
data = pd.read_csv(
    'labeledTrainData.tsv',
    sep='\t',
)

In [103]:
# Preview the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [104]:
# Count occurrences of each id; a count above 1 would indicate a duplicated row.
data['id'].value_counts()

5814_8     1
9687_9     1
5876_1     1
4196_9     1
9248_3     1
          ..
699_8      1
1049_7     1
10302_1    1
8595_1     1
8478_8     1
Name: id, Length: 25000, dtype: int64

In [105]:
# Class balance check for the target label.
data['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [106]:
# Show the full text of the review stored under index label 1.
data.loc[1, 'review']

'\\The Classic War of the Worlds\\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells\' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different things in a movie. Those who envision themselves as amateur \\"critics\\" look only to criticize everything they can. Others rate a movie on more important bases,like being entertained, which is why most people never agree with the \\"critics\\". We enjoyed the effort Mr. Hines put into being faithful to H.G. Wells\' classic novel, and we found it to be very entertaining. This made it easy to overlook what the \\"critics\\" perceive to be its shortcomings."'

In [107]:
# (rows, columns) of the loaded frame.
data.shape

(25000, 3)

In [108]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [84]:
###text preprocessing using gensim
from gensim.utils import simple_preprocess

In [85]:
# Tokenize every review with gensim's simple_preprocess (one token list per row).
# NOTE(review): the vectorizer cells below are fitted on data['review'], not on
# this result, so these tokens are currently unused by the models.
preprocess_data = data['review'].apply(simple_preprocess)

In [86]:
# Inspect the tokenized reviews.
preprocess_data

0        [with, all, this, stuff, going, down, at, the,...
1        [the, classic, war, of, the, worlds, by, timot...
2        [the, film, starts, with, manager, nicholas, b...
3        [it, must, be, assumed, that, those, who, prai...
4        [superbly, trashy, and, wondrously, unpretenti...
                               ...                        
24995    [it, seems, like, more, consideration, has, go...
24996    [don, believe, they, made, this, film, complet...
24997    [guy, is, loser, can, get, girls, needs, to, b...
24998    [this, minute, documentary, buñuel, made, in, ...
24999    [saw, this, movie, as, child, and, it, broke, ...
Name: review, Length: 25000, dtype: object

# Missing Values

In [109]:
# Number of missing values in each column.
data.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [110]:
from sklearn.preprocessing import LabelEncoder

In [111]:
# Bag-of-words vectorizer.
# Integer min_df / max_df are absolute document counts: keep terms that appear
# in at least 10 and at most 200 reviews, then cap the vocabulary at 500 terms.
# NOTE(review): the TF-IDF cell later uses max_df=500, so the two feature sets
# are not filtered identically.
cvec = CountVectorizer(
    stop_words="english",
    min_df=10,
    max_df=200,
    max_features=500,
)

In [112]:
# Learn the vocabulary and build the document-term count matrix in one step.
# NOTE(review): this is fitted on all rows before the train/test split below,
# so the vocabulary is chosen using the held-out reviews as well.
bow = cvec.fit_transform(data['review'])

In [113]:
# Show the sparse matrix summary (shape, dtype, stored elements).
bow

<25000x500 sparse matrix of type '<class 'numpy.int64'>'
	with 83594 stored elements in Compressed Sparse Row format>

In [114]:
# Number of terms kept in the fitted vocabulary.
len(cvec.vocabulary_)

500

In [115]:
# Densify the count matrix into a DataFrame (one column per vocabulary term).
# toarray() yields a plain ndarray rather than going through np.matrix.
x = pd.DataFrame(bow.toarray())

In [116]:
# Sanity check: rows = reviews, columns = vocabulary terms.
x.shape

(25000, 500)

In [117]:
# Target labels for the classifier.
y = data['sentiment']

In [118]:
from sklearn.model_selection import train_test_split

In [119]:
# Hold out 15% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=134,
)

In [120]:
from sklearn.linear_model import LogisticRegression

In [121]:
# Logistic regression with library-default hyperparameters.
lr = LogisticRegression()

In [122]:
# Train on the bag-of-words training split.
lr.fit(x_train, y_train)

In [123]:
# Predict labels for the held-out rows.
y_pred = lr.predict(x_test)

In [124]:
from sklearn.metrics import accuracy_score

In [125]:
# Fraction of held-out reviews classified correctly with bag-of-words features.
accuracy_score(y_test, y_pred)

0.6448

# TF-IDF

In [126]:
# TfidfVectorizer is not imported by any earlier cell shown (only
# CountVectorizer is), so import it here; otherwise this cell raises NameError
# on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: keep terms that appear in at least 10 and at most 500
# reviews (integer min_df / max_df are absolute document counts), capped at
# 500 vocabulary terms.
tfvec = TfidfVectorizer(stop_words = "english", min_df = 10, max_df = 500, max_features = 500)

In [127]:
# Learn the vocabulary/IDF weights and build the TF-IDF matrix in one step.
# NOTE(review): fitted on all rows before the train/test split below, so the
# held-out reviews influence the vocabulary and IDF weights.
tfidf = tfvec.fit_transform(data['review'])

In [128]:
# Densify the TF-IDF matrix into a DataFrame.
# This rebinds `x`, replacing the bag-of-words frame built earlier.
x = pd.DataFrame(tfidf.toarray())

In [129]:
# Display the TF-IDF feature frame (pandas truncates the rendering).
x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.284741,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.0,0.0,0.0,0.0,0.0,0.510268,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# Re-split using the TF-IDF features, with the same test_size and random_state
# as the bag-of-words split. This overwrites the earlier train/test variables.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.15,
    random_state=134,
)

In [131]:
# Fresh logistic regression with default hyperparameters; rebinds `lr`,
# replacing the bag-of-words model.
lr = LogisticRegression()

In [132]:
# Train on the TF-IDF training split.
lr.fit(x_train, y_train)

In [133]:
# Predict labels for the held-out rows (overwrites the earlier predictions).
y_pred = lr.predict(x_test)

In [134]:
# Fraction of held-out reviews classified correctly with TF-IDF features.
accuracy_score(y_test, y_pred)

0.708