In [106]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk

import re

import gc

In [107]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [108]:
tweets = pd.read_csv('/content/gdrive/MyDrive/NLP /IMDB Dataset.csv')
tweets.head(20)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [109]:
tweets.iloc[10,0 ]

'Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.<br /><br />At first it was very odd and pretty funny but as the movie progressed I didn\'t find the jokes or oddness funny anymore.<br /><br />Its a low budget film (thats never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.<br /><br />I imagine this film would appeal to a stoner who is currently partaking.<br /><br />For something similar but better try "Brother from another planet"'

In [110]:

# Target label encoding

le = LabelEncoder()
tweets['sentiment_encoded'] = le.fit_transform(tweets['sentiment'])
tweets.head(10)

Unnamed: 0,review,sentiment,sentiment_encoded
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
5,"Probably my all-time favorite movie, a story o...",positive,1
6,I sure would like to see a resurrection of a u...,positive,1
7,"This show was an amazing, fresh & innovative i...",negative,0
8,Encouraged by the positive comments about this...,negative,0
9,If you like original gut wrenching laughter yo...,positive,1


In [111]:
tweets['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [112]:
# base cleaning function: remove mentions, hashtags, digits, etc

def clean(text):
    text = text.lower()
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'@\w+',' ',text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'<.*?>',' ', text)
    text = re.sub(r'[^\w\s]',' ', text)
    return text

In [113]:
text = tweets['review']
text.head(10)

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
5    Probably my all-time favorite movie, a story o...
6    I sure would like to see a resurrection of a u...
7    This show was an amazing, fresh & innovative i...
8    Encouraged by the positive comments about this...
9    If you like original gut wrenching laughter yo...
Name: review, dtype: object

In [114]:
# Apply base cleaning functions

text = text.apply(clean)
text.head(10)

0    one of the other reviewers has mentioned that ...
1    a wonderful little production    the filming t...
2    i thought this was a wonderful way to spend ti...
3    basically there s a family where a little boy ...
4    petter mattei s  love in the time of money  is...
5    probably my all time favorite movie  a story o...
6    i sure would like to see a resurrection of a u...
7    this show was an amazing  fresh   innovative i...
8    encouraged by the positive comments about this...
9    if you like original gut wrenching laughter yo...
Name: review, dtype: object

In [115]:
# Stopwords: non-informative words
# Lemmatizer: returns base or dictionary word form
# Tokenizer: splitting text into tokens (words)
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords


# from nltk.stem import 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# uncomment
import nltk
nltk.download('stopwords')
stopwords.words()

In [117]:
# create lemmatizer object
wn_lemmatizer = WordNetLemmatizer()

In [118]:
dirty_tweet = text.iloc[555]


In [119]:
print(f'''
      ** Original Tweet **: \n \n      
      {dirty_tweet}
      
      ------------------------------------------------
      
      ** WordNetLemmatizer: ** \n \n
      {' '.join([wn_lemmatizer.lemmatize(word) for word in dirty_tweet.split()])}
      ''')


      ** Original Tweet **: 
 
      
      i went to see  passion of mind  because i usually get a kick out of the genre of alternate reality romances  i e   sliding doors    me  myself  i   etc    but this was the worst one i ve ever seen  i had to force myself to sit through it  i didn t even stay through the credits which is unheard of for me   the magical realism was completely missing because demi moore was grim and the lovers she was two timing were guys who usually play villains  though each was kind of sexy and appealing   there was actually a psychological explanation provided for the dual lives  with a distasteful frisson of the elektra complex  maybe the magic shouldn t be explained for this genre to work    originally written       
      
      ------------------------------------------------
      
      ** WordNetLemmatizer: ** 
 

      i went to see passion of mind because i usually get a kick out of the genre of alternate reality romance i e sliding door me myself i

In [120]:
# Apply lemmatizer to tweets :
# 
lemmatized_text = []
for tweet in text:
    lemmatized_text.append(' '.join([wn_lemmatizer.lemmatize(word) for word in tweet.split()]))

In [121]:
# Create and apply tokenizer

reg_tokenizer = RegexpTokenizer('\w+')

In [122]:
tokenized_text = reg_tokenizer.tokenize_sents(lemmatized_text)
# tokenized_text[:2]

In [123]:
len(tokenized_text)

50000

In [124]:
# cache stopwords in variable
# delete stopwords from tweets
nltk.download('stopwords')
sw = stopwords.words()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [125]:
clean_tokenized_tweets = [] 
for i, element in enumerate(tokenized_text):
    if i % 2000 == 0: print(i, end = ' ')
    clean_tokenized_tweets.append(' '.join([word for word in element if word not in sw]))

0 2000 4000 6000 8000 10000 12000 14000 16000 18000 20000 22000 24000 26000 28000 30000 32000 34000 36000 38000 40000 42000 44000 46000 48000 

In [127]:
df = pd.concat([pd.Series(clean_tokenized_tweets, name='tweet'), 
                pd.Series(tweets['sentiment_encoded'], name='sentiment')], 
               axis=1)
df

Unnamed: 0,tweet,sentiment
0,reviewer mentioned watching oz episode hooked ...,1
1,wonderful little production filming technique ...,1
2,thought wa wonderful way spend time hot summer...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stunnin...,1
...,...,...
49995,thought movie right good job creative original...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,catholic taught parochial elementary school ta...,0
49998,going disagree previous comment side maltin se...,0


In [128]:
df.to_csv('clean_tweets', index=False)

In [129]:
#Постройте bag of words и tfidf представления токенов (пробуйте разный ngram_range)  sklearn.feature_extraction.text

cvec = CountVectorizer(ngram_range=(1, 2))  #Bag of words
tfid = TfidfVectorizer(ngram_range=(1, 2)) #tf_idf

In [130]:
cvec_representation = cvec.fit_transform(pd.Series(clean_tokenized_tweets))
tfid_representation = tfid.fit_transform(pd.Series(clean_tokenized_tweets))

In [131]:
cvec_representation

<50000x2922551 sparse matrix of type '<class 'numpy.int64'>'
	with 10240547 stored elements in Compressed Sparse Row format>

In [132]:
 x = cvec_representation
 y = df['sentiment']


In [133]:
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(x, y , test_size=0.2, random_state=900)

In [None]:
# df.drop('sentiment', axis=1)

In [None]:
from gensim.models import Word2Vec
from gensim.parsing.preprocessing import STOPWORDS # more stopwords in gensim corpus
 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
import re

from sklearn.manifold import TSNE

stop_nltk = stopwords.words('english') # nltk corpus

STOPWORDS = STOPWORDS.union(set(["don't", "i'm"]))

In [136]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [145]:
rf = RandomForestClassifier(max_depth = 10, n_estimators =200)

In [146]:
x_valid

<10000x2922551 sparse matrix of type '<class 'numpy.int64'>'
	with 2030499 stored elements in Compressed Sparse Row format>

In [147]:
rf.fit(x_train, y_train)

y_pred = rf.predict(x_valid)

In [148]:
print(f'RandomForestClassifier accuracy cvec:{accuracy_score(y_pred, y_valid)}')

RandomForestClassifier accuracy cvec:0.8248
