In [32]:
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

***Loading the data from csv file***

In [33]:
df = pd.read_csv("Replaced.csv", encoding='ISO-8859-1')

***Splitting the dataset***

In [34]:
train, test = train_test_split(df, test_size=0.2)

*** Checking the size of the dataset i.e number of rows and columns  ***

In [35]:
train.shape

(56781, 22)

*** First record of the "text" column which contains the full reviews given by a user ***

In [36]:
print(train["text"][0])

i love this album. it's very good. more to the hip hop side than her current pop sound.. SO HYPE! i listen to this everyday at the gym! i give it 5star rating all the way. her metaphors are just crazy.


***To extract the data from HTML into text format we use BeautifulSoup***

In [37]:
example1 = BeautifulSoup(train["text"][0]) 



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


*** Displaying the clear text of the record ***

In [38]:
print(example1.get_text())

i love this album. it's very good. more to the hip hop side than her current pop sound.. SO HYPE! i listen to this everyday at the gym! i give it 5star rating all the way. her metaphors are just crazy.


***In order to keep the data clean, we remove the special characters and the punctuation marks used with the help of regular expression library***

In [39]:
import re   
letters_only = re.sub("[^a-zA-Z]", " ", example1.get_text() )  
print(letters_only)

i love this album  it s very good  more to the hip hop side than her current pop sound   SO HYPE  i listen to this everyday at the gym  i give it  star rating all the way  her metaphors are just crazy 


***Converting all the data to lowercase in order to keep a constant format***

In [57]:
lower_case = letters_only.lower()
words = lower_case.split()

In [41]:
lower_case

'i love this album  it s very good  more to the hip hop side than her current pop sound   so hype  i listen to this everyday at the gym  i give it  star rating all the way  her metaphors are just crazy '

 ***We use nltk for symbolic and statistical natural language processing (NLP) for English written in the Python programming language***

In [42]:
import nltk   
nltk.download('stopwords')   

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RIO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

*** we import the stopwords in order to check the words from our "text" column with them ***

In [43]:
from nltk.corpus import stopwords

print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

***extracting special words (non common words) from the given row value***

In [44]:
words = [w for w in words if not w in stopwords.words("english")]  
print(words)

['love', 'album', 'good', 'hip', 'hop', 'side', 'current', 'pop', 'sound', 'hype', 'listen', 'everyday', 'gym', 'give', 'star', 'rating', 'way', 'metaphors', 'crazy']


***creating a function to clean the data and then add the non stop words to a list***

In [45]:
def review_to_words(raw_review):
    review_text = BeautifulSoup(raw_review).get_text()
    
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    words = letters_only.lower().split()
    
    stops = set(stopwords.words("english"))
    
    meaningful_words = [w for w in words if not w in stops]
    
    return(" ".join(meaningful_words))

***for a given row, removes the stopwords and special characters and writes the sentence***


In [46]:
clean_review = review_to_words(train["text"][0]) 
print(clean_review)

love album good hip hop side current pop sound hype listen everyday gym give star rating way metaphors crazy




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [47]:
num_reviews = train["text"].size   #(56781)
num_reviews

56781

*** Writing a loop to take up the first 3 records and goto the review_to_words(raw_review) function in order to extract the stopwords and display only the important terms ***

In [50]:
clean_train_reviews = []

for i in range(0, 3):
    clean_train_reviews.append(review_to_words(train["text"][i]))
    #len(clean_train_reviews)
    print (clean_train_reviews)

['love album good hip hop side current pop sound hype listen everyday gym give star rating way metaphors crazy']
['love album good hip hop side current pop sound hype listen everyday gym give star rating way metaphors crazy', 'good flavor review collected part promotion']
['love album good hip hop side current pop sound hype listen everyday gym give star rating way metaphors crazy', 'good flavor review collected part promotion', 'good flavor']




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


***Using countvectorizer we Convert a collection of text documents to a matrix of token counts***

* analyzer : string, {‘word’, ‘char’, ‘char_wb’} or callable
Whether the feature should be made of word or character n-grams. Option ‘char_wb’ creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space.

* tokenizer : callable or None (default)
Override the string tokenization step while preserving the preprocessing and n-grams generation steps. Only applies if analyzer == 'word'.

* preprocessor : callable or None (default)
Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams generation steps.

* stop_words : string {‘english’}, list, or None (default)
If ‘english’, a built-in stop word list for English is used.
If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if analyzer == 'word'.

* max_features : int or None, default=None
If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.
This parameter is ignored if vocabulary is not None. 

In [53]:
print("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer    

vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)

train_data_features = vectorizer.fit_transform(clean_train_reviews)

train_data_features = train_data_features.toarray()


Creating the bag of words...



In [54]:
print(train_data_features.shape)

(2, 24)


*** Applying the vector matrix (vectorizer) and calling out the function ***

In [55]:
vocab = vectorizer.get_feature_names()
print(vocab)

['album', 'collected', 'crazy', 'current', 'everyday', 'flavor', 'give', 'good', 'gym', 'hip', 'hop', 'hype', 'listen', 'love', 'metaphors', 'part', 'pop', 'promotion', 'rating', 'review', 'side', 'sound', 'star', 'way']


***Displaying the special words and their occurences in the given dataset by summig up the count and then mapping it to the term ***

In [24]:
import numpy as np
dist = np.sum(train_data_features, axis=0)

for tag, count in zip(vocab, dist):
    print(count, tag)

1 album
1 bought
1 boyfriend
1 burning
1 buying
2 captivating
1 caused
1 clean
1 collected
1 consistency
2 could
1 couples
1 crazy
1 current
1 difficult
2 disappointed
1 enhanced
1 especially
1 even
1 everyday
1 expecting
1 felt
2 flavor
3 gel
1 give
3 good
1 gym
1 hip
1 hop
1 however
1 husband
1 hype
1 irritation
1 lacked
2 less
1 like
1 liquid
1 listen
1 live
1 looking
1 love
1 lube
1 lubricant
1 lubricants
1 mess
1 messy
1 metaphors
1 money
1 much
1 neither
1 normal
2 notice
1 one
1 paid
1 part
1 personal
1 pleasant
1 pop
1 promotion
1 rating
2 read
1 recommend
1 reminiscent
1 review
2 reviews
2 sensation
1 side
1 since
1 skin
1 sort
1 sound
1 star
1 starters
1 ultimately
1 us
2 use
1 vaseline
1 way


The content of this project itself is licensed under the and the underlying source code used to format and display that content is licensed under the [MIT LICENSE](https://github.com/Jinansi/PythonProjects/blob/master/LICENSE)