## Word Embedding or Vectorization

In [36]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import warnings as wr
wr.filterwarnings('ignore')

In [21]:
# task 1: load Data
def load_date(file_path):
    df = pd.read_csv(file_path, header=None,
                    encoding='latin-1', names=['target','id','date','meta','user','text']
                    )
    df['sentiment'] = df['target'].apply(lambda x: 'positive' if x==4 else 'negative')
    return df

file_path = "E:/M60/nlp_dataset/sentiment140_v1.csv"
df = load_date(file_path)

In [22]:
df.head()

Unnamed: 0,target,id,date,meta,user,text,sentiment
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,negative
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,negative
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,negative
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",negative


In [4]:
df.shape

(910, 7)

### Stopwords

In [5]:
default_stopwords = set(stopwords.words('english'))
# print Standards Stopwords
print("Default Stopwords:")
print(sorted(default_stopwords))

Default Stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "should

## Cleaning

In [23]:
def clean_prep_text(text):
    # Convert to lower case
    text = text.lower()
    # print("Lowercase: ", text)
    
    # Remove Stopwords
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Print ("Remove Punctuation: ",text)
    # Tokenize the words
    tokens = word_tokenize(text)
    # Remove Stopwords
    stop_words = set(stopwords.words('english'))
    # Print ("Remove stopwords: ", text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens) 

In [24]:
df['cleaned_text'] = df['text'].apply(clean_prep_text)

In [25]:
df

Unnamed: 0,target,id,date,meta,user,text,sentiment,cleaned_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative,switchfoot httptwitpiccom2y1zl awww thats bumm...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,negative,upset cant update facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,negative,kenichan dived many times ball managed save 50...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,negative,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",negative,nationwideclass behaving im mad cant see
...,...,...,...,...,...,...,...,...
905,4,1467901707,Mon Apr 06 22:43:52 PDT 2009,NO_QUERY,weboword,@tothepc Did you check out http://www.weboword...,positive,tothepc check httpwwwwebowordcom arent diction...
906,4,1467901716,Mon Apr 06 22:43:53 PDT 2009,NO_QUERY,meganfinley,Artistic affirmation from a drunk lady was kin...,positive,artistic affirmation drunk lady kinda needed
907,4,1467901742,Mon Apr 06 22:43:53 PDT 2009,NO_QUERY,konelli,@Honey3223 Honey goodnight I am up really ear...,positive,honey3223 honey goodnight really early morning...
908,4,1468004484,Mon Apr 06 23:12:59 PDT 2009,NO_QUERY,l_eau,@Jon_Favreau &quot;never worked on a sequel&qu...,positive,jonfavreau quotnever worked sequelquot maybe t...


## Split

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
X = df['cleaned_text'].tolist()
y = df['sentiment'].tolist()

In [31]:
X_train, X_test, y_train, y_test = train_test_split (X, y, test_size=.20, random_state=0)

In [34]:
print(f'X_Train Shape : {len(X_train)}, y_Train Shape : {len(y_train)}')
print(f'X_test Shape : {len(X_test)}, y_test Shape :{len(y_test)}')

X_Train Shape : 728, y_Train Shape : 728
X_test Shape : 182, y_test Shape :182


In [35]:
print(type(X_train))

<class 'list'>


## Embedding

In [48]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
encoder

In [41]:
print(y_train) # its a list, but One hot expect 2D array, so conversion requires- firs need to convert into 'Numpy' array

['negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'positive', 'negative', 'negative', 'positive', 'negative', 'negative', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative', 'negative', 'negative', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'negative', 'negative', 'negative', 'positive', 'positive', 'positive', 'positive', 'negative', 'negative', 'po

In [43]:
#Convert into numpy array - when converted into numpy, it automatically being a 1D array, but still need 2D array, use reshape
y_array = np.array(y_train)
y_array

array(['negative', 'positive', 'positive', 'negative', 'positive',
       'negative', 'negative', 'negative', 'positive', 'negative',
       'negative', 'negative', 'negative', 'positive', 'positive',
       'negative', 'positive', 'positive', 'positive', 'negative',
       'negative', 'negative', 'negative', 'negative', 'positive',
       'positive', 'positive', 'negative', 'positive', 'negative',
       'negative', 'positive', 'negative', 'positive', 'positive',
       'positive', 'negative', 'negative', 'positive', 'negative',
       'negative', 'positive', 'negative', 'positive', 'negative',
       'negative', 'negative', 'positive', 'negative', 'positive',
       'positive', 'negative', 'negative', 'negative', 'negative',
       'negative', 'negative', 'positive', 'negative', 'positive',
       'positive', 'negative', 'positive', 'negative', 'positive',
       'negative', 'positive', 'positive', 'negative', 'positive',
       'negative', 'negative', 'negative', 'positive', 'negati

In [44]:
# 2D conversion Done
onehot_encoded_sparse = encoder.fit_transform(y_array.reshape(-1, 1)) # fit is learing algo, 1: Positive, 0: Negative
print(onehot_encoded_sparse)

[[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [46]:
# Sparse : True
#from sklearn.preprocessing import OneHotEncoder
#encoder = OneHotEncoder(sparse=True)
#encoder

In [47]:
# Sparse True look like this (row, col)- sparse False when you no need to see index of row, col like (0,1, 1,1 etc)
#onehot_encoded_sparse = encoder.fit_transform(y_array.reshape(-1, 1)) # fit is learing algo, 1: Positive, 0: Negative
#print(onehot_encoded_sparse) 

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 1)	1.0
  (3, 0)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 1)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 0)	1.0
  (12, 0)	1.0
  (13, 1)	1.0
  (14, 1)	1.0
  (15, 0)	1.0
  (16, 1)	1.0
  (17, 1)	1.0
  (18, 1)	1.0
  (19, 0)	1.0
  (20, 0)	1.0
  (21, 0)	1.0
  (22, 0)	1.0
  (23, 0)	1.0
  (24, 1)	1.0
  :	:
  (703, 1)	1.0
  (704, 0)	1.0
  (705, 0)	1.0
  (706, 1)	1.0
  (707, 1)	1.0
  (708, 1)	1.0
  (709, 0)	1.0
  (710, 0)	1.0
  (711, 1)	1.0
  (712, 1)	1.0
  (713, 0)	1.0
  (714, 1)	1.0
  (715, 1)	1.0
  (716, 1)	1.0
  (717, 0)	1.0
  (718, 1)	1.0
  (719, 0)	1.0
  (720, 0)	1.0
  (721, 1)	1.0
  (722, 1)	1.0
  (723, 1)	1.0
  (724, 0)	1.0
  (725, 1)	1.0
  (726, 1)	1.0
  (727, 1)	1.0


In [45]:
'''
onehot_encoded_sparse = encoder.fit_transform(y_train.reshape(-1, 1))-- will return error! (list has not reshape attribute )
You need make y_train as 2D manually or need to do by numpy
Reshape available in Numpy, that why need to convert list into numpy
'''

'\nonehot_encoded_sparse = encoder.fit_transform(y_train.reshape(-1, 1))-- will return error! (list has not reshape attribute )\nYou need make y_train as 2D manually or need to do by numpy\nReshape available in Numpy, that why need to convert list into numpy\n'

## Count Vectorizer

In [49]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
cv = vectorizer.fit_transform(X_train)  # This vectorizer expect 1D list- reshaped not needed 

In [50]:
print(cv) # default it will shows as sparse matrix, but you can see by dense array form too, see below

  (0, 2112)	1
  (0, 1345)	1
  (0, 775)	1
  (0, 821)	1
  (1, 2206)	1
  (1, 1000)	1
  (1, 2118)	1
  (1, 1771)	1
  (1, 585)	1
  (1, 2096)	1
  (1, 1190)	1
  (1, 751)	1
  (1, 69)	1
  (1, 2204)	1
  (1, 744)	1
  (1, 1373)	1
  (2, 1706)	1
  (2, 2742)	1
  (2, 175)	1
  (2, 265)	1
  (2, 2528)	1
  (2, 2025)	1
  (2, 1338)	1
  (2, 1211)	1
  (2, 969)	1
  :	:
  (726, 962)	1
  (726, 2470)	1
  (726, 2017)	1
  (726, 2429)	1
  (726, 935)	1
  (726, 2733)	1
  (726, 1366)	1
  (726, 1718)	1
  (726, 2601)	1
  (726, 2134)	1
  (727, 585)	1
  (727, 2470)	1
  (727, 1104)	1
  (727, 1462)	1
  (727, 2358)	1
  (727, 946)	1
  (727, 872)	1
  (727, 235)	1
  (727, 793)	1
  (727, 348)	1
  (727, 2493)	1
  (727, 1001)	1
  (727, 59)	1
  (727, 576)	1
  (727, 2767)	1


In [51]:
# Showing by dense array
print(cv.toarray()) # This will return the dense represeantion

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [52]:
 print(vectorizer.get_feature_names_out()) # This shows the feature name(it will represnet unique feature from list/doc)

['0g' '10' '100000' ... '½re' '½s' '½tieï']


<h2 style="color:purple" align="left"> Working </h2>

In [39]:
# OneHotEncoder(sparse=False) : false mean the output will be in array form, False will show actual 0 & 1 
# OneHotEncoder(sparse=True) : It mean the output will be in matrix form or dense matrix form

#### OneHot

- It generally expect 2D array