# IMPLEMENTATION OF BASIC CONCEPTS OF NATURAL LANGUAGE PROCESSING USING SKLEARN

## 1. Importing Datasets

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS



## 2. Tokenization

In [3]:
sentence = ['sklearn is a library of machine learning',
           'This shows tokenization using sklearn']
vector = CountVectorizer()
X = vector.fit_transform(sentence)
print(vector.get_feature_names())

['is', 'learning', 'library', 'machine', 'of', 'shows', 'sklearn', 'this', 'tokenization', 'using']


In [4]:
 print(X.toarray())

[[1 1 1 1 1 0 1 0 0 0]
 [0 0 0 0 0 1 1 1 1 1]]


## 3. Preprocessing

In [5]:
ENGLISH_STOP_WORDS = set( stopwords.words('english') ).union( set(ENGLISH_STOP_WORDS) )

In [6]:
ENGLISH_STOP_WORDS

{'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'ain',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'aren',
 "aren't",
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'bill',
 'both',
 'bottom',
 'but',
 'by',
 'call',
 'can',
 'cannot',
 'cant',
 'co',
 'con',
 'could',
 'couldn',
 "couldn't",
 'couldnt',
 'cry',
 'd',
 'de',
 'describe',
 'detail',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'done',
 'down',
 'due',
 'during',
 'each',
 'eg',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'etc',
 'even',
 'ever',
 

In [7]:
#REMOVING stop words from sentences

data = 'Google LLC is an American multinational technology company that specializes in Internet-related services and products'
tokens = word_tokenize(data) 
after = [w for w in tokens if not w in ENGLISH_STOP_WORDS] 
after = []

for w in tokens: 
    if w not in ENGLISH_STOP_WORDS: 
        after.append(w) 

In [8]:
after

['Google',
 'LLC',
 'American',
 'multinational',
 'technology',
 'company',
 'specializes',
 'Internet-related',
 'services',
 'products']

## SENTIMENT ANALYSIS USING SKLEARN

In [9]:
data = []
data_labels = []
with open("pos_tweets.txt") as f:
    for i in f: 
        data.append(i) 
        data_labels.append('pos')

with open("neg_tweets.txt",encoding="utf8") as f:
    for i in f: 
        data.append(i)
        data_labels.append('neg')

In [10]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = False,
)
features = vectorizer.fit_transform(
    data
)
features_nd = features.toarray() # for easy usage

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [12]:
X_train, X_test, y_train, y_test  = train_test_split(features_nd, data_labels, train_size=0.80, random_state=1234)

In [13]:
log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)

In [14]:
import random
j = random.randint(0,len(X_test)-7)
for i in range(j,j+7):
    print(y_pred[0])
    ind = features_nd.tolist().index(X_test[i].tolist())
    print(data[ind].strip())

neg
"@batistini21 lols you go for the cavs. orlando owned them today "
neg
"@rhonda_brown thanks for the tip on Sam's Club! "
neg
"ouchyyy booo  headache! going to beed right now. even when its 7:42 oouuuch"
neg
"SissyDawnie: @CokieTheCat - Marvelous on 10 cancer-free years!!!!! YAY!! *** Thanks! Yeah. We're all pretty psyched about that! "
neg
"Finally almost home "
neg
"....If i am going warsal that means no church in the morning if i dont go church but i think it imporant to see my nan coz she go cancer "
neg
â@RubyRose1 awww wish i could go! but its in sydney "


In [15]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.800498753117207


## TF-IDF Vectorization 

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
sent = ['Scikit Learn is one of the most advanced and powerful library of Machine Learning using Python. Scikit Learn gives us a practical access to a lot of Machine Learning Algorithms']

In [18]:
vectors = TfidfVectorizer()
Y = vectors.fit_transform(sent)
print(vectors.get_feature_names())

['access', 'advanced', 'algorithms', 'and', 'gives', 'is', 'learn', 'learning', 'library', 'lot', 'machine', 'most', 'of', 'one', 'powerful', 'practical', 'python', 'scikit', 'the', 'to', 'us', 'using']


In [19]:
print(Y.shape)

(1, 22)
