# IMPLEMENTATION OF BASIC CONCEPTS OF NATURAL LANGUAGE PROCESSING USING SKLEARN

1. Importing Datasets

In [10]:
from sklearn.datasets import fetch_openml

In [11]:
# Fetch version 4 of the "miceprotein" dataset from OpenML through scikit-learn.
mice = fetch_openml(
    name='miceprotein',
    version=4,
)

In [13]:
# Inspect the target (label) values of the loaded dataset.
mice.target

array(['c-CS-m', 'c-CS-m', 'c-CS-m', ..., 't-SC-s', 't-SC-s', 't-SC-s'],
      dtype=object)

2. Tokenization

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Two short example documents to tokenize.
sentence = ['sklearn is a library of machine learning',
           'This shows tokenization using sklearn']

# CountVectorizer learns the vocabulary and builds a document-term count matrix.
vector = CountVectorizer()
X = vector.fit_transform(sentence)

# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# is its replacement. It returns a NumPy array, so convert it to a plain list
# to keep the printed form a regular Python list of strings.
print(vector.get_feature_names_out().tolist())

['is', 'learning', 'library', 'machine', 'of', 'shows', 'sklearn', 'this', 'tokenization', 'using']


In [16]:
 # Dense view of the count matrix: one row per sentence, one column per vocabulary term.
 print(X.toarray())

[[1 1 1 1 1 0 1 0 0 0]
 [0 0 0 0 0 1 1 1 1 1]]


3. Preprocessing

In [17]:
from nltk.corpus import stopwords
# The old `sklearn.feature_extraction.stop_words` module no longer exists;
# the stop-word list is exposed publicly from `sklearn.feature_extraction.text`.
# Import it under an alias so the merged set below does not rebind the imported name.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as SKLEARN_STOP_WORDS

# Union of NLTK's English stop words and scikit-learn's built-in list.
# NOTE(review): `stopwords.words('english')` needs the NLTK "stopwords" corpus
# to be available locally -- confirm it has been downloaded.
ENGLISH_STOP_WORDS = set(stopwords.words('english')).union(set(SKLEARN_STOP_WORDS))

In [20]:
# Display the merged stop-word set (NLTK plus scikit-learn entries).
ENGLISH_STOP_WORDS

{'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'ain',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'aren',
 "aren't",
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'bill',
 'both',
 'bottom',
 'but',
 'by',
 'call',
 'can',
 'cannot',
 'cant',
 'co',
 'con',
 'could',
 'couldn',
 "couldn't",
 'couldnt',
 'cry',
 'd',
 'de',
 'describe',
 'detail',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'done',
 'down',
 'due',
 'during',
 'each',
 'eg',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'etc',
 'even',
 'ever',
 

In [22]:
#REMOVING stop words from sentences

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

data = 'Google LLC is an American multinational technology company that specializes in Internet-related services and products'

# Split the sentence into word tokens.
tokens = word_tokenize(data)

# Keep only tokens that are not stop words. The result used to be computed
# twice (a comprehension whose value was discarded, then an equivalent loop);
# a single pass is enough. The membership test is case-sensitive, as before.
after = [w for w in tokens if w not in ENGLISH_STOP_WORDS]
        



In [23]:
after

['Google',
 'LLC',
 'American',
 'multinational',
 'technology',
 'company',
 'specializes',
 'Internet-related',
 'services',
 'products']


4. Sentiment Analysis Using sklearn

In [38]:
from sklearn.feature_extraction.text import CountVectorizer


In [41]:
# Raw tweet lines and their sentiment labels, kept index-aligned:
# data[k] is the text and data_labels[k] is 'pos' or 'neg'.
data = []
data_labels = []

# (file path, label) pairs; both files are read the same way.
tweet_sources = (
    ("P:/NLP/pos_tweets.txt", 'pos'),
    ("P:/NLP/neg_tweets.txt", 'neg'),
)

for path, label in tweet_sources:
    # Decode both files explicitly as UTF-8. The positive file previously
    # relied on the platform's default encoding, which differs between
    # machines and is inconsistent with how the negative file is read.
    with open(path, encoding="utf8") as f:
        for i in f:
            data.append(i)
            data_labels.append(label)

In [42]:
# Word-level bag-of-words vectorizer that keeps the original letter casing.
vectorizer = CountVectorizer(analyzer='word', lowercase=False)

# Learn the vocabulary from `data` and count token occurrences per entry.
features = vectorizer.fit_transform(data)

# Dense array copy of the count matrix, for easy usage in later cells.
features_nd = features.toarray()

In [43]:
from sklearn.model_selection import train_test_split

# 80% of the samples go to training; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    features_nd, data_labels, train_size=0.80, random_state=1234
)
X_train, X_test, y_train, y_test = split_parts

In [44]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier created with scikit-learn's default settings.
log_model = LogisticRegression()

In [45]:
# Train the classifier on the training split; fit() returns the fitted estimator.
log_model = log_model.fit(X_train, y_train)



In [46]:
# Predicted sentiment label for every row of the test split.
y_pred = log_model.predict(X=X_test)

In [47]:
import random

# Convert the full feature matrix to nested lists once, instead of on every
# loop iteration; it is only used to look up rows below.
feature_rows = features_nd.tolist()

# Pick a random window of 7 consecutive test samples.
j = random.randint(0,len(X_test)-7)
for i in range(j,j+7):
    # Print the prediction for sample i (this used to print y_pred[0] every time,
    # so every displayed tweet carried the first sample's label).
    print(y_pred[i])
    # Recover the source text by finding this feature row in the full matrix.
    # NOTE(review): if two entries have identical feature vectors, index()
    # returns the first match, which may be a different entry than sample i.
    ind = feature_rows.index(X_test[i].tolist())
    print(data[ind].strip())

neg
"@Akivafever  Sorry, little brother."
neg
"@Steve_Buscemi the weather in Canada is freezing "
neg
"@MusicSnob75 I want starbucks and have no time to stop before work "
neg
"My little man has the fever virus   "
neg
"can't go on facebook because, for the first time in 5 1/2 years, someone change my password!!!!! Oh well... I'll explore other apps now "
neg
"Put vacation photos online a few yrs ago. PC crashed, and now I forget the name of the site. "
neg
"right about now i wish the psp had 2 shoulder buttons "


In [48]:
from sklearn.metrics import accuracy_score

# Share of test samples whose predicted label equals the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

0.800498753117207


5. TF-IDF Vectorization

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
# Single example document (two sentences) used for the TF-IDF demonstration.
sent = [
    'Scikit Learn is one of the most advanced and powerful library '
    'of Machine Learning using Python. '
    'Scikit Learn gives us a practical access to a lot of Machine Learning Algorithms'
]

In [58]:
# TF-IDF vectorizer: learns the vocabulary and produces TF-IDF weighted features.
vectors = TfidfVectorizer()
Y = vectors.fit_transform(sent)

# `get_feature_names()` was removed from scikit-learn; `get_feature_names_out()`
# is its replacement. It returns a NumPy array, so convert it to a plain list
# to keep the printed form a regular Python list of strings.
print(vectors.get_feature_names_out().tolist())

['access', 'advanced', 'algorithms', 'and', 'gives', 'is', 'learn', 'learning', 'library', 'lot', 'machine', 'most', 'of', 'one', 'powerful', 'practical', 'python', 'scikit', 'the', 'to', 'us', 'using']


In [59]:
# Shape of the TF-IDF matrix: (number of documents, number of vocabulary terms).
print(Y.shape)

(1, 22)
