## Problem Statement

Sentiment analysis remains one of the key problems that has seen extensive application of natural language processing. This time around, given tweets from customers about various tech firms that manufacture and sell mobiles, computers, laptops, etc., the task is to identify whether the tweets express a negative sentiment towards such companies or products.

![](https://datahack-prod.s3.ap-south-1.amazonaws.com/__sized__/contest_cover/sentiments_1920x480-thumbnail-1200x1200-90.jpg)

Reference Link : https://datahack.analyticsvidhya.com/contest/linguipedia-codefest-natural-language-processing-1/#About

In [1]:
## Import necessary libraries.

import numpy as np ## Numpy library for creating and modifying arrays.
import pandas as pd ## Pandas library for reading '.csv' files as dataframes.
from nltk.tokenize import sent_tokenize, word_tokenize ## For sentence,word tokenizing.
import re ## For regular expressions.
import string ## For punctuations.
from nltk.corpus import stopwords ## For stop words
from nltk.stem.porter import PorterStemmer ## For getting root word.
from spellchecker import SpellChecker ## For checking spelling of a word.
from sklearn.model_selection import train_test_split ## For splitting data into train and validation.
## from sklearn.feature_extraction.text import TfidfTransformer ## Converting text into numeric() 
from sklearn.feature_extraction.text import TfidfVectorizer ## For converting text into tfidf vector(numeric array).
import os ## For connecting to local machine to set path for reading files.
from sklearn.naive_bayes import MultinomialNB ## For Naive bayes model.
from time import time ## To get the processing time value. 
from sklearn.preprocessing import LabelEncoder ## For encoding the labels.
from sklearn.metrics import accuracy_score ## For getting accuracy value.
from sklearn.metrics import confusion_matrix,classification_report ## For confusion matrix and TNR,TPR,accuracy.
from sklearn.model_selection import GridSearchCV ## For grid search
from sklearn.pipeline import Pipeline ## For pipe line(to execute stpes sequecntially).

In [2]:
## Get current working directory.
os.getcwd()

'D:\\Python\\Pratice\\Identify the Sentiments'

In [3]:
## Set working directory.
## Raw string literal: the backslashes of a Windows path must be taken literally.
## In a normal string "\D", "\P" and "\I" are invalid escape sequences, which
## newer Python versions warn about and will eventually reject.
os.chdir(r"D:\DataScience\Pratice\Identify the Sentiments")
os.getcwd()

'D:\\DataScience\\Pratice\\Identify the Sentiments'

In [4]:
## Load train and test data.
train = pd.read_csv('train.csv',header='infer',sep=',',encoding='latin-1')
test = pd.read_csv('test.csv',header='infer',sep=',',encoding='latin-1')

In [6]:
## Check dimensions of train and test data.
print(train.shape)
print(test.shape)

(7920, 3)
(1953, 2)


In [7]:
## Check first record of train data.
train.head(1)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...


In [8]:
## Check last record of train data.
train.tail(1)

Unnamed: 0,id,label,tweet
7919,7920,0,Apple Barcelona!!! #Apple #Store #BCN #Barcelo...


In [9]:
## Check first record of test data.
test.head(1)

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...


In [10]:
## Check last record of test data.
test.tail(1)

Unnamed: 0,id,tweet
1952,9873,Finally I got it .. thanx my father .. #Samsun...


In [11]:
## Get summary statistics of train data.
train.describe(include='all')

Unnamed: 0,id,label,tweet
count,7920.0,7920.0,7920
unique,,,7918
top,,,@architecture_3design - TAG YOUR FRIENDS @arch...
freq,,,3
mean,3960.5,0.255808,
std,2286.451399,0.436342,
min,1.0,0.0,
25%,1980.75,0.0,
50%,3960.5,0.0,
75%,5940.25,1.0,


In [12]:
## Get summary statistics of test data.
test.describe(include='all')

Unnamed: 0,id,tweet
count,1953.0,1953
unique,,1953
top,,Photo: a fake b!tch #thatstheshitidontlike #se...
freq,,1
mean,8897.0,
std,563.926857,
min,7921.0,
25%,8409.0,
50%,8897.0,
75%,9385.0,


In [13]:
## Get train data column names.
train.columns

Index(['id', 'label', 'tweet'], dtype='object')

In [14]:
## Get test data column names.
test.columns

Index(['id', 'tweet'], dtype='object')

In [15]:
## Get index range for train data.
train.index

RangeIndex(start=0, stop=7920, step=1)

In [16]:
## Get index range for test data.
test.index

RangeIndex(start=0, stop=1953, step=1)

In [17]:
## Get data types for train data columns.
train.dtypes

id        int64
label     int64
tweet    object
dtype: object

In [18]:
## Get data types for test data columns.
test.dtypes

id        int64
tweet    object
dtype: object

In [19]:
## Check null values for train data.
train.isna().sum()

id       0
label    0
tweet    0
dtype: int64

In [20]:
## Check null  values for test data.
test.isnull().sum()

id       0
tweet    0
dtype: int64

In [21]:
## Below method returns data types,levels,NA values,unique values for the given data frame.

def getStatistics(df):
    """Summarise each column of ``df`` in a single data frame.

    The result has one row per column of ``df`` and four columns:
    'dtypes' (column dtype), 'levels' (array of distinct values),
    'NA  Values' (count of missing values) and 'Unique Values'
    (number of distinct values as reported by ``DataFrame.nunique``).
    """
    column_types = df.dtypes
    distinct_values = [df[name].unique() for name in df.columns]
    missing_counts = df.isna().sum()
    distinct_counts = df.nunique()

    summary = {
        'dtypes': column_types,
        'levels': distinct_values,
        'NA  Values': missing_counts,
        'Unique Values': distinct_counts,
    }
    return pd.DataFrame(summary)

In [22]:
## Get data types,levels,NA values,unique values for train data.
getStatistics(train)

Unnamed: 0,dtypes,levels,NA Values,Unique Values
id,int64,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",0,7920
label,int64,"[0, 1]",0,2
tweet,object,[#fingerprint #Pregnancy Test https://goo.gl/h...,0,7918


In [23]:
## Get data types,levels,NA values,unique values for test data.
getStatistics(test)

Unnamed: 0,dtypes,levels,NA Values,Unique Values
id,int64,"[7921, 7922, 7923, 7924, 7925, 7926, 7927, 792...",0,1953
tweet,object,[I hate the new #iphone upgrade. Won't let me ...,0,1953


In [24]:
## Remove URL from text.
def remove_url(text):
    """Return ``text`` with every ``http://``, ``https://`` or ``www.`` link deleted.

    A link is matched up to the next whitespace character; the surrounding
    whitespace is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [25]:
## Remove html from text.
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag deleted (shortest match per tag)."""
    return re.sub(r'<.*?>', r'', text)

In [26]:
## Remove Emojis.
## Compiled once so the pattern is not rebuilt for every tweet.
## The previous single range U+24C2..U+1F251 spanned most of the Basic
## Multilingual Plane (CJK, Hangul, kana, ...), so ordinary non-English text
## was deleted together with the emoji. It is replaced by targeted ranges.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "\U00002702-\U000027B0"  # dingbats
                            "\U00002600-\U000026FF"  # miscellaneous symbols
                            "\U000024C2"             # circled latin capital letter M
                            "\U0001F200-\U0001F251"  # enclosed ideographic supplement
                            "]+", flags=re.UNICODE)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols deleted.

    Only the code-point ranges listed in ``_EMOJI_PATTERN`` are removed;
    letters of any script (including CJK) are kept.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [27]:
## Print punctuations from string class.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [28]:
## Remove punctuatons.
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed outright (not replaced by a space), so words
    separated only by punctuation are joined together.
    """
    ## Mapping a code point to None makes str.translate drop that character.
    deletion_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletion_table)
    

In [29]:
#a = "wow! this works. "
#re.findall("\w+",a)

In [30]:
## Define a list of punctuation marks.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

In [31]:
## Delete every character listed in `puncts` from the text (nothing is inserted in its place).
def remove_special_characters(text):
    """Return ``str(text)`` with every entry of the module-level ``puncts`` list deleted.

    The input is converted with ``str`` first, so non-string values are accepted.
    Matches are removed outright; no whitespace is inserted in their place.
    """
    cleaned = str(text)
    for mark in puncts:
        ## str.replace is a no-op when the mark is absent, so no membership test is needed.
        cleaned = cleaned.replace(mark, '')
    return cleaned

In [32]:
## Test remove_special_characters function.
remove_special_characters('"a"bcd "hfnj" fje')

'abcd hfnj fje'

In [34]:
## Word tokenization.
def word_tokenization(text):
    """Split ``text`` with ``word_tokenize`` and return the tokens lower-cased."""
    tokens = word_tokenize(text)
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

In [35]:
## Remove stop words. 
## Filled on first use so the stop-word list is loaded once, not once per tweet.
_STOPWORD_CACHE = {}


def remove_stopWords(text):
    """Return the lower-cased tokens of ``text`` that are not English stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        Lower-cased tokens, in their original order, without stop words.
    """
    if 'english' not in _STOPWORD_CACHE:
        ## frozenset gives constant-time membership tests instead of scanning a list.
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    sw = _STOPWORD_CACHE['english']
    ## Lower-case each token once, then keep those that are not stop words.
    lowered = (word.lower() for word in text)
    return [word for word in lowered if word not in sw]

In [36]:
## Stem/get root words for given text.
def stemProcess(text):
    """Return the list of stems produced by ``PorterStemmer.stem`` for each token in ``text``."""
    porter = PorterStemmer()
    stems = []
    for token in text:
        stems.append(porter.stem(token))
    return stems

In [37]:
## install python spell check library (If you want install again then remove # in below statement)
# !pip install pyspellchecker

## Correct the spelling for the given text.
## Holds the single SpellChecker instance; building one per tweet is expensive.
_SPELL_CHECKER_CACHE = []


def correct_spell(text):
    """Return the tokens of ``text`` with unknown words replaced by their suggested correction.

    Parameters
    ----------
    text : list of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        Same length and order as ``text``. A word reported by
        ``SpellChecker.unknown`` is replaced by ``SpellChecker.correction``;
        when no correction is available (``None``), the original word is kept
        so the result never contains ``None``.
    """
    if not _SPELL_CHECKER_CACHE:
        _SPELL_CHECKER_CACHE.append(SpellChecker())
    spell = _SPELL_CHECKER_CACHE[0]

    misspell_words = spell.unknown(text)
    correct_words = []
    for word in text:
        if word in misspell_words:
            suggestion = spell.correction(word)
            ## Keep the original token if the checker has no candidate.
            correct_words.append(word if suggestion is None else suggestion)
        else:
            correct_words.append(word)
    return correct_words

In [38]:
## Clean the Train tweets, in this order: strip URLs, strip HTML tags, delete the
## characters listed in `puncts`, tokenize into words, drop English stop words
## (tokens are lower-cased in that step) and stem each token.
## NOTE: remove_emoji and remove_punctuation are defined above but are NOT applied here.
## NOTE: after this cell train['tweet'] holds lists of tokens, so the cell cannot be
## re-run without reloading the data (the regex helpers expect strings).
train['tweet'] = train['tweet'].apply(remove_url)
train['tweet'] = train['tweet'].apply(remove_html)
train['tweet'] = train['tweet'].apply(remove_special_characters)
train['tweet'] = train['tweet'].apply(word_tokenize)
train['tweet'] = train['tweet'].apply(remove_stopWords)
train['tweet'] = train['tweet'].apply(stemProcess)
#train['tweet'] = train['tweet'].apply(correct_spell) ## checking spelling is taking too much time and accuracy wise, i didn't get much difference.

In [39]:
## Clean the Test tweets with exactly the same steps as the Train tweets: strip URLs,
## strip HTML tags, delete the characters listed in `puncts`, tokenize into words,
## drop English stop words (tokens are lower-cased in that step) and stem each token.
## NOTE: remove_emoji and remove_punctuation are defined above but are NOT applied here.
## NOTE: after this cell test['tweet'] holds lists of tokens, so the cell cannot be
## re-run without reloading the data (the regex helpers expect strings).
test['tweet'] = test['tweet'].apply(remove_url)
test['tweet'] = test['tweet'].apply(remove_html)
test['tweet'] = test['tweet'].apply(remove_special_characters)
test['tweet'] = test['tweet'].apply(word_tokenize)
test['tweet'] = test['tweet'].apply(remove_stopWords)
test['tweet'] = test['tweet'].apply(stemProcess)
#test['tweet'] = test['tweet'].apply(correct_spell)

In [40]:
## Get train data column names.
train.columns

Index(['id', 'label', 'tweet'], dtype='object')

In [41]:
## Set Index to train and test data.
train.set_index('id',inplace=True)
test.set_index('id',inplace=True)

In [42]:
## Get first record of train data.
train.head(1)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,"[fingerprint, pregnanc, test, android, app, be..."


In [43]:
## Get first record of test data.
test.head(1)

Unnamed: 0_level_0,tweet
id,Unnamed: 1_level_1
7921,"[hate, new, iphon, upgrad, wont, let, download..."


In [44]:
## Split data into train and validation (80:20 format).

## Hold out 20% of the labelled rows for validation; the fixed random_state keeps
## the split reproducible. Both inputs stay single-column data frames indexed by id.
(train_text, valid_text,
 train_target, valid_target) = train_test_split(
    train.drop('label', axis=1),   ## features: everything except the label column
    train.drop('tweet', axis=1),   ## target: everything except the tweet column
    test_size=0.2,
    random_state=1234,
)

In [45]:
## Get first record of train data.
train_text.head(1)

Unnamed: 0_level_0,tweet
id,Unnamed: 1_level_1
7319,"[ios8, brake, phone, appl, wont, except, seria..."


In [47]:
## Check dimesions of train data.
train_text.shape

(6336, 1)

In [48]:
## Check first record of tran traget data.
train_target.head(1)

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
7319,1


In [49]:
## Check first record of validation data.
valid_text.head(1)

Unnamed: 0_level_0,tweet
id,Unnamed: 1_level_1
3306,"[moment, go, contact, isnt, number, iphon, updat]"


In [50]:
## Check first record of validation target data.
valid_target.head(1)

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
3306,1


In [51]:
## Convert list into string.
def convertListToString(temp):
    """Join the items of ``temp`` into one string separated by single spaces."""
    pieces = [item for item in temp]
    return " ".join(pieces)

In [52]:
## Check first 2 records of tweet column of train data.
train_text['tweet'].head(2)

id
7319    [ios8, brake, phone, appl, wont, except, seria...
1925    [chorizo, appl, sausag, roll, made, nice, past...
Name: tweet, dtype: object

In [53]:
## Convert train tweet column data from list to string.
train_text['tweet'] = train_text['tweet'].apply(convertListToString)

In [54]:
## Convert validation tweet column data from list to string.
valid_text['tweet'] = valid_text['tweet'].apply(convertListToString)

In [55]:
## Convert test tweet column data from list to string.
test['tweet'] = test['tweet'].apply(convertListToString)

In [57]:
## Instantiate TfidfVectorizer() with 1,2 grams.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
#max_df=0.95, min_df=2, stop_words='english' #USE HELP TO SEE WHAT EACH DOES)

## Get the start time.
t0 = time()

## Learn the vocabulary and idf weights from the train tweets only (fit_transform),
## then reuse that fitted vectorizer to convert the validation and test tweets (transform).
train_data  = tfidf_vectorizer.fit_transform(train_text['tweet'])
validation_data = tfidf_vectorizer.transform(valid_text['tweet'])
test_data =tfidf_vectorizer.transform(test['tweet'])

## Print the elapsed time of the vectorization step.
print("done in %0.3fs." % (time() - t0))

done in 0.782s.


In [None]:
## Get vocabulary and idf values for tfidf vector.
print(tfidf_vectorizer.vocabulary_)
print(tfidf_vectorizer.idf_)

In [59]:
## Instantiate label encoder.
le_label = LabelEncoder()

In [60]:
## Do label encoding for train target column data.
## train_target is a single-column data frame; np.ravel flattens it to the 1-D array
## LabelEncoder expects, which avoids sklearn's column_or_1d conversion warning.
train_target = le_label.fit_transform(np.ravel(train_target))

  y = column_or_1d(y, warn=True)


In [61]:
## Do label encoding for validation target column data.
## valid_target is a single-column data frame; np.ravel flattens it to the 1-D array
## LabelEncoder expects, which avoids sklearn's column_or_1d conversion warning.
valid_target = le_label.transform(np.ravel(valid_target))

  y = column_or_1d(y, warn=True)


In [62]:
## Print dimesnions of train features,target columns data.
print(train_data.shape)
print(train_target.shape)

(6336, 60941)
(6336,)


In [91]:
## Instantiate Naive bayes and fit a model.
naive_bayes = MultinomialNB()
naive_bayes.fit(train_data, train_target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
## Get prediction on train and validation data.
predict_train = naive_bayes.predict(train_data)
predict_validation = naive_bayes.predict(validation_data)

In [66]:
## Display accuracy value for train data.
print("Train Accuracy :",accuracy_score(train_target,predict_train))

Train Accuracy : 0.9455492424242424


In [67]:
## Display  accuracy value for validation data.
print("Validation Accuracy :",accuracy_score(valid_target,predict_validation))

Validation Accuracy : 0.8125


In [69]:
## Get confusion matrix for train data and display it.
confusion_matrix_train = confusion_matrix(train_target,predict_train)
print(confusion_matrix_train)

[[4704   10]
 [ 335 1287]]


In [70]:
## Get confusion matrix for validation data and display it.
confusion_matrix_validation = confusion_matrix(valid_target, predict_validation)
print(confusion_matrix_validation)

[[1163   17]
 [ 280  124]]


In [71]:
## Derive accuracy, true-negative rate and true-positive rate from the 2x2 train
## confusion matrix (row = actual class, column = predicted class).
Accuracy_Train = (
    (confusion_matrix_train[0, 0] + confusion_matrix_train[1, 1])
    / confusion_matrix_train[:2, :2].sum()
)
TNR_Train = confusion_matrix_train[0, 0] / confusion_matrix_train[0, :2].sum()
TPR_Train = confusion_matrix_train[1, 1] / confusion_matrix_train[1, :2].sum()

## end="\n\n\n" reproduces the line break plus the two blank lines between metrics.
print("Train TNR: ", TNR_Train, end="\n\n\n")
print("Train TPR: ", TPR_Train, end="\n\n\n")
print("Train Accuracy: ", Accuracy_Train)

Train TNR:  0.9978786593126856


Train TPR:  0.7934648581997534


Train Accuracy:  0.9455492424242424


In [72]:
## Derive accuracy, true-negative rate and true-positive rate from the 2x2 validation
## confusion matrix (row = actual class, column = predicted class).
## The *_Test names are kept as-is although the figures describe the validation split.
Accuracy_Test = (
    (confusion_matrix_validation[0, 0] + confusion_matrix_validation[1, 1])
    / confusion_matrix_validation[:2, :2].sum()
)
TNR_Test = confusion_matrix_validation[0, 0] / confusion_matrix_validation[0, :2].sum()
TPR_Test = confusion_matrix_validation[1, 1] / confusion_matrix_validation[1, :2].sum()

## end="\n\n\n" reproduces the line break plus the two blank lines between metrics.
print("Validation TNR: ", TNR_Test, end="\n\n\n")
print("Validation TPR: ", TPR_Test, end="\n\n\n")
print("Validation Accuracy: ", Accuracy_Test)

Validation TNR:  0.985593220338983


Validation TPR:  0.3069306930693069


Validation Accuracy:  0.8125


In [75]:
## Copy test data ino temp.
temp = test.copy()

In [76]:
## Get the predictions on test data.
## The model was fitted on TF-IDF features, so predict on the vectorized test
## matrix (test_data). `temp` is a copy of the raw frame that still holds tweet
## strings; it is only used afterwards to carry the id and predicted label.
y_pred = naive_bayes.predict(test_data)

In [77]:
## Get original values of preidctions.
temp['label'] = le_label.inverse_transform(y_pred)

In [78]:
## Reset index.
temp.reset_index(inplace=True)

In [79]:
## Copy id,label columns from temp to to_submit.
to_submit = temp[['id', 'label']]

In [80]:
## Get the value count for label column.
to_submit.label.value_counts()

0    1768
1     185
Name: label, dtype: int64

In [81]:
## Print dimensions of to_submit and test data.
print(to_submit.shape)
print(test.shape)

(1953, 2)
(1953, 1)


In [82]:
## store to_submit into csv file with name NaiveBayes.
to_submit.to_csv('NaiveBayes.csv',index = False)

In [None]:
## with Grid Search Naive Bayes Model.

In [113]:
## Prepare parameters dictionary for grid search. 
param_grid = [{'vect__ngram_range': [(1, 1), (1, 2)],
               'vect__use_idf': [True, False],
               'vect__norm': ['l1', 'l2'],
               'nb_clf__alpha': [1, 1e-1]}]

In [114]:
## Add tifidf vectorizer,naive bayes models to Pipeline.
nb_tfidf = Pipeline([('vect', tfidf_vectorizer), 
                     ('nb_clf', MultinomialNB())])

In [115]:
## Build grid search model: every parameter combination in param_grid is
## evaluated on the pipeline with cross-validation.
gs_nb_tfidf = GridSearchCV(nb_tfidf,  ## Pipeline to execute tfidf,naive bayes.
                           param_grid, ## Parameters dictionary.
                           scoring='accuracy', ## Metric used to rank the candidates.
                           cv=5,  ### Number of cross-validation folds.
                           verbose=2, ## Log every fit while searching.
                           n_jobs=1) ## Run the fits sequentially in one process (-1 would use all processors).

In [116]:
## Fit a model.
gs_nb_tfidf.fit(train_text['tweet'], train_target)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
[CV] nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True 
[CV]  nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True, total=   0.2s
[CV] nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True, total=   0.2s
[CV] nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True 
[CV]  nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True, total=   0.2s
[CV] nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True 
[CV]  nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True, total=   0.2s
[CV] nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True 
[CV]  nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True, total=   0.2s
[CV] nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True 
[CV]  nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True, total=   0.2s
[CV] nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True 
[CV]  nb_clf__alpha=1, vect__ngram_range=(1, 1), vect__norm=l1, vect__use_idf=True, total=   0.

[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  2.1min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                                        n

In [117]:
## Display best parameters.
print('Best parameter set: %s ' % gs_nb_tfidf.best_params_)

Best parameter set: {'nb_clf__alpha': 0.1, 'vect__ngram_range': (1, 2), 'vect__norm': 'l2', 'vect__use_idf': False} 


In [118]:
## Display the mean cross-validated accuracy of the best parameter set.
print('CV Accuracy: %.3f' % gs_nb_tfidf.best_score_)

CV Accuracy: 0.894


In [None]:
## Get the best estimator (the pipeline configured with the best parameter set).
nb_clf = gs_nb_tfidf.best_estimator_

In [119]:
## Get and display accuracy for validation data.
print('Validation Accuracy: %.3f' % nb_clf.score(valid_text['tweet'], valid_target))

Validation Accuracy: 0.885


In [120]:
## Get predictions for test data.
test_predictions = gs_nb_tfidf.predict(test['tweet'])

In [121]:
## Display test prediction values.
test_predictions

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [123]:
## Get original values of test predictions.
test['label'] = le_label.inverse_transform(test_predictions)

In [124]:
## Reset index for test data.
test.reset_index(inplace=True)

In [125]:
## Copy id,label columns data from test to to_submit.
to_submit = test[['id', 'label']]

In [126]:
## Get value counts for label column.
to_submit.label.value_counts()

0    1451
1     502
Name: label, dtype: int64

In [127]:
## Display to_submit,test dimesnions.
print(to_submit.shape)
print(test.shape)

(1953, 2)
(1953, 1)


In [128]:
## Store to_submit into csv file with name NaiveBayes_GridSearch. 
to_submit.to_csv('NaiveBayes_GridSearch.csv',index = False)