# A beginners guide to sentiment Analysis

### This notebook contains binary classification of of text data by sentiment analysis. The data can be found [here](https://www.kaggle.com/c/nlp-getting-started/data).
### The idea behind this notebook is very simple, does not use fancy and complex models, cleans the data, encodes it in one hot vectors and trains a logistic regression model on it.


## The aim of this notebook is to predict if a tweet is of a fake disaster or a distress signal of a real disaster.

In [1]:
import pandas as pd 

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df['target'].value_counts()


0    4342
1    3271
Name: target, dtype: int64

### The data is fairly balanced with 4342 examples of a fake disaster and 3271 examples of a real disaster

In [5]:
df.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
df.fillna('',inplace=True)

# Removing punctuation

In [7]:
#library that contains punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree


In [9]:
df['text']= df['text'].apply(lambda x:remove_punctuation(x))

# Lowering the case

In [10]:
df['text']= df['text'].apply(lambda x: x.lower())

# Tokenizing 

In [11]:

def tokenize(string):
    '''
    Tokenizes the string to a list of words
    '''
    tokens = string.split()
    return tokens


In [12]:
df['text']= df['text'].apply(lambda x: tokenize(x))

In [13]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,"[our, deeds, are, the, reason, of, this, earth...",1
1,4,,,"[forest, fire, near, la, ronge, sask, canada]",1
2,5,,,"[all, residents, asked, to, shelter, in, place...",1
3,6,,,"[13000, people, receive, wildfires, evacuation...",1
4,7,,,"[just, got, sent, this, photo, from, ruby, ala...",1


In [14]:
df['keyword']= df['keyword'].apply(lambda x: tokenize(x))

In [15]:
df.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,[],,"[two, giant, cranes, holding, a, bridge, colla...",1
7609,10870,[],,"[ariaahrary, thetawniest, the, out, of, contro...",1
7610,10871,[],,"[m194, 0104, utc5km, s, of, volcano, hawaii, h...",1
7611,10872,[],,"[police, investigating, after, an, ebike, coll...",1
7612,10873,[],,"[the, latest, more, homes, razed, by, northern...",1


In [16]:
df.drop(columns=['id'],inplace=True)

In [17]:
df.head()

Unnamed: 0,keyword,location,text,target
0,[],,"[our, deeds, are, the, reason, of, this, earth...",1
1,[],,"[forest, fire, near, la, ronge, sask, canada]",1
2,[],,"[all, residents, asked, to, shelter, in, place...",1
3,[],,"[13000, people, receive, wildfires, evacuation...",1
4,[],,"[just, got, sent, this, photo, from, ruby, ala...",1


# Removing stop words

In [18]:
#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

In [19]:
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [20]:
df['text']= df['text'].apply(lambda x:remove_stopwords(x))

# Stemming

In [21]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

In [22]:
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text
df['text']=df['text'].apply(lambda x: stemming(x))

In [23]:
df['keyword']=df['keyword'].apply(lambda x: stemming(x))

# Lemmatization

In [108]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [109]:
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [26]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Manas
[nltk_data]     Vardhan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [110]:
df['text']=df['text'].apply(lambda x: lemmatizer(x))

# Vectorizing the data

### Building the vocabulary
### In another approach, we could have used the vocabulary, but in this approach, we do not use the vocabulary so if you want to reproduce this code, you can safely omit this section

In [111]:
vocab = []

'''
We add all the lists of tokenized strings to make one large list of words

Note ['a','b'] + ['c'] = ['a','b','c']

'''

for i in df['text'].values:
    vocab = vocab + i

print(len(vocab))


76378


In [112]:
# We make a set of the vocab words to remove multiple occurences of a same word, implying only unique words stay in set.

set_vocab = set(vocab)
vocab = list(set_vocab)
# we convert that set back to a list
print(len(vocab),type(vocab))

19511 <class 'list'>


## Vectorizing the text data

In [30]:
## Converting the tokens back to strings to feed it into Count Vectorizer

df['text_strings'] = df['text'].apply(lambda x: ' '.join([str(elem) for elem in x]))

In [31]:
df['text_strings'].head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3    13000 peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text_strings, dtype: object

# Obtaining word-embeddings using Word2Vec

In [113]:
from gensim.models import Word2Vec

word2vec = Word2Vec(df['text'], min_count=1)

# print(corpus)

In [114]:
words = word2vec.wv.index_to_key
print(len(words))

19511


In [115]:
# word2vec.wv['smoke']

## Averaging out embeddings for the sentence

In [170]:
import numpy as np
def embed(tokenized):
    emb = []
    for i in tokenized:
        emb.append(word2vec.wv[i])
    return emb
    
    

In [171]:
df['embedded'] = df['text'].apply(embed)

# Obtaining x_train and y_train

In [182]:
# x_train = X.toarray()
x_train = df['embedded']
## The text is now vectorized

In [184]:
import numpy as np

In [185]:
# x = []

# for i in x_train:
#     x.append(i)

In [186]:
y_train = df['target']

In [187]:
y_train.shape

(7613,)

# Fitting a model

In [188]:
from sklearn.linear_model import LogisticRegression

In [189]:
clf = LogisticRegression(random_state=42)

In [190]:
clf.fit(x_train,y_train)

ValueError: setting an array element with a sequence.

In [147]:
pred = clf.predict(x_train)

In [148]:
from sklearn.metrics import accuracy_score

In [149]:
accuracy_score(y_train, pred)

0.5811112570602917

# Testing the model on test set

## Preprocessing the test set

In [104]:
df_test =pd.read_csv('test.csv')

In [105]:
df_test.fillna('',inplace=True)

In [107]:
df_test.drop(columns=['id','keyword','location'],inplace=True)

In [108]:
df_test['text']= df_test['text'].apply(lambda x:remove_punctuation(x))
df_test['text']= df_test['text'].apply(lambda x: tokenize(x))
df_test['text']= df_test['text'].apply(lambda x:remove_stopwords(x))
df_test['text']= df_test['text'].apply(lambda x: stemming(x))

In [109]:
df_test['text_strings'] = df_test['text'].apply(lambda x: ' '.join([str(elem) for elem in x]))

In [110]:
x_test = vectorizer.transform(df_test['text_strings'])

In [111]:
x_test = x_test.toarray()

In [112]:
x_test = np.array(x_test)

In [114]:
y_test_pred = clf.predict(x_test)

In [116]:
y_test_pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

# Putting the predictions to test csv

In [117]:
import pandas as pd

In [118]:
submission = pd.read_csv('test.csv')

In [121]:
submission['target'] = y_test_pred

In [122]:
submission.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",1
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,1
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [124]:
final_submission = submission[['id','target']]

In [125]:
final_submission.to_csv('final_submission.csv')

# Final Comments

### Having created this notebook from scratch, starting afresh in the field of NLP,it would be invaluable to have you comment on this notebook to tell me what could have I done better, what I did right and what must never be done. Do comment ;)