## Pipeline 

* Load Dataset 
* Clean the Dataset
* Text Pre-Processing
* Text Vectorization
* Build and Train Model
* Plot Classification Report

### Import Library

In [1]:
# Import Library

import re
import nltk
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

### Load Dataset

In [2]:
# Load Dataset

def load_dataset(file_name):
    
    df = pd.read_csv(file_name, encoding= 'unicode_escape')
    
    df = df[["text", "sentiment"]]
    df.drop_duplicates(inplace = True)
    df.dropna(inplace = True)
    
    return df

train_df = load_dataset("Dataset/Ex 2/train.csv")
test_df = load_dataset("Dataset/Ex 2/test.csv")

print("Train Shape :", train_df.shape)
print("Test Shape  :", test_df.shape)

train_df.head()

Train Shape : (27480, 2)
Test Shape  : (3534, 2)


Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [3]:
# Y Label

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
train_y = encoder.fit_transform(train_df["sentiment"])
test_y = encoder.transform(test_df["sentiment"])

# train_y = np.array(pd.get_dummies(train_df["sentiment"])[["positive", "neutral", "negative"]])
# test_y  = np.array(pd.get_dummies(test_df["sentiment"])[["positive", "neutral", "negative"]])

### Text - Preprocessing

In [4]:
# Tweet Preprocessing

def pre_processing(tweet: str):
    
    # Remove Leading Blank Spaces
    tweet = tweet.strip()
    
    # Lower Case
    tweet = tweet.lower()
    
    # Remove URLS 
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    tweet = re.sub(url_pattern, "", tweet)
    
    # Remove UserName
    username_pattern = re.compile(r"@\w+")
    tweet = re.sub(username_pattern, "", tweet)
    
    # Remove Hashtags
    hashtag_pattern = re.compile(r"#\w+")
    tweet = re.sub(hashtag_pattern, "", tweet)
    
    # Character normalization // todaaaaay -> today
    tweet = re.sub(r"([a-zA-Z])\1{2,}", r'\1', tweet)
    
    # Remove Special Characters
    tweet = re.sub(r'[^a-zA-Z\s]', "", tweet)
    
    # Word Tokenizer
    tweet = nltk.word_tokenize(tweet)
    
#     # Remove Stop Words 
#     stop_words = set([re.sub(r'[^a-zA-Z\s]', "", word) for word in nltk.corpus.stopwords.words("english")])
#     tweet = [word for word in tweet if word not in stop_words]
    
    # lemmatization
    def get_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"N": "n", "V": "v", "R": "r", "J": "a"}
        return tag_dict.get(tag, "n")
    
    lemma = nltk.stem.WordNetLemmatizer()
    tweet = [lemma.lemmatize(word, pos=get_pos(word)) for word in tweet]
    
    return tweet

train_df["pre-tweet"] = train_df["text"].apply(pre_processing)
test_df["pre-tweet"] = test_df["text"].apply(pre_processing)

train_df["documents"] = train_df["pre-tweet"].apply(lambda x : " ".join(x))
test_df["documents"] = test_df["pre-tweet"].apply(lambda x : " ".join(x))

pre_processing("I loveeeee NLP, @rahul_appu, www.rahul_appu.com, #NLP ")

['i', 'love', 'nlp']

### Vocab

In [5]:
vocab = set()

for words in train_df["pre-tweet"]:
    for word in words:
        vocab.add(word)
        
print("Vocab Size :", len(vocab))

Vocab Size : 22037


### Vectorization

#### Bag of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

bag_of_words = CountVectorizer()

train_bow = bag_of_words.fit_transform(train_df["documents"])
test_bow = bag_of_words.transform(test_df["documents"])

In [7]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(train_bow, train_y)


from sklearn.metrics import classification_report, accuracy_score

predict = model.predict(test_bow)
print("Accuracy Score :", accuracy_score(test_y, predict), end='\n\n')
print(classification_report(y_true = test_y, y_pred = predict))

Accuracy Score : 0.7023203169213356

              precision    recall  f1-score   support

           0       0.71      0.64      0.67      1001
           1       0.64      0.73      0.68      1430
           2       0.79      0.72      0.75      1103

    accuracy                           0.70      3534
   macro avg       0.71      0.70      0.70      3534
weighted avg       0.71      0.70      0.70      3534



#### TF - IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()

train_idf = tf_idf.fit_transform(train_df["documents"])
test_idf = tf_idf.transform(test_df["documents"])

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(train_idf, train_y)

from sklearn.metrics import classification_report, accuracy_score

predict = model.predict(test_idf)
print("Accuracy Score :", accuracy_score(test_y, predict), end='\n\n')
print(classification_report(y_true = test_y, y_pred = predict))

Accuracy Score : 0.7105263157894737

              precision    recall  f1-score   support

           0       0.73      0.65      0.69      1001
           1       0.64      0.75      0.69      1430
           2       0.81      0.71      0.76      1103

    accuracy                           0.71      3534
   macro avg       0.73      0.70      0.71      3534
weighted avg       0.72      0.71      0.71      3534



#### Continuous Bag of Words

In [10]:
from gensim.models import Word2Vec

g_model = Word2Vec(sentences=train_df["pre-tweet"], vector_size=200, window=5, workers=5, epochs=500)

In [11]:
def in_vocab(word_l):
    for word in word_l:
        if word not in g_model.wv:
            return False
    else:
        return True

train_vec = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in train_df["pre-tweet"]]
test_vec  = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in test_df["pre-tweet"]]

In [12]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(train_vec, train_y)

from sklearn.metrics import classification_report, accuracy_score

predict = model.predict(test_vec)
print("Accuracy Score :", accuracy_score(test_y, predict), end='\n\n')
print(classification_report(y_true = test_y, y_pred = predict))

Accuracy Score : 0.5195246179966044

              precision    recall  f1-score   support

           0       0.65      0.29      0.40      1001
           1       0.46      0.86      0.60      1430
           2       0.73      0.29      0.42      1103

    accuracy                           0.52      3534
   macro avg       0.62      0.48      0.47      3534
weighted avg       0.60      0.52      0.49      3534



#### Skip gram

In [13]:
from gensim.models import Word2Vec

g_model = Word2Vec(sentences=train_df["pre-tweet"], vector_size=200, window=5, workers=5, sg=1, epochs=500)

In [14]:
def in_vocab(word_l):
    for word in word_l:
        if word not in g_model.wv:
            return False
    else:
        return True

train_vec = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in train_df["pre-tweet"]]
test_vec  = [g_model.wv[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((200)) for x in test_df["pre-tweet"]]

In [15]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(train_vec, train_y)

from sklearn.metrics import classification_report, accuracy_score

predict = model.predict(test_vec)
print("Accuracy Score :", accuracy_score(test_y, predict), end='\n\n')
print(classification_report(y_true = test_y, y_pred = predict))

Accuracy Score : 0.5172608941709111

              precision    recall  f1-score   support

           0       0.67      0.29      0.40      1001
           1       0.46      0.86      0.60      1430
           2       0.71      0.29      0.41      1103

    accuracy                           0.52      3534
   macro avg       0.61      0.48      0.47      3534
weighted avg       0.60      0.52      0.48      3534



#### Word2Vec - Twitter Glove

In [16]:
import gensim.downloader as api

model = api.load("glove-twitter-200")

shape_n = 200

def in_vocab(word_l):
    for word in word_l:
        if word not in model:
            return False
    else:
        return True

train_vec = [model[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((shape_n)) for x in train_df["pre-tweet"]]
test_vec  = [model[x].sum(axis = 0) if len(x) and in_vocab(x) else np.zeros((shape_n)) for x in test_df["pre-tweet"]]

In [17]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(train_vec, train_y)

from sklearn.metrics import classification_report, accuracy_score

predict = model.predict(test_vec)
print("Accuracy Score :", accuracy_score(test_y, predict), end='\n\n')
print(classification_report(y_true = test_y, y_pred = predict))

Accuracy Score : 0.642331635540464

              precision    recall  f1-score   support

           0       0.72      0.56      0.63      1001
           1       0.57      0.73      0.64      1430
           2       0.73      0.60      0.66      1103

    accuracy                           0.64      3534
   macro avg       0.67      0.63      0.64      3534
weighted avg       0.66      0.64      0.64      3534



### Classification : TF - IDF

In [18]:
text = """What is not to like about this product.
Not bad.
Not an issue.
Not buggy.
Not happy.
Not user-friendly.
Not good.
Is it any good?
I do not dislike horror movies. 
Disliking horror movies is not uncommon. 
Sometimes I really hate the show. 
I love having to wait two months for the next series to come out! 
The final episode was surprising with a terrible twist at the end.
The film was easy to watch but I would not recommend it to my friends. 
I LOL’d at the end of the cake scene."""

input_text = text.split("\n")
input_text = [" ".join(pre_processing(string)) for string in input_text]

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer()

train_idf = tf_idf.fit_transform(train_df["documents"])
pred_idf = tf_idf.transform(input_text)

In [20]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(train_idf, train_y)

predict = model.predict(pred_idf)
predict = encoder.inverse_transform(predict)

In [21]:
for index, text in enumerate(text.split("\n")):
    print(text, " : ", predict[index])

What is not to like about this product.  :  negative
Not bad.  :  negative
Not an issue.  :  negative
Not buggy.  :  neutral
Not happy.  :  positive
Not user-friendly.  :  negative
Not good.  :  positive
Is it any good?  :  positive
I do not dislike horror movies.   :  negative
Disliking horror movies is not uncommon.   :  negative
Sometimes I really hate the show.   :  negative
I love having to wait two months for the next series to come out!   :  positive
The final episode was surprising with a terrible twist at the end.  :  neutral
The film was easy to watch but I would not recommend it to my friends.   :  neutral
I LOL’d at the end of the cake scene.  :  neutral
