# Natural Language Processing Workshop Ex 3

## Building a Model and Evaluating a Model

In [1]:
############### IMPORTS ###############
# Import the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# NLTK Modules
import re
from nltk.tokenize import TweetTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from nltk import pos_tag

from wordcloud import WordCloud

#training Modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import MultinomialNB

#Evaluation Modules
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
############### LOADING DATASET ###############
#reading the dataset files
train_df = pd.read_json("data/training_set.json")
test_df = pd.read_json("data/test_set.json")

## Step 1: Data Preprocessing

#### processing steps
- lowercase conversion
- replace mentions
- replace cashtag
- replace urls
- replace special unicode characters (&, > , < ,' )
- removing stopwords
- lemmatization

In [3]:
############### Declaring Placeholder Variables  ###############

rep = 0  #index for placeholders
p_mentions = [" @mentions ", " "]
p_cashtag = [" @cashtag ", " "]
p_url = [" @url ", " "]
qmark = " qmark "  
emark = " emark "

stop_words = stopwords.words('english')
stop_words.append('stock')
stop_words.append('today')

In [4]:
############### FUNCTIONS  ###############

w_tokenizer = WhitespaceTokenizer()

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

stemmer = PorterStemmer()
def stemming_text(text):
    return ' '.join([stemmer.stem(w) for w in w_tokenizer.tokenize(text)])



In [5]:
############### DATA CLEANING - Training Data ###############
# Convert to lower Case
train_df['tweet'] = train_df['tweet'].str.lower()
# Remove Mentions
train_df['tweet'] = train_df['tweet'].str.replace('([@][\w_-]+)', p_mentions[rep], case=False)
# Remove CashTag or Replace CashTag with Placeholder
train_df['tweet'] = train_df['tweet'].str.replace('([$][a-z]+)', p_cashtag[rep], case=False)
# Remove URL 
train_df['tweet'] = train_df['tweet'].str.replace('http\S+|www.\S+', p_url[rep], case=False)
# Remove Special Characters (&,',>,<)
# Emoticon/Emoji Analysis can be take into account with Emoji Corpus
train_df['tweet'] = train_df['tweet'].str.replace('&amp', " & ", case=False)
train_df['tweet'] = train_df['tweet'].str.replace('&#39;', "'", case=False)
test_df['tweet'] = test_df['tweet'].str.replace('&gt;', " ", case=False)
test_df['tweet'] = test_df['tweet'].str.replace('&lt;', " ", case=False)
# Special Treatment for ! and ? as it may place special meaning
train_df['tweet'] = train_df['tweet'].str.replace('\?', qmark, case=False)
train_df['tweet'] = train_df['tweet'].str.replace('!', emark, case=False)
#Stopwords Removal
train_df['tweet'] = train_df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
#Stemming
# train_df['tweet'] = train_df['tweet'].apply(stemming_text)
#Lemmatize
train_df['tweet'] = train_df['tweet'].apply(lemmatize_text)

In [6]:
############### DATA CLEANING - Testing Data  ###############
#preprocessing steps taken for test dataset is the same for training dataset

test_df['tweet'] = test_df['tweet'].str.lower()
test_df['tweet'] = test_df['tweet'].str.replace('([@][\w_-]+)', p_mentions[rep], case=False)
test_df['tweet'] = test_df['tweet'].str.replace('([$][a-z]+)', p_cashtag[rep], case=False)
test_df['tweet'] = test_df['tweet'].str.replace('http\S+|www.\S+',  p_url[rep], case=False)
test_df['tweet'] = test_df['tweet'].str.replace('&amp', " & ", case=False)
test_df['tweet'] = test_df['tweet'].str.replace('&#39;', "'", case=False)
test_df['tweet'] = test_df['tweet'].str.replace('&gt;', " ", case=False)
test_df['tweet'] = test_df['tweet'].str.replace('&lt;', " ", case=False)
test_df['tweet'] = test_df['tweet'].str.replace('\?', qmark, case=False)
test_df['tweet'] = test_df['tweet'].str.replace('!', emark, case=False)
test_df['tweet'] = test_df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
# test_df['tweet'] = test_df['tweet'].apply(stemming_text)
test_df['tweet'] = test_df['tweet'].apply(lemmatize_text)

## Step 2: Spliting Test and Train Dataset

############### Spliting Data Set (Train, Test, Validation)  ###############
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.33, random_state=42 )

## Step 3: Choosing the Right Model

In [7]:
############### TRANSFORIMING MODEL WITH COUNTVECTORIZER  ###############

cv = CountVectorizer( analyzer='word',
                      ngram_range=(1,3),
                      stop_words = 'english')

model = cv.fit_transform(list(train_df["tweet"]))

cv.fit(train_df["tweet"])
X_train = cv.transform(train_df["tweet"])
X_test = cv.transform(test_df["tweet"])
y_train = train_df["sentiment"]
y_test = test_df["sentiment"]


############### TRAINING MODEL WITH LINEAR REGRESSION  ###############
#linear regression model
from sklearn.linear_model import LinearRegression
log_model = LinearRegression(fit_intercept=True)
log_model = log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [8]:
############### TRANSFORIMING MODEL WITH COUNTVECTORIZER  ###############
choices = ['neutral', 'bullish', 'bearish']

trainconditions = [
    (train_df['sentiment'] == 0) , 
    (train_df['sentiment'] <  0) ,
    (train_df['sentiment'] >  0)]

testconditions = [
    (test_df['sentiment'] == 0) , 
    (test_df['sentiment'] <  0) ,
    (test_df['sentiment'] >  0)]

train_df['classes'] = np.select(trainconditions, choices)
test_df['classes'] = np.select(testconditions, choices)


cv2 = CountVectorizer( analyzer='word',
                      ngram_range=(1,3),
                      stop_words = 'english')

model2 = cv2.fit_transform(list(train_df["tweet"]))

cv2.fit(train_df["tweet"])
X_train2 = cv2.transform(train_df["tweet"])
X_test2 = cv2.transform(test_df["tweet"])
y_train2 = train_df["classes"]
y_test2 = test_df["classes"]


############### TRAINING MODEL WITH MultinomialNB  ###############

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train2, y_train2)

# Create the predicted tags: pred
nb_classifier_pred = nb_classifier.predict(X_test2)

## Step 4: Evaluation of Model

In [9]:
def assignClasses(data): 
    value = list()
    for i in data: 
        if i > 0:
            value.append("bullish")
        elif i < 0:
            value.append("bearish")
        else:
            value.append("neutral")
    
    return value 

In [10]:
############### EVALUTATING MODEL WITH LINEAR REGRESSION  ###############
n_y_test = assignClasses(y_test)
n_y_pred = assignClasses(y_pred)

print("MSE: ", mean_squared_error(y_test, y_pred))
print('\n')
print("F1 Macro Avg: ", f1_score(n_y_test, n_y_pred, average='macro'))
print("F1 Micro Avg: ", f1_score(n_y_test, n_y_pred, average='micro'))
print('\n')
print("Classification Report  \n", classification_report(n_y_test, n_y_pred))

MSE:  0.09045563275319476


F1 Macro Avg:  0.5161834964696845
F1 Micro Avg:  0.7949526813880127


Classification Report  
              precision    recall  f1-score   support

    bearish       0.78      0.63      0.70       221
    bullish       0.80      0.91      0.85       401
    neutral       0.00      0.00      0.00        12

avg / total       0.78      0.79      0.78       634



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [11]:
############### EVALUATION MODEL WITH MultinomialNB  ###############

# Calculate the accuracy score: score
score = accuracy_score(y_test2, nb_classifier_pred)
print(score)

# Calculate the confusion matrix: cm
cm = confusion_matrix(y_test2, nb_classifier_pred, labels=['neutral', 'bullish', 'bearish'])
print(cm)

print('\n')
print("F1 Macro Avg: ", f1_score(y_test2, nb_classifier_pred, average='macro'))
print("F1 Micro Avg: ", f1_score(y_test2, nb_classifier_pred, average='micro'))
print('\n')
print("Classification Report  \n", classification_report(y_test2, nb_classifier_pred))

0.7539432176656151
[[  0   2  10]
 [  0  93 128]
 [  0  16 385]]


F1 Macro Avg:  0.464524765729585
F1 Micro Avg:  0.7539432176656151


Classification Report  
              precision    recall  f1-score   support

    bearish       0.74      0.96      0.83       401
    bullish       0.84      0.42      0.56       221
    neutral       0.00      0.00      0.00        12

avg / total       0.76      0.75      0.72       634



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
