In [1]:
import os 

# Change the working directory to the project root so that the relative paths
# used later in the notebook (e.g. 'dataset/rawdata.csv') resolve.
# The location can be overridden with the WQD7005_PROJECT_DIR environment
# variable; when it is unset the original author's path is used, so the
# default behaviour is unchanged.
PROJECT_DIR = os.environ.get(
    "WQD7005_PROJECT_DIR",
    r"C:\Users\ngwei\Dropbox\UM_Master\UM_Sem3\WQD7005_Data_Mining\WQD7005_Project",
)
os.chdir(PROJECT_DIR)

In [2]:
# Import library
import pandas as pd
import numpy as np
import pymongo
import json
import datetime
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import wordnet
from nltk.tokenize import WordPunctTokenizer
import string
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
import time

### Language Processing

In [34]:
tok = WordPunctTokenizer()
def tweet_cleaner(text):
    """Normalise a raw tweet into lower-case, space-separated alphabetic words.

    The input is converted with ``str()`` and lower-cased, then in order:
    ampersands become "and", line breaks become spaces, the '#covid19'
    hashtag is rewritten to 'coronavirus', links, @mentions, remaining
    hashtags and backslash escape sequences are removed, every non-letter
    becomes a space, single-character words are dropped, and the result is
    re-tokenised with ``tok`` and joined with single spaces.

    Parameters
    ----------
    text : any
        Raw tweet; non-string values are stringified first.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    cleaned = str(text).lower().replace('&', "and")
    # Line breaks carry no meaning for the later steps; turn them into spaces.
    cleaned = cleaned.replace('\n\n', " ").replace('\n', " ")
    # Drop the literal escape text (backslash + "u201c"), not the decoded character.
    cleaned = cleaned.replace(u"\\u201c", "")
    # Keep the topic keyword before hashtags in general are stripped below.
    cleaned = cleaned.replace('#covid19', 'coronavirus')
    # Remove the whole link up to the next whitespace. The previous character
    # class stopped at '-', '_', '?', '=' etc. and left the tail of the URL
    # behind as spurious words.
    cleaned = re.sub(r'https?://\S+', '', cleaned)
    # \w also matches '_', so handles/hashtags such as "@some_user" are removed
    # completely instead of leaving "user" in the text.
    cleaned = re.sub(r'@\w+', '', cleaned)
    cleaned = re.sub(r'#\w+', '', cleaned)
    # Remaining backslash escape sequences that appear as literal text.
    cleaned = re.sub(r'\\[A-Za-z0-9]+', '', cleaned)
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)
    # Single-letter leftovers are dropped.
    cleaned = re.sub(r'\b\w{1}\b', '', cleaned)
    return (" ".join(tok.tokenize(cleaned))).strip()


def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map a POS tag string to the matching ``wordnet`` part-of-speech constant.

    The decision is made on the tag's leading letter: 'J' -> ``wordnet.ADJ``,
    'V' -> ``wordnet.VERB``, 'N' -> ``wordnet.NOUN``, 'R' -> ``wordnet.ADV``.
    Any other tag yields ``None``.
    """
    # (leading letter of the tag, attribute name on `wordnet`), checked in order.
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr_name in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, so `wordnet` is not touched otherwise.
            return getattr(wordnet, attr_name)
    return None

    
lemmatizer = WordNetLemmatizer()
def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` using its part-of-speech tag.

    The sentence is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Tokens whose tag maps to a WordNet part of speech (via
    ``nltk_tag_to_wordnet_tag``) are lemmatized with that part of speech;
    all other tokens are kept unchanged. Returns the tokens joined by spaces.
    """
    def _lemma(token, pos_tag):
        # Tokens without a WordNet equivalent pass through untouched.
        wn_pos = nltk_tag_to_wordnet_tag(pos_tag)
        if wn_pos is None:
            return token
        return lemmatizer.lemmatize(token, wn_pos)

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    return " ".join([_lemma(token, pos_tag) for token, pos_tag in tagged_tokens])


stemmer = SnowballStemmer("english")
def stemming(text):
    """Return ``text`` with every ``word_tokenize`` token replaced by its stem.

    Stems come from the module-level English ``SnowballStemmer`` and are
    joined with single spaces.
    """
    stems = []
    for token in word_tokenize(text):
        stems.append(stemmer.stem(token))
    return " ".join(stems)


def prepare_text_lda(text):
    """Tokenise ``text`` and keep only long, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Text to tokenise with ``word_tokenize``.

    Returns
    -------
    list of str
        Tokens, in their original order, that are longer than four characters
        and are not in NLTK's English stop-word list.
    """
    # A set gives constant-time membership tests; the stop-word list was
    # previously scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    return [
        token
        for token in word_tokenize(text)
        if len(token) > 4 and token not in stop_words
    ]


def getSentiment(df_clean):
    """Add a ``'sentiment'`` column holding a label for each row's text.

    Every entry of ``df_clean['stemming_text']`` is scored with
    ``SentimentIntensityAnalyzer.polarity_scores`` and its ``'compound'``
    value is bucketed:

    * ``>= 0.05``  -> "Positive"
    * ``<= -0.05`` -> "Negative"
    * otherwise    -> "Neutral"

    Parameters
    ----------
    df_clean : DataFrame-like
        Must provide a ``'stemming_text'`` column of strings.

    Returns
    -------
    The same ``df_clean`` object; the ``'sentiment'`` column is written in
    place and the object is returned for convenience.
    """
    sentimentAnalyzer = SentimentIntensityAnalyzer()

    def _label(compound):
        # Bucket a compound polarity score into one of the three class names.
        if compound >= 0.05:
            return "Positive"
        if compound <= -0.05:
            return "Negative"
        return "Neutral"

    # The numeric index list that used to be built alongside the labels was
    # never read or returned, so it is no longer computed.
    df_clean["sentiment"] = [
        _label(sentimentAnalyzer.polarity_scores(review)['compound'])
        for review in df_clean['stemming_text']
    ]
    return df_clean

### Machine Learning Algorithms

In [93]:
# Shared train/evaluate routine used by every model function below.
def _fit_and_score(model_name, estimator_cls, **estimator_params):
    """Fit one classifier on the training split and score it on the test split.

    Reads the notebook-level globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``, which must exist before any model function is called.

    Parameters
    ----------
    model_name : str
        Display name placed first in the returned list.
    estimator_cls : class
        Estimator class exposing ``fit`` (returning the fitted estimator)
        and ``predict``.
    **estimator_params
        Keyword arguments forwarded to ``estimator_cls``.

    Returns
    -------
    list
        ``[model_name, accuracy, macro precision, macro recall, macro F1,
        elapsed seconds]``. The elapsed time covers construction, fitting,
        prediction and metric computation.
    """
    start_time = time.time()
    model = estimator_cls(**estimator_params).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average='macro'),
        recall_score(y_test, y_pred, average='macro'),
        f1_score(y_test, y_pred, average='macro'),
    ]
    elapsed = time.time() - start_time
    return [model_name] + scores + [elapsed]


# Logistic Regression with TF-IDF vectorizer
def LogReg():
    """Train and score a logistic regression model; see ``_fit_and_score`` for the return format."""
    return _fit_and_score(
        'Logistic Regression', LogisticRegression,
        C=0.1, solver='lbfgs', multi_class='auto', max_iter=3000,
    )


# Decision Tree with TF-IDF vectorizer
def DecTree():
    """Train and score a decision tree; see ``_fit_and_score`` for the return format."""
    # No random_state is passed, so scores may differ between calls.
    return _fit_and_score('Decision Tree', DecisionTreeClassifier)


# Random Forest with TF-IDF vectorizer
def RandomForest():
    """Train and score a 100-tree random forest; see ``_fit_and_score`` for the return format."""
    # No random_state is passed, so scores may differ between calls.
    return _fit_and_score('Random Forest', RandomForestClassifier, n_estimators=100)


# Naive Bayes with TF-IDF vectorizer
def NaiveBayes():
    """Train and score a multinomial naive Bayes model; see ``_fit_and_score`` for the return format."""
    return _fit_and_score('Multinomial Naive Bayes', MultinomialNB)


# XGBoost with TF-IDF vectorizer
def xgBoost():
    """Train and score an XGBoost classifier; see ``_fit_and_score`` for the return format."""
    # NOTE(review): recent xgboost releases appear to reject string class
    # labels and expect integers 0..n-1 -- confirm against the installed
    # version and encode y_train / y_test if needed.
    return _fit_and_score('XG Boost', XGBClassifier)


# SVM with TF-IDF vectorizer
def svmLinear():
    """Train and score a linear-kernel SVM; see ``_fit_and_score`` for the return format."""
    return _fit_and_score('SVM (Linear)', SVC, gamma='auto', kernel='linear')


#### References: 

1. Flask + Charts: https://blog.ruanbekker.com/blog/2017/12/14/graphing-pretty-charts-with-python-flask-and-chartjs/

In [None]:
from flask import Flask
from flask import render_template



# Flask application object; the @app.route decorators further down register
# their view functions on it and app.run() at the end serves it.
app = Flask(__name__)



## Pre-load data and run text-processing
# Each step adds a column derived from the previous one:
# cleaned 'full_text' -> 'lemmatize_text' -> 'stemming_text' -> 'sentiment'.
df_tmp = pd.read_csv('dataset/rawdata.csv', index_col=None, header=0)
df_tmp['full_text'] = df_tmp['full_text'].apply(tweet_cleaner)
df_tmp['lemmatize_text'] = df_tmp['full_text'].apply(lemmatize_sentence)
df_tmp['stemming_text'] = df_tmp['lemmatize_text'].apply(stemming)
df_tmp = getSentiment(df_tmp)


## Preparing for ML Algorithms
# Split into testing set and training set (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    df_tmp['stemming_text'], df_tmp['sentiment'], random_state=0)
# Fit the TF-IDF vocabulary on the training texts only, then vectorize both splits
vectorizer = TfidfVectorizer(min_df = 5).fit(x_train)
X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()


## Home Tab

@app.route('/')
@app.route('/index')
def myIndex():
    """Render the home page ('index.html') with two static description posts."""
    descriptions = (
        'Sentiment Analysis using social media data from Twitter',
        'Dataset has been loaded, click from menus to check things out!',
    )
    # Both entries share the same heading; only the body text differs.
    posts = [{'Name': 'Project Description', 'Content': text} for text in descriptions]
    return render_template(
        'index.html',
        title='Home',
        postTitle="WQD7005 Flask Deployment",
        posts=posts,
    )


## Sentiment Analysis Tab

@app.route('/sentiment')
def mySentiment():
    """Render 'sentiment.html' with the number of tweets per sentiment label.

    The template receives ``posts`` (one ``{'Name': label, 'Content': count}``
    dict per label, counts as strings), ``max`` (fixed at 17000) and ``set``,
    an iterator of ``(count, label, colour)`` triples for the pie chart.
    """
    postTitle = "Sentiment Analysis on Tweets Data about Donald Trump:"
    # Count every label in a single pass. sort_index() fixes the label order,
    # so a given sentiment always gets the same colour; iterating a set of
    # strings can produce a different order in each Python process.
    counts = df_tmp['sentiment'].value_counts().sort_index()
    pieLabels = list(counts.index)
    pieValues = [str(count) for count in counts]
    tmpData = [{'Name': label, 'Content': value} for label, value in zip(pieLabels, pieValues)]
    pieColours = ["#F7464A", "#46BFBD", "#FDB45C"]
    # 'max' and 'set' are the variable names passed to the template and are kept as-is.
    return render_template('sentiment.html', title='Sentiment', postTitle=postTitle, posts=tmpData, max=17000, set=zip(pieValues, pieLabels, pieColours))


## Machine Learning Tabs

## Shared renderer for the model result pages
def _render_metrics_page(template_name, postTitle, metric_values):
    """Render one model-result template from a model function's return list.

    ``metric_values`` is paired positionally with the column names below and
    passed to the template as ``posts``: a list of
    ``{'Name': column, 'Content': value}`` dicts.
    """
    columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Time (s)']
    tmpData = [{'Name': name, 'Content': value} for name, value in zip(columns, metric_values)]
    # NOTE(review): every model page is rendered with title='Sentiment', as in
    # the original views -- confirm whether a per-model title was intended.
    return render_template(template_name, title='Sentiment', postTitle=postTitle, posts=tmpData)


# Each view below calls its model function on every request, so the model is
# re-trained and re-scored each time the page is loaded.

## Logistic Regression
@app.route('/logreg')
def myLogReg():
    """Train/score logistic regression and render 'logreg.html'."""
    return _render_metrics_page('logreg.html', "Logistic Regression", LogReg())

## Decision Tree
@app.route('/decisiontree')
def myDecTree():
    """Train/score the decision tree and render 'decisiontree.html'."""
    return _render_metrics_page('decisiontree.html', "Decision Tree", DecTree())

## Random Forest
@app.route('/randomforest')
def myRandomForest():
    """Train/score the random forest and render 'randomforest.html'."""
    return _render_metrics_page('randomforest.html', "Random Forest:", RandomForest())

## Naive Bayes
@app.route('/naivebayes')
def myNB():
    """Train/score multinomial naive Bayes and render 'naivebayes.html'."""
    return _render_metrics_page('naivebayes.html', "Naive Bayes:", NaiveBayes())

## XG Boost
@app.route('/xgboost')
def myXGB():
    """Train/score XGBoost and render 'xgboost.html'."""
    return _render_metrics_page('xgboost.html', "XG Boost:", xgBoost())

## SVM (Linear)
@app.route('/svmlinear')
def mySVM():
    """Train/score the linear SVM and render 'svmlinear.html'."""
    return _render_metrics_page('svmlinear.html', "SVM (Linear)", svmLinear())




# Start Flask's built-in server on port 5000. host='0.0.0.0' makes it listen
# on all network interfaces, not just localhost. The call blocks this cell
# until the server is stopped (see the "Press CTRL+C to quit" log line below).
app.run(host='0.0.0.0', port=5000)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [14/Jun/2020 12:32:18] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:20] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:24] "GET /randomforest HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:25] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:27] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:28] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:29] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:31] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:32:32] "GET /decisiontree HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:33:03] "GET /logreg HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:33:05] "GET /logreg HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:33:05] "GET /logreg HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020 12:33:07] "GET /naivebayes HTTP/1.1" 200 -
127.0.0.1 - - [14/Jun/2020