### RUN TOPIC MODELLING PIPELINE ###

In [None]:
from Pre_Processing import *
import pandas as pd
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
import os




# Load the December 2017 CSV export; the trailing expression echoes the
# frame's (rows, columns) when run as a notebook cell.
df = pd.read_csv(filepath_or_buffer="december_2017.csv")
df.shape



In [None]:
# GET THE MIN AND MAX DATES IN THE TEXTUAL DATA
# Each bound is stringified and cut at its first space, which drops any
# time-of-day part (assumes "DATE TIME"-style timestamp values - TODO confirm).

MIN_DATE, MAX_DATE = (
    str(pick(df["timestamp"])).partition(' ')[0] for pick in (min, max)
)


#GET THE CORRESPONDING STEEM PRICE

# Load the Steem price history and keep only the rows whose "Date" lies
# inside [MIN_DATE, MAX_DATE], ordered chronologically.
# NOTE(review): the file is ';'-separated; if its dates are written day-first,
# pd.to_datetime may need dayfirst=True - confirm against the CSV.
price_data = pd.read_csv("SteemCoinFull.csv", sep=";")
price_data["Date"] = pd.to_datetime(price_data["Date"])

price_data = (
    # between() is inclusive on both ends by default, i.e. >= MIN and <= MAX.
    price_data[price_data["Date"].between(MIN_DATE, MAX_DATE)]
    .sort_values(by="Date")
    # drop=True discards the old index directly. The previous
    # reset_index() + drop("index") pair would instead delete a genuine
    # "index" column (and leave a stray "level_0") if the CSV had one.
    .reset_index(drop=True)
)
price_data


In [None]:
# CLEAN, DETECT LANGUAGE AND GET ONLY ENGLISH ONES

df = pre_processing(df, "body", "cleaned_text")

# TOKENIZE, REMOVE STOPWORDS, CREATE BI-GRAMS AND LEMMATIZE

lemmatized = prepare_to_LDA(
    df,
    "cleaned_text",
    tokenizer=sent_to_words,
    remove_stopwords=remove_stopwords,
    make_bigrams=make_bigrams,
    lemmatization=lemmatization,
)

# RUN TOPIC MODELLING

# Candidate topic counts, defined once so the models trained here and the
# x-axis of the coherence plot can never drift apart.
n_topics = [8, 10, 12, 14, 16, 18]

topics, coherence_values, model_list, corpus, id2word = modelling_LDAmallet(
    lemmatized,
    # A copy is passed so the helper cannot alter the list reused for plotting.
    n_topics=list(n_topics),
    random_state=10,
    chunksize=100,
    passes=5,
    model="MALLET",
)

# Best coherence score and the position of the model that achieved it
# (first occurrence if several models tie).
max_coh = max(coherence_values)

model_max_ind = coherence_values.index(max_coh)


In [None]:
# PLOT COHERENCE VALUES
# Hands the candidate topic counts and their coherence scores to the
# plot_coh_val helper (presumably provided by the Pre_Processing star import
# and drawing one point per candidate - confirm there).

plot_coh_val(n_topics,coherence_values)


In [None]:
# VISUALIZE TOPICS

# The package itself must be imported: `from pyLDAvis import gensim` binds
# only the submodule, so `pyLDAvis.enable_notebook()` would otherwise depend
# on some other import having leaked the name `pyLDAvis`.
# NOTE(review): recent pyLDAvis releases are believed to expose this adapter
# as `pyLDAvis.gensim_models` - confirm against the installed version.
import pyLDAvis
from pyLDAvis import gensim as ge

pyLDAvis.enable_notebook()

# NOTE(review): index 1 is hard-coded although model_max_ind (the position of
# the best coherence score) is computed in an earlier cell - confirm that this
# is the model that should be shown.
vis = ge.prepare(model_list[1], corpus, id2word)
vis

In [None]:
## ASSIGN TOPICS, CALCULATE DAILY GROWTH OF THE TOPICS AND STEEM PRICE

# NOTE(review): model_list[1] is a fixed choice; it is not derived from
# model_max_ind computed in the modelling cell - confirm this is intended.
df_dominant_topic = format_topics_sentences(
    model_list[1],
    corpus,
    texts=df["cleaned_text"].tolist(),
)

# Per-day topic series built from the posts and their assigned topics.
daily_topics = calculate_dailytopic_growth(df, df_dominant_topic)

# Growth series for the Steem price over the filtered price rows.
steem_daily_growth = steem_growth(price_data)



In [None]:
## SEE THE TOPICS AND KEYWORDS
# get_topics receives the dominant-topic frame built by
# format_topics_sentences; as the last expression of the cell its return value
# (if any) is displayed by the notebook.

get_topics(df_dominant_topic)

In [None]:
# DEFINE THE DATE RANGE YOU LIKE AND CALCULATE THE CORRELATION BETWEEN
# TOPIC GROWTH AND STEEM PRICE
# These literals replace the MIN_DATE / MAX_DATE values derived from the data
# at the top of the notebook.

MIN_DATE, MAX_DATE = "2017-12-01", "2017-12-31"

corr_per_topic, corr_df = calculate_correlation(
    daily_topics,
    steem_daily_growth,
    MIN_DATE,
    MAX_DATE,
)

In [None]:
### COMPUTE CROSS CORRELATION
# Feeds the per-topic correlation result of calculate_correlation into the
# compute_cross_cor helper (its output format is defined outside this
# notebook - see its definition).

compute_cross_cor(corr_per_topic)


### POLARITY CALCULATION ###

In [None]:
# JOIN TIMESTAMP, TEXT AND ASSIGNED TOPICS
# NOTE(review): concat(axis=1) pairs rows by index label, not by position.
# If pre_processing dropped rows from df without renumbering it, the labels
# of df and df_dominant_topic may not line up - confirm both share an index.

clean_df = pd.concat(
    [
        df.loc[:, ["timestamp", "body", "cleaned_text"]],
        df_dominant_topic["Dominant_Topic"],
    ],
    axis=1,
)
clean_df

In [None]:
## CHOOSE THE TOPICS YOU LIKE HERE. IF ONE IS NOT PRESENT, CREATE IT.
## The numbers are Dominant_Topic values used to filter the posts below.

first_topic, second_topic, third_topic, fourth_topic = 8, 9, 12, 13

## IF YOU ADD A NEW TOPIC, PLEASE MODIFY THE CODE BELOW ACCORDINGLY.

In [None]:
# Keep only the posts whose dominant topic is one of the four selected
# topics, restricted to the columns needed for polarity scoring.
data_to_polarity = clean_df.loc[
    clean_df["Dominant_Topic"].isin(
        [first_topic, second_topic, third_topic, fourth_topic]
    ),
    ["timestamp", "body", "cleaned_text", "Dominant_Topic"],
].copy()

# Sort first and renumber afterwards, so the index runs 0..n-1 in timestamp
# order. Resetting before sorting (as before) left shuffled labels, and
# calculate_polarity's label-aligned score assignment then mis-paired
# scores with texts.
data_to_polarity = data_to_polarity.sort_values(by="timestamp").reset_index(drop=True)

data_to_polarity

### LOAD PRE-TRAINED MODEL ###

In [None]:
# Switch into the directory holding the pre-trained sentiment artefacts; every
# relative path below is resolved against it.
# NOTE(review): main_path is not defined in this notebook - presumably it
# comes from the Pre_Processing star import; confirm.
os.chdir(main_path+"/twitter_model")



from keras.models import load_model
loaded = load_model('model.h5')
import tqdm

from gensim.models import KeyedVectors
from gensim import models

# mmap='r' is passed so the vectors are opened read-only via memory mapping.
kv_model = KeyedVectors.load('model.w2v', mmap='r')

import pickle
# SECURITY: unpickling executes arbitrary code from the file; only load an
# encoder.pkl that comes from a trusted source.
# The handle is now closed by the context manager and `encoder` holds the
# unpickled object (previously the result was discarded and `encoder` stayed
# bound to the open file).
with open('encoder.pkl', 'rb') as encoder_file:
    encoder = pickle.load(encoder_file)


import json
from keras_preprocessing.text import tokenizer_from_json

# NOTE(review): tokenizer_from_json is given the result of json.load, so the
# file is presumably a JSON-encoded string wrapping the tokenizer's own JSON
# (i.e. written with json.dump(tokenizer.to_json(), f)) - confirm.
with open('tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)
    
    
def predict(text, include_neutral=True):
    """Score one text with the loaded sentiment model.

    Relies on the module-level ``tokenizer``, ``loaded`` model,
    ``pad_sequences``, ``SEQUENCE_LENGTH`` and ``decode_sentiment``.

    Args:
        text: The text to classify.
        include_neutral: Forwarded to ``decode_sentiment``.

    Returns:
        A dict with the keys ``"label"`` (decoded sentiment), ``"score"``
        (the model output as a float) and ``"elapsed_time"`` (seconds spent
        in this call, measured with ``time.time``).
    """
    started = time.time()

    # Turn the text into a single sequence, padded/truncated to a fixed length.
    sequences = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)

    # First (and only) row of the batch prediction.
    # NOTE(review): float() below presumes this row holds one value - confirm
    # against the model's output shape.
    raw_score = loaded.predict([padded])[0]

    sentiment = decode_sentiment(raw_score, include_neutral=include_neutral)

    outcome = {"label": sentiment, "score": float(raw_score)}
    outcome["elapsed_time"] = time.time() - started
    return outcome



import time
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Fixed length that predict() pads/truncates token sequences to.
SEQUENCE_LENGTH = 300


# Sentiment labels returned by decode_sentiment.
POSITIVE, NEGATIVE, NEUTRAL = "POSITIVE", "NEGATIVE", "NEUTRAL"

# (lower, upper) score bounds: at or below lower is NEGATIVE, at or above
# upper is POSITIVE, anything in between is NEUTRAL.
SENTIMENT_THRESHOLDS = (0.4, 0.7)


def decode_sentiment(score, include_neutral=True):
    """Map a model score to a sentiment label.

    Args:
        score: Numeric model output.
        include_neutral: When true, use the three-way split defined by
            ``SENTIMENT_THRESHOLDS``; otherwise split in two at 0.5.

    Returns:
        One of ``NEGATIVE``, ``NEUTRAL`` or ``POSITIVE``.
    """
    if not include_neutral:
        # Two-way split: strictly below 0.5 is negative.
        return NEGATIVE if score < 0.5 else POSITIVE

    lower, upper = SENTIMENT_THRESHOLDS
    if score <= lower:
        return NEGATIVE
    if score >= upper:
        return POSITIVE
    return NEUTRAL
    
    
    
def calculate_polarity(data_to_polarity):
    """Add a sentiment score to every row and normalise the timestamps.

    The frame is modified in place and also returned.

    Args:
        data_to_polarity: DataFrame with a ``cleaned_text`` column (scored via
            ``predict``) and a ``timestamp`` column of strings; only the part
            before the first space of each timestamp is kept.

    Returns:
        The same DataFrame with a new ``score`` column and ``timestamp``
        converted with ``pd.to_datetime``.
    """
    import tqdm

    texts = data_to_polarity["cleaned_text"].tolist()
    scores = [predict(text)["score"] for text in tqdm.tqdm(texts)]

    # Assign by position. The scores are produced in row order, so going
    # through an index-aligned Series (as before) mis-paired them whenever the
    # frame's index was not exactly 0..n-1 in row order, e.g. after a sort.
    # It also raised KeyError on an empty frame.
    data_to_polarity["score"] = scores

    # Keep only the date part of each timestamp string before parsing.
    data_to_polarity["timestamp"] = pd.to_datetime(
        data_to_polarity["timestamp"].apply(lambda stamp: stamp.split(' ')[0])
    )

    return data_to_polarity

In [None]:
# Score every selected post with calculate_polarity (tqdm is imported inside
# that function) and display the resulting frame.

polarity_result = calculate_polarity(data_to_polarity)
polarity_result