In [31]:
# Import libraries
import re
import nltk
import pickle
import requests

import numpy as np
import pandas as pd
import datetime as dt 

from bokeh.models import *
from bokeh.plotting import *
from datetime import datetime
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from bokeh.layouts import column, row
from bokeh.palettes import Category20
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet, brown, words

In [32]:
nltk.download('words')

[nltk_data] Downloading package words to /Users/Jean-
[nltk_data]     BaptistePROST/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [4]:
data='Data/russian-troll-tweets/IRAhandle_tweets_'
pickle_files='Pickles/'
WORD_FREQ = 'Data/wordfrea.xlsx'

# Introduction

The goal of this notebook is to represent the tweet by topics (tahter than category). <br>

By looking at the most popular hastag per day, we decided to define 13 arbitrary categories that seemed relevant to characterized the tweets. A first list of words was build for each topic (~ 7 words/topic), then the `Word2Vec` model was used to exten the list. The model enables to find the words that have a large (cosine) similarity the the its word space. The topic became 10 times larger. <br>

The tweet activity (number of tweets per day) for each topic was ploted. Clear peaks of activity were observable. 
Are those peaks related to a specific event?


To figure this out, we used [Wiki Portal](https://en.wikipedia.org/wiki/Portal:Current_events) to retrived the information about the event of a particular day. An automatic event dector was build. For every spike of each topic, we tryed to match an event description scrapped from Wiki Portal.<br>
We were thus able to label some peak of the tweet activity for each category.

# Data Loading & Word2Vec model

In [16]:
#loading data build in Word2vec.ipynb
def load_df(filename=(pickle_files+'df.pkl')):
    with open(pickle_files+'df.pkl', 'rb') as d:
        df=pickle.load(d)
    print(' DafaFrame loaded <--')
    return df

In [17]:
df=load_df()
df.head()

 DafaFrame loaded <--


Unnamed: 0,publish_date,content,account_category,Clean_tweet,hashtags
0,2017-10-01,"""We have a sitting Democrat US Senator on tria...",RightTroll,"[sit, democrat, us, senat, trial, corrupt, bar...",[]
1,2017-10-01,Marshawn Lynch arrives to game in anti-Trump s...,RightTroll,"[marshawn, lynch, arriv, game, anti-trump, shi...",[]
2,2017-10-01,Daughter of fallen Navy Sailor delivers powerf...,RightTroll,"[daughter, fallen, navi, sailor, deliv, power,...",[#BoycottNFL]
3,2017-10-01,JUST IN: President Trump dedicates Presidents ...,RightTroll,"[presid, trump, dedic, presid, cup, golf, tour...",[]
4,2017-10-01,"19,000 RESPECTING our National Anthem! #StandF...",RightTroll,"[respect, nation, anthem, #StandForOurAnthem]",[#StandForOurAnthem]


*Word Freq* is a online dictionary (wordfrequency.info) that gives the occurency the 5000 most commun english words.

In [18]:
#load data frequency from http://www.wordfrequency.info
wordfrequency = pd.read_excel(WORD_FREQ, header=0, index_col=[0], usecols=3).dropna()
wordfrequency.Frequency = wordfrequency.Frequency/(wordfrequency.Frequency.sum())
wordfrequency.rename(columns={'\xa0\xa0\xa0Word' : 'Word', 'Part of speech' : 'PoS'}, inplace=True)
wordfrequency.Word = wordfrequency.apply(lambda row: row['Word'].replace("\xa0\xa0\xa0", ''), axis=1)

wordfrequency.head(5)

Unnamed: 0_level_0,Word,PoS,Frequency
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,the,a,0.066825
2.0,be,v,0.038041
3.0,and,c,0.032569
4.0,of,i,0.031365
5.0,a,a,0.030759


**Word2Vec model**

In [20]:
embedding=100
#non empty clean tweet
tweets=df[df.Clean_tweet.isnull()==False]['Clean_tweet'] #take non empty clean tweet

model = Word2Vec(tweets.tolist(), min_count=1, size=embedding)
vocab_model=model.wv.vocab
print(model)

Word2Vec(vocab=34061, size=100, alpha=0.025)


# Topics

## List

We have identify a list of topic that are frequent and were a debat at the time. We will expore their behavior in the word space hopping to find clusters :
* Black lives matter, police brutality, police violence, blacktwitter, racism, NFL Protest, Jamar Clark, Alfredo Olango 
* music, thefourhorsemen, album 
* Trump, Donald
* Hillary, HeforShe, IamWithHer, crookedHillary 
* religion
* fear, North Korea, Russia, Geopolotics
* voter fraud
* terror, terrorist, attack, chicago, shootings, baltimore, bombings, Chattanooga
* hacking, emails, DNC
* Money, scandal, Wells Fargo, Imran Awan
* election, campaign, GOP, DNC, Dem, vote, I voted, debate, primary, national convention
* alt-righ, alt-left, Charlottesville, neo nazi
* economy, deal, Nafta
* music, thefourhorsemen, album, fm, nowplaying


The list will get expanded thanks to the Word2Vec model. The words that hace a (cosine) similarity larger than a thrshold (0.6) are kept and appended to the list of topics.

In [21]:
#just an example of the use of the Word2vec Model
stemmer = PorterStemmer()
word='unite'
try:
    print('\n'.join([str(w) for w in model.wv.most_similar(stemmer.stem(word), topn=10)]))
except KeyError:
    print('Not in vocabulary or excluded during pre-processing.')

('uniom', 0.6467368602752686)
('elresistencia', 0.5982089042663574)
('deep', 0.53130704164505)
('gulf', 0.5058003664016724)
('bois', 0.4991251230239868)
('swing', 0.4821679890155792)
('islam', 0.4665639400482178)
('trooper', 0.4550928473472595)
('battleground', 0.4468410909175873)
('#JacksonStateShooting', 0.44253113865852356)


### Build extended list

In [35]:
def get_related_words(topics, model, min_similarity=0.6):
    """Get the words that are similar in the word2vec space. 
    min_similarity determines the threshold of similarity
    RETURNS: an extendent list for each topics"""
    topic_extended=[] #new list of words
    
    for topic in topics:
        new_words=[]
       
        for word_raw in topic:
            #modifies the wors as in the preprocessing
            word=stemmer.stem(word_raw.lower())
            new_words.append(word_raw)
            
            try: #if the word is in the vocab
                related=model.wv.most_similar(word, topn=20) 
                new_words+=[related[idx][0] for idx, _ in enumerate(related) if related[idx][1] >= min_similarity]
            
            except KeyError: #the word is not in the vocabulary (anymore)
                #new_words.remove(word_raw) #then remove this word
                continue
                
        topic_extended.append(new_words)
    
    return topic_extended


def hashtag_splitter(wordlist, wordfrequency, word_dictionary):
    '''
    Receives a list of words, wordfreq and dic. For each word, if it is a hashtag (starts with #), removes '#' 
    and check if the word is in dic. If not, try to split it according to uppercase.
    '''
    new_wordlist = []
    for word in wordlist:
        #Is this an hashtag or a word?
        if word.startswith('#'):
            #Removes #
            hashtag = word.replace('#', '')
            #Is the hashtag a word itself?
            if (hashtag.lower() in word_dictionary):
                new_wordlist.append(stemmer.stem(hashtag.lower()))
            #If not, let's split it    
            else:
                split = []
                upper = []
                new_word = []
                cleaned_hash = hashtag.replace('#', '')
                #going over each char in the hashtag
                for idx, char in enumerate(cleaned_hash):
                    #If char is uppercase, store it in potential word beggining
                    if char.isupper():
                        upper.append(char)
                        #Upper means potentially the end of a word, if this is the case, store ex-new_word
                        if len(new_word) !=0:
                            split.append(''.join(new_word))
                            new_word = []
                    else:
                        #if char is not uppercase, not precedated by uppercase
                        if len(upper) == 0:
                            new_word.append(char)
                            #end of hashtag
                            if idx == (len(cleaned_hash)-1):
                                split.append(''.join(new_word))
                        else:
                            #If there was an ongoing uppercased word, we save it and start a new word with the 
                            # previous uppercased char as first char of new word
                            if len(upper) != 0:
                                split.append(''.join(upper[:-1]))
                                new_word.append(upper[-1])
                                new_word.append(char)
                                upper = []
                            else:
                                print('Error')

                new_wordlist = new_wordlist + [stemmer.stem(word.lower()) for word in split if word.lower() not in \
                        (wordfrequency[:150].Word.values.tolist() + [''] + ['i'] + stop_w)]
        else:
            new_wordlist.append(stemmer.stem(word.lower()))
    return new_wordlist

#We will create a new col of stemmed words of wiki's events to compare with tweets keywords
link_numbers=('http', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-') #
stemmer = PorterStemmer()
stop_w=[word.replace('\'','') for word in stopwords.words('english')]+ ['', '&amp', 'amp','rt'] 

def tokenize(text):
    """Split the the tweet into a list of (cleaned words)"""
    text_cleaned = ''.join(ch for ch in text if ch not in '#!"$%&\()*+,./:;<=>?@[\\]^_{|}~\'').split(' ') 
    
    words= [word.lower().encode('ascii',errors='ignore').decode() for word in text_cleaned \
            if not ( (word.startswith(link_numbers)) | (word.endswith(link_numbers)) )]
    
    words=[stemmer.stem(word) for word in words if word not in stop_w ]
        
    if len(words) > 0:
        return words
    else: #tweets that contains only links or emojiis ...
        pass


In [36]:
#the firs word of the list represents the concept of each list.
topics_raw=[['Music', 'thefourhorsemen', 'album', 'nowplaying', 'soundcloud','rap', 'rnb'],\
        ['Trump', 'Donald', 'realdonaldtrump'],\
        ['Hillary', 'HeforShe', 'IamWithHer', 'crookedHillary', 'Clinton', ],\
        ['Religion', 'islam', 'christianism', 'judaism'],\
        ['Fear', 'Korea', 'North', 'NorthKorea', 'Russia', 'Geopolotics', 'kim', 'jong', 'kimjongun'],\
        ['BlackLivesMatters', 'black', 'trayvonmartin', 'mikebrown', 'policeviolence', 'brutality' , 'fuckthepolice', 'cop', \
         'racial', 'blacktwitter', 'colinkaepernick','racism' ],\
        ['Voter', 'fraud'],\
        ['Terrorism', 'terror' 'terrorist', 'attack', 'parisattack', 'londonattack' 'chicago', 'shootings', 'baltimore', 'bombings', 'Chattanooga'],\
        ['Hacking', 'emails', 'DNC'],\
        ['Money', 'scandal', 'WellsFargo', 'Fargo', 'ImranAwan', 'Imran', 'Awan' ],\
        ['Election', 'campaign', 'GOP','gopdebate', 'DNC', 'Dem', 'demdebate', 'vote', 'Ivoted', 'debate', 'primary', 'convention', 'america'],\
        ['Charlottesville','alt-righ', 'alt-left',  'neonazi', 'neo-nazi', 'supremacist', 'unitetheright'], \
        ['Economy', 'deal', 'Nafta']]

print('We have chosen {} topics ({} words):\n    {}'.format(len(topics_raw), len(sum(topics_raw,[])), '\n    '.join([topic[0] for topic in topics_raw])))

topics_treated=[]
for topic in topics_raw:
    topics_treated.append([word.lower() for word in topic])

We have chosen 13 topics (84 words):
    Music
    Trump
    Hillary
    Religion
    Fear
    BlackLivesMatters
    Voter
    Terrorism
    Hacking
    Money
    Election
    Charlottesville
    Economy


**Extending the list:**

In [45]:
topic_extended=get_related_words(topics_raw, model, min_similarity=0.6)

#Retrieve the words from the hashtags expressions
word_dictionary = list(set(words.words()))
topic_extended=[hashtag_splitter(topic, wordfrequency, word_dictionary) for topic in topic_extended]

print('The extended list is {} long.'.format(len(sum(topic_extended,[]))))

The extended list is 489 long.


**Vectorizing tweets** <br>
Each tweet with respect to the topic. If a tweet contains a word contained in the list of a certain topic, then it will have a non null composant in this topic column.

In [46]:
def tweet_topics(df, topic):
    """Add a column corresponding to each topic. Fills 1 if the tweet has a word 
    related to the topic, 0 otherwise """ 
    
    df_tmp=df[df['Clean_tweet'].isnull()==False]['Clean_tweet']
    for topic in topic_extended:
        df[topic[0]]= df_tmp.apply(lambda words: 1 if len(set(words) & set(topic))> 0 else 0)
        df[topic[0]].fillna(0, inplace=True)

tweet_topics(df, topic_extended)

## Plotting - tweet activity

In [47]:
topic_plot = figure(plot_width=950, plot_height=600, x_axis_type='datetime', toolbar_location="above")

colorplot=Category20[len(topic_extended)] #form the bokeh palettes
start_date = df.publish_date.min()
end_date = df.publish_date.max()


for color, topic in enumerate(topic_extended): #iterates over each topic
    topic_tmp=topic[0]
    df_plot=pd.DataFrame(data=df[df[topic_tmp]==1].publish_date.value_counts().sort_index())
    
    source = ColumnDataSource(data=df_plot)
    topic_plt_tmp=topic_plot.line(x='index', y='publish_date', source=source,\
            line_width=2, alpha=0.8, legend=topic_tmp, color=colorplot[color])
    
    topic_plt_tmp.visible=False

topic_plt_tmp.visible=True

topic_plot.legend.location = 'top_left'
topic_plot.legend.click_policy='hide'
topic_plot.title.text = 'Tweeting activity according topic'

hover_tool=tools.HoverTool(
    tooltips=[
        ('Date', '@index{%b %d, %Y}'),
        ('Number of tweets','@publish_date')],

    formatters={
        'index' : 'datetime', # use 'datetime' formatter for 'date' field
        'publish_date' : 'printf',   },   # use 'printf' formatter for 'adj close' field

    # display a tooltip whenever the cursor is vertically in line with a glyph
    mode='vline',
    attachment='above',
    show_arrow=True,
)

topic_plot.tools.append(hover_tool)

output_notebook()

In [48]:
show(topic_plot)

In [49]:
print('#-Topic-# : {} '.format('\n-----\n#-Topic-# : '.join([str(topic) for topic in topic_extended])))

#-Topic-# : ['music', 'song', 'hip-hop', 'thefourhorsemen', 'album', 'ep', 'platinum', 'kendricklamar', 'drake', 'eyez', 'collab', 'solo', 'chancetherapp', 'song', 'djkhale', 'lp', 'donaldglov', 'nowplay', 'music', 'featur', 'spotifi', 'mixtap', 'new', 'unsign', 'rappersiq', 'artist', 'hiphop', 'dancehal', 'musician', 'regga', 'hip', 'hop', 'friday', 'indi', 'paid', 'vibe', 'awesom', 'spotifi', 'soul', 'soundcloud', 'soundcloud', 'spinrilla', 'datpiff', 'promot', 'audiomack', 'mixtap', 'datpiff', 'spotifi', 'datpiff', 'bandcamp', 'soundcloud', 'promo', 'audiomack', 'dj', 'promo', 'widget', 'spinrilla', 'reverbn', 'instagram', 'click', 'rap', 'migo', 'tidalhifi', 'hip-hop', 'eminem', 'duo', 'rnb', 'dancehal', 'stackorstarvdj', 'regga', 'danceh', 'spotifi', 'bizdatroof', 'freddiegibb', 'ricorecklezz', 'tha', 'turn', 'u', 'pi', 'real', 'music', 'monday', 'djkingassassin', 'soul', 'itun', 'tune', 'djgumba', 'tymon', 'jamaica', 'schoolboyq', 'roster', 'topstarhiphopra']
-----
#-Topic-# : ['

# Web scrapping

We use Wikipedia Portal which lists every event that have happened for each day. The html template is always the same enabling us to efficiently scrap it.



We want to detect the event related to the spike of tweets:
* For each categry, a threshold is define to localize the date spikes. 
* Wikipedia Portal is scrapped at the url of the date. The text follows the same pre-process as the tweets words.
* A matching of the subsection of the web page and the word of the list is done.
* We retrieve the description of the event if it its related to the topic.

In [51]:
#Getting web page
WIKI_PORTAL = "https://en.wikipedia.org/wiki/Portal:Current_events/"

def event_scrapper(date):
    """Retrieve the information of the events that happened around a date window"""
    
    #take two days before and one day after the date
    dates = [date + dt.timedelta(day,0) for day in [-2,-1,0,1]]
    #daily events are stored in df
    event_df = pd.DataFrame(columns=['Date', 'Description', 'Category', 'Link'])
    
    for date in dates:
        #strftime gives 0-padded days...
        year = date.strftime("%Y")
        month = date.strftime("%B")
        day = re.sub("^[0]", "", date.strftime("%d"))  #removing 0 at beggining of day

        #right format for wiki portal and requesting html
        url_date = year + "_" + month + "_" + day
        r = requests.get(WIKI_PORTAL + url_date)
        
        #print('Response status code: {0}\n'.format(r.status_code))
        
        page_body = r.text
        soup = BeautifulSoup(page_body, 'html.parser')

        #For every category of events (dt), we look for every events from bullet list (li)
        if soup.findAll("dt"):
            
            for category in soup.findAll("dt"):

                li = category.findNext("li")
                while li:
                    #replacing \n, and removing sources: text. e.g. (CNN). 
                    #we decided to split text using regex and keeping only text before first source
                    full_text = re.split(".\s\(", li.getText().replace('\n', '. '))
                    no_source = full_text[0]
                    new_event = pd.DataFrame({'Date': [date], \
                                              'Description': [no_source],\
                                              'Category': [category.getText()], \
                                              'Link': [li.a]})


                    event_df = event_df.append(new_event, ignore_index=True, sort=False)
                    li = li.findNextSibling("li")
            
            #event_df['Stemmed_Content']=event_df.apply(lambda row: tokenize(row['Description']), axis=1)
            
        
        else: #other template of wiki portal 
            for category in soup.findAll("p"):
                category_title=category.findNext().getText()
                li = category.findNext("li")
                while li:
  
                    full_text = re.split(".\s\(", li.getText().replace('\n', '. '))
                    no_source = full_text[0]
                    new_event = pd.DataFrame({'Date': [date], \
                                              'Description': [no_source],\
                                              'Category': [category_title], \
                                              'Link': [li.a]})


                    event_df = event_df.append(new_event, ignore_index=True, sort=False)
                    li = li.findNextSibling("li")

        
        #stemming the content of the events description
        event_df['Stemmed_Content']=event_df.apply(lambda row: tokenize(row['Description']), axis=1)

    return event_df


def event_detector(df, topic_extended):
    """Detect the event by web scrapping. Returns the dataframe of events' description
    per day, the topic and the recall of the process (number of event that match over
    the number of tweet spikes)"""
    
    events_df=pd.DataFrame(columns=['Date', 'Topic', 'Event'])
    nb_dates=0
    nb_detected=0
    
    for topic in topic_extended:
        topic_tmp=topic[0]
        print(topic_tmp)
        
        #create dataframe
        df_topic=pd.DataFrame(data=df[df[topic_tmp]==1].publish_date.value_counts().sort_index().reset_index())
        df_topic.rename(columns={'index':'Date', 'publish_date': 'Count'}, inplace=True)

        #get date  where tweet activity is above the thresold
        threshold=df_topic.Count.mean() + 2*df_topic.Count.std()
        dates_thr=df_topic[df_topic.Count >= threshold].index
        #less than threshold?
        dates=[df_topic.loc[date,'Date'] for date in dates_thr if df_topic.loc[date-1,'Count'] < threshold]
        
        for date in dates:
            nb_dates+=1 #counter for recall score
            event=event_scrapper(date)
            
            try: 
                #matching the description and the words of the topic
                matching_score=event.Stemmed_Content.apply(lambda words: len(set(words) & set(topic)))
                max_match=matching_score.max() 
                
                if (max_match > 0) : #avoid empty results
                    nb_detected+=1#counter for recall score
                   
                    #if two descriptions have the same matching score
                    for idx_max_match in matching_score[matching_score==max_match].index:
                    
                        tmp=pd.DataFrame({'Date': [event.loc[idx_max_match, 'Date']], 'Topic': [topic_tmp], 'Event': [event.loc[idx_max_match, 'Description']]})
                        events_df=events_df.append(tmp, ignore_index=True, sort=False)
                        
            
            except AttributeError: #track of potential error
                print(date)
                continue
            
            except TypeError:
                print('No content has been found during a scrap')
    
    events_df.drop_duplicates(inplace=True)
    return events_df, nb_detected/nb_dates 
    
    

In [52]:
events_df, recall =event_detector(df, topic_extended)

music
trump
hillari
religion
fear
blacklivesmatt
voter
terror
hack
money
elect
charlottesvil
economi


In [53]:
print('The event detector matched {:0.2f}% ({}) of the tweet peak'.format(100*recall, events_df.shape[0]))

The event detector matched 80.51% (210) of the tweet peak


### Plotting

In [61]:
plot_threshold=True
topic_plot = figure(plot_width=950, plot_height=600, x_axis_type='datetime', y_range=[-10,4800], toolbar_location=None)

colorplot=Category20[len(topic_extended)] #form the bokeh palettes
start_date = df.publish_date.min()
end_date = df.publish_date.max()
ymax=[]



for color, topic in enumerate(topic_extended): #iterates over each topic
    topic_tmp=topic[0]
    
    #tweets
    df_plot=pd.DataFrame(data=df[df[topic_tmp]==1].publish_date.value_counts().sort_index())
    ymax.append(df_plot.publish_date.max())
    
    
    if plot_threshold:
        threshold=df_plot.publish_date.mean() + 2* df_plot.publish_date.std()
        topic_plt_tmp=topic_plot.line(x=[start_date,end_date],\
                                      y=[threshold,threshold],\
                                      legend=topic_tmp, \
                                      color=colorplot[color], alpha=0.5, )
        topic_plt_tmp.visible=False
    
    source = ColumnDataSource(data=df_plot)
    topic_plt_tmp=topic_plot.line(x='index', y='publish_date', source=source,\
            line_width=2, alpha=0.9, legend=topic_tmp, color=colorplot[color])
    
    hover_tool=tools.HoverTool(
        tooltips=[
            ('Date', '@index{%b %d, %Y}'),
            ('Number of tweets','@publish_date')],
        formatters={
            'index' : 'datetime', 
            'publish_date' : 'printf',},   
        mode='vline',
        attachment='above',
        renderers=[topic_plt_tmp])

    topic_plot.tools.append(hover_tool)
    topic_plt_tmp.visible=False

    #events
    event_plot=events_df[events_df['Topic']==topic_tmp]
    event_plot['Y']=4300
    source_event=ColumnDataSource(data=event_plot)
    event_plot_tmp=topic_plot.scatter(x='Date', y='Y', source=source_event,\
                                      legend=topic_tmp, fill_color=colorplot[color], color=None)
    
    hover_event=tools.HoverTool(
        tooltips=[('Event', '@Event'),\
                  ('Date', '@Date{%b %d, %Y}')],
        formatters={'Event' : 'printf', 'Date' : 'datetime' },  
        mode="vline",
        attachment='below',
        show_arrow=False,
        renderers=[event_plot_tmp])

    topic_plot.tools.append(hover_event)
    event_plot_tmp.visible=False
    
#topic_plt_tmp.visible=True


topic_plot.legend.location = 'top_center'
topic_plot.legend.orientation = "horizontal"
topic_plot.legend.click_policy='hide'
topic_plot.legend.label_text_font_size='8pt'

topic_plot.title.text = 'Tweeting activity according topic'
topic_plot.yaxis.bounds=(0,max(ymax))

output_file('Topic_Events.html')
#output_notebook()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [62]:
show(topic_plot)

In [None]:
import nltk
from nltk.corpus import brown, words
nltk.download('words')

lemmatizer = WordNetLemmatizer()

WORDS_TO_ADD=['donald', 'trump', 'dem', 'gop', 'hillary', 'clinton', 'trayvon', 'tamir', 'rice' \
              'islam', 'fuck', 'nfl', 'kaepernick', 'dnc', 'charlottesville', 'korea', 'sacramento',\
              'blm', 'alt', 'kkk', 'berkeley', 'music', 'michael', 'brown', 'mike']
word_dictionary = list(set(words.words()))

for word in WORDS_TO_ADD:
    word_dictionary.append(word)

for alphabet in "bcdefghjklmnopqrstuvwxyz":
    word_dictionary.remove(alphabet)

    
def retag(tag):
    '''
    Receives tags as treebank (VBD) and changes it in wordnet (v) if exsits
    '''
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    
    
def best_hash_split(hashtag, wordfrequency):
    '''
    Receives a hashtag and the frequency of the most common 5000 english words. If the word or its lemma is
    not in dictionnary, then it splits it. Once split, it assess the most probable split based on word freq. 
    Returns the hashtag if the word (or lemma) is in dictionnary. Returns the best split if it found one.
    If it could find satisfactory split, it returns nothing
    '''
    #getting all possible splits
    if (hashtag not in word_dictionary) | (stemmer.stem(hashtag) not in word_dictionary):
        all_splits = split_hashtag_to_words_all_possibilities(hashtag)
        max_prob = 0
        i = 0
        best_split = 0
        for possible_splits in all_splits:
            #tag the splits
            word_pos = nltk.pos_tag(possible_splits)
            probabilities = []
            for (word, pos) in word_pos:
                #if the word is already in wordfrequency, get its frequency
                if (wordfrequency.Word == word).any():
                    probabilities.append(wordfrequency[wordfrequency.Word.str.lower() == word].max()[2])
                else:

                    freq = (wordfrequency[wordfrequency.Word.str.lower() == lemmatizer.lemmatize\
                                                                            (word, pos=retag(pos))].max()[2])
                    #otherwise, if lemma is in wordfrequ get its frequency
                    if freq > 0:
                        probabilities.append(freq)
                    #if not, keep lowest freq we find.
                    else : 
                        probabilities.append(wordfrequency.Frequency.min())
            split_prob = np.prod(probabilities)
            if split_prob > max_prob:
                max_prob = split_prob
                best_split = i
            i = i + 1
            print(max_prob)
        if (len(all_splits) != 0) & (max_prob > np.exp(-25)):            
            return all_splits[best_split]  
        else:
            return [hashtag]
    else:
        return [hashtag]
               
    
def split_hashtag_to_words_all_possibilities(hashtag):
    '''
    Receives a hashtag and returns all the possible splits.
    '''
    hashtag = hashtag.lower()
    all_possibilities = [] 
    split_posibility = [hashtag[:i] in word_dictionary for i in reversed(range(len(hashtag)+1))]
    possible_split_positions = [i for i, x in enumerate(split_posibility) if x == True]
    for split_pos in possible_split_positions:
        split_words = []
        word_1, word_2 = hashtag[:len(hashtag)-split_pos], hashtag[len(hashtag)-split_pos:]
        if word_2 in word_dictionary:
            split_words.append(word_1)
            split_words.append(word_2)
            all_possibilities.append(split_words)

            another_round = split_hashtag_to_words_all_possibilities(word_2)

            if len(another_round) > 0:
                all_possibilities = all_possibilities + [[a1] + a2 for a1, a2, in zip([word_1]*len(another_round), another_round)]
        else:
            another_round = split_hashtag_to_words_all_possibilities(word_2)
            
            if len(another_round) > 0:
                all_possibilities = all_possibilities + [[a1] + a2 for a1, a2, in zip([word_1]*len(another_round), another_round)]
                      
    return all_possibilities
