In [7]:
import gc
import re
import nltk
import string
import unicodedata
import numpy as np
import pandas as pd

from nltk.util import ngrams
from nltk.twitter import Twitter
from nltk import SnowballStemmer
from nltk.corpus import stopwords 
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

pd.set_option('max_colwidth', -1)
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)

base_path = r'C:/Users/sshouche/Desktop/Tweet Classification/'

In [None]:
"""One time processing to get sector tagged data"""

# drop_cols = ['Companies', 'Locations', 'Themes', 'Tweethistoryid']

# for month in ['Apr', 'May', 'Jun', 'Jul', 'Aug']:
#     dfs = pd.read_excel(base_path+r'TweetDump_last6months/TweetsDump_'+str(month)+'.xlsx', sheet_name=None)

#     df = pd.concat(dfs, ignore_index=True)
#     df = df.drop(columns=drop_cols, axis=1)
#     df = df.dropna()

#     df.to_json(base_path+r'TweetDump_last6months/Tweets_'+str(month)+'2019.json')

#     del df, dfs
#     gc.collect()

In [None]:
"""One time processing to combine sector tagged data"""

# for month in ['Apr', 'May', 'Jun', 'Jul', 'Aug']:
#     df = pd.read_json(base_path+r'TweetDump_last6months/Tweets_'+str(month)+'2019.json')
#     if month=='Apr':
#         df_combo = df
#     else:
#         df_combo = pd.concat([df_combo, df], axis=0)

# del df
# gc.collect()

# df_combo = df_combo.reset_index()
# df_combo['Sectors'] = df_combo['Sectors'].apply(lambda x: [str(y).strip() for y in str(x).split(',')])
# df_combo.to_json(base_path+r'TweetDump_last6months/Tweets_Combo.json')

In [None]:
######################## DATA JOINING COMPLETE ########################

In [8]:
"""Data and Helper Functions"""
freqdist = nltk.FreqDist()

# nltk.download('wordnet')
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

new_words = ["one", "two", "three", "four", "five", "six", "seven","eight","nine", "zero", 
             "ten", "twenty", "thirty", "fourty", "fifty", "sixty", "seventy", "eighty", "ninty", 
             "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth",
             "hundred", "thousand", "million", "billion", "trillion", 
             "hundreds", "thousands", "millions", "billions", "trillions",
             "world", "today", "would", "could", "future", "people", 
             '...', 'via', 'see', 'new', 'end', 'amp', 
             'like', 'time', 'need', 'know', 'ever']
stop_words = list(stop_words.union(new_words))

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

tweet_tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
 
# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])
 
# all emoticons (happy + sad)
emoticons = emoticons_happy.union(emoticons_sad)


emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)



In [9]:
"""Helper Functions"""
def get_links(tweet):
    reg_ex = r'((?:(?:https?|ftp|file):\/\/|www\.|ftp\.)*(?:[-a-zA-Z0-9@:%_\+~.#=]{2,256})?([-a-zA-Z0-9@:%_\+~#=]*)\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)*)'
    link_regex = re.compile(reg_ex, re.DOTALL)
    links = re.findall(link_regex, tweet)
    link_list = []
    for link in links:
        link_list.append(link[0])    
    return link_list

def get_tickers(tweet):
    ticker_regex = re.compile(r'\$\w*', re.DOTALL)
    tickers = re.findall(ticker_regex, tweet)
    ticker_list = []
    for ticker in tickers:
        ticker_list.append(ticker[0])    
    return ticker_list

def get_special(tweet, special_prefixes=['@', '#']):
    words_list = []
    for word in tweet.split():
        word = word.strip()
        if word:
            if word[0] in special_prefixes:
                words_list.append(word)
    return words_list

def strip_links(tweet):
    # reg_ex = r'((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)'
    # ((\b)?((https|https|ftp|file):\/\/)|(www\.|ftp\.))?((?:[-a-zA-Z0-9@:%_\+~.#=]{2,256}\.)?([-a-zA-Z0-9@:%_\+~#=]*)\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&\/\/=]*))*(\b)?
    reg_ex = r'((?:(?:https?|ftp|file):\/\/|www\.|ftp\.)*(?:[-a-zA-Z0-9@:%_\+~.#=]{2,256})?([-a-zA-Z0-9@:%_\+~#=]*)\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)*)'
    tweet = re.sub(reg_ex, ' ', tweet)
    # link_regex = re.compile(reg_ex, re.DOTALL)
    # links = re.findall(link_regex, tweet)
    # print (links)
    # for link in links:
    #     tweet = tweet.replace(link[0], ' ')    
    return tweet


def strip_special(tweet, special_prefixes=['@', '#']):
    for separator in string.punctuation:
        if separator not in special_prefixes:
            tweet = tweet.replace(separator,' ')
    words_list = []
    for word in tweet.split():
        word = word.strip()
        if word:
            if word[0] not in special_prefixes:
                words_list.append(word)
    return ' '.join(words_list)


def clean_tweets(tweet, keep_list=[]):
    tweet = re.sub(r"(Dr\.)", "Doctor ", tweet)
    tweet = re.sub(r'^RT[\s]+', ' ', tweet) # remove old style retweet text "RT"
    
    if '$' not in keep_list:
        tweet = re.sub(r'\$\w*', ' ', tweet)     # remove stock market tickers like $GE
    
    tweet = re.sub(r'(?:\.?)([\w\-_+#~!$&\'\.]+(?<!\.)(@|[ ]?\(?[ ]?(at|AT)[ ]?\)?[ ]?)\
                   (?<!\.)[\w]+[\w\-\.]*\.[a-zA-Z-]{2,3})(?:[^\w])', ' ', tweet) # remove emails
    tweet = strip_links(tweet) # remove links
    
    if '@' not in keep_list:
        tweet = re.sub(r'\@[\w.]*', ' ', tweet) #remove mentions
    
    if '#' not in keep_list:
        tweet = re.sub(r'\#[\w.]*', ' ', tweet) # remove # from the hashtags
        # tweet = re.sub(r'#', '', tweet)

    tweet = re.sub(r'([\d]+)([\.]{1}[\d]*)*', ' ', tweet) # remove numbers
    tweet = re.sub(r'([\d]+)([\/]{1}[\d]+)', ' ', tweet) # remove fractions
    tweet = re.sub(r'([\d+])', ' ', tweet) # remove integers
    
    tweet = tweet.lower()
    tweet = re.sub(r"what‚Äôs", "what is ", tweet)
    tweet = re.sub(r"\'s", " ", tweet)
    tweet = re.sub(r"\'ve", " have ", tweet)
    tweet = re.sub(r"can't", "can not ", tweet)
    tweet = re.sub(r"n't", " not ", tweet)
    tweet = re.sub(r"i'm", "i am ", tweet)
    tweet = re.sub(r"\'re", " are ", tweet)
    tweet = re.sub(r"\'d", " would ", tweet)
    tweet = re.sub(r"\'ll", " will ", tweet)
    tweet = re.sub(r"\'scuse", " excuse ", tweet)
    tweet = re.sub("\W", " ", tweet) # remove single char words
    tweet = re.sub("\s+", " ", tweet) # remove continuous spaces
    tweet = tweet.strip(" ")
    tweet = unicodedata.normalize('NFKD', tweet).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # symbol left after removing mentions, hashtags, links, emails, etc.
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'/', '', tweet)
    tweet = re.sub(r'‚Äö√Ñ¬∂', '', tweet)
    #replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
    #remove emojis from tweet
    tweet = emoji_pattern.sub(r'', tweet)
    tweet = re.sub(r'_',' ', tweet)
    tweet = tweet.strip(" ")
    tweet = ' '.join([x for x in tweet.split()])
    
    return tweet


def process_tweets(tweet, lemma_stem=True):
    tweet = re.sub(r'\#', '', tweet)
    
    tweet = [x for x in tweet.split() if len(x)>1]
    tweet = [x for x in tweet if x not in stop_words]
    tweet = [x for x in tweet if x not in emoticons]
    tweet = [x for x in tweet if x not in string.punctuation]

    tweet = ' '.join(tweet)
    
    # tokenize tweets
    tweet_tokens = tweet_tokenizer.tokenize(tweet)

    bigrams = nltk.bigrams(tweet_tokens)
    bigrams = ['_'.join(x) for x in bigrams]

    tweet_tokens = list(tweet_tokens+bigrams)
    
    tweets_clean = []    
    for word in tweet_tokens:
        if lemma_stem:
            lemmatize_word = lemmatizer.lemmatize(word)
            stem_word = stemmer.stem(lemmatize_word) # stemming word
            freqdist[stem_word]+=1
            tweets_clean.append(stem_word)
        else:
            freqdist[word]+=1
            tweets_clean.append(word)

    return tweets_clean

In [47]:
# df = pd.read_json(base_path+r'TweetDump_last6months/Tweets_Combo.json')
# df['isTagged']=0
# df['sectorTags']=''
# df['verticalTags']=''

# df = df[pd.notnull(df['TweetFulltext'])]
# df.to_json(base_path+r'TweetDump_last6months/Tweets_Sample.json')

In [3]:
"""Load Combined & Tagged data"""
df_combo = pd.read_json(base_path+r'TweetDump_last6months/Tweets_Combo.json')
# df_combo.head()

In [10]:
"""Collect links, mentions, hashtags and tickers, and process Tweets"""
gc.collect()

# Get links, mentions, hashtags, tickers
# df_combo['Links'] = df_combo['TweetFulltext'].apply(lambda x: get_links(str(x)))
# df_combo['Mentions'] = df_combo['TweetFulltext'].apply(lambda x: [str(y).replace(':', '') for y in get_special(str(x), ['@'])])
# df_combo['Hashtags'] = df_combo['TweetFulltext'].apply(lambda x: get_special(str(x), ['#']))
# df_combo['Tickers'] = df_combo['TweetFulltext'].apply(lambda x: get_tickers(str(x)))

# Process tweet
df_combo['CleanText'] = df_combo['TweetFulltext'].apply(lambda x: clean_tweets(str(x), keep_list=['#', '@', '$']))
df_combo['ProcessedText'] = df_combo['CleanText'].apply(lambda x: process_tweets(str(x), lemma_stem=False))

# Process tweet
# df_combo['ProcessedText_ls'] = df_combo['CleanText'].apply(lambda x: process_tweets(str(x)))

df_combo[['index', 'id', 'ProcessedText']].to_json(base_path+r'TweetDump_last6months/Tweets_Input.json')
df_combo.head()

Unnamed: 0,index,id,ScreenName,TweetFulltext,Sectors,CleanText,ProcessedText
0,100000,874696,NealSchaffer,‚ÄòDo brands ever check on the welfare of influencers?‚Äô: YouTube stars confront mental health issues https://t.co/CIS2lnSUI1 #influencermarketing https://t.co/JfHmY8ZrVp,"[Advertising, Application Software]",do brands ever check on the welfare of influencers youtube stars confront mental health issues influencermarketing,"[brands, check, welfare, influencers, youtube, stars, confront, mental, health, issues, influencermarketing, brands_check, check_welfare, welfare_influencers, influencers_youtube, youtube_stars, stars_confront, confront_mental, mental_health, health_issues, issues_influencermarketing]"
1,100006,874702,NealSchaffer,"Microinfluencers do endorsements right, and here are six awesome examples. https://t.co/l8RDTmWuj1 #influencermarketing https://t.co/4XAREMPD5G",[Advertising],microinfluencers do endorsements right and here are six awesome examples influencermarketing,"[microinfluencers, endorsements, right, awesome, examples, influencermarketing, microinfluencers_endorsements, endorsements_right, right_awesome, awesome_examples, examples_influencermarketing]"
10,100088,874784,NealSchaffer,Six Tips for Using Promoted Tweets for Your Ecommerce Marketing Strategy via @ryankhgb #twitter #marketing https://t.co/B4lKQFiZVs https://t.co/n1tOJf2Qjb,"[Advertising, Ecommerce]",six tips for using promoted tweets for your ecommerce marketing strategy via ryankhgb twitter marketing,"[tips, using, promoted, tweets, ecommerce, marketing, strategy, ryankhgb, twitter, marketing, tips_using, using_promoted, promoted_tweets, tweets_ecommerce, ecommerce_marketing, marketing_strategy, strategy_ryankhgb, ryankhgb_twitter, twitter_marketing]"
100,1003910,1778605,jetcitystar,#NasaSocial #NasaSocial #NASA747 #NASAAmes #NASAArmstrong #SOFIAtelescope #NASA #AdrianaSays #CorporateCode #USAF #Veterans #JAGNV #MoffetField #bluecube #HeartMathCoach #johnmaxwellcoach #familia #colegas #entrenamiento #mettingplanners #speaker #speakers #espanol https://t.co/m7L3RjFZ6t,[Aerospace],nasasocial nasasocial nasa nasaames nasaarmstrong sofiatelescope nasa adrianasays corporatecode usaf veterans jagnv moffetfield bluecube heartmathcoach johnmaxwellcoach familia colegas entrenamiento mettingplanners speaker speakers espanol,"[nasasocial, nasasocial, nasa, nasaames, nasaarmstrong, sofiatelescope, nasa, adrianasays, corporatecode, usaf, veterans, jagnv, moffetfield, bluecube, heartmathcoach, johnmaxwellcoach, familia, colegas, entrenamiento, mettingplanners, speaker, speakers, espanol, nasasocial_nasasocial, nasasocial_nasa, nasa_nasaames, nasaames_nasaarmstrong, nasaarmstrong_sofiatelescope, sofiatelescope_nasa, nasa_adrianasays, adrianasays_corporatecode, corporatecode_usaf, usaf_veterans, veterans_jagnv, jagnv_moffetfield, moffetfield_bluecube, bluecube_heartmathcoach, heartmathcoach_johnmaxwellcoach, johnmaxwellcoach_familia, familia_colegas, colegas_entrenamiento, entrenamiento_mettingplanners, mettingplanners_speaker, speaker_speakers, speakers_espanol]"
1000,1018644,1793339,pinstripedline,RT @RoyalNavy: The last Royal Navy helicopter to fly on maritime security operations in Oman has returned to the UK after a ten-year missio‚Ä¶,[Maritime],royalnavy the last royal navy helicopter to fly on maritime security operations in oman has returned to the uk after a ten year missio,"[royalnavy, last, royal, navy, helicopter, fly, maritime, security, operations, oman, returned, uk, year, missio, royalnavy_last, last_royal, royal_navy, navy_helicopter, helicopter_fly, fly_maritime, maritime_security, security_operations, operations_oman, oman_returned, returned_uk, uk_year, year_missio]"


In [5]:
# """Get keyword frequency distribution"""
df_dist = pd.DataFrame.from_dict(freqdist, orient='index')
df_dist = df_dist.reset_index()
df_dist.columns = ['Keyword', 'Frequency']
df_dist = df_dist[df_dist['Frequency']>10]
df_dist.to_csv(base_path+r'TweetDump_last6months/Tweets_Keywords_no_lemma_stem.csv')
df_dist.head()

ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [None]:
######################## DATA CLEANUP COMPLETE ########################

In [3]:
"""Data Model"""
import fasttext

N_TAGS = 3

df_combo = pd.read_json(base_path+r'TweetDump_last6months/Tweets_Input.json')

df_combo['Labels'] = df_combo['Sectors'].apply(lambda x: ' '.join(['__label__' + y.replace(' ', '_') for y in x]))

df_combo['InTextData'] = df_combo['Labels'] + ' ' + df_combo['ProcessedText'].apply(lambda x: ' '.join(x))
df_combo['InCleanData'] = df_combo['Labels'] + ' ' + df_combo['CleanText']

df_combo[['InTextData']].dropna().to_csv(base_path+r'InTextData.txt', header=None, index=None, sep=' ')
df_combo[['InCleanData']].dropna().to_csv(base_path+r'InCleanData.txt', header=None, index=None, sep=' ')

In [21]:
df_tags = pd.read_csv(base_path+r'TaggingTweets.csv')
df_tags['Tags'] = df_tags['Tags'].apply(lambda x: x.split('`'))

industry_list = ['ADAS', 'Automotive', 'Banking & Payments', 'Construction', 'Consumer', 'Foodservice', 'Insurance', 
                 'Medical', 'Mining', 'Oil & Gas', 'Packaging', 'Pharma', 'Power', 'Retail', 'Technology', 'Travel & Tourism']

for ind in industry_list:
    df_tags[df_tags['Tags'].apply(lambda x: ind in x)][['Keyword']].dropna().to_csv(base_path+ind+r'.txt', header=None, index=None, sep=' ')

    
df_combo['ProcessedTextCopy']  = df_combo['ProcessedText'].copy()
# df_combo['TagsText'] = df_combo[['Tags']].apply(lambda x: [y for y in x if y in tags])

In [None]:
df_combo['IndustryTags'] = [[]] * len(df_combo)
for ind in industry_list:
    ind_list = df_tags[df_tags['Tags'].apply(lambda x: ind in x)][['Keyword']]
    df_combo['IndustryTags'] = df_combo['IndustryTags'].apply(lambda z: z.append(df_combo['ProcessedText'].apply(lambda x: [ind for y in x if y in ind_list])))

In [None]:
df_combo.head()

In [13]:
clean_model = fasttext.train_supervised(base_path+r'InCleanData.txt', lr=0.05, dim=100, ws=5, epoch=5, word_ngrams=2, loss='softmax', verbose=0)
clean_model.save_model(base_path+r'clean_model.bin')

In [None]:
# text_model = fasttext.train_supervised(base_path+r'InTextData.txt', lr=0.05, dim=100, ws=5, epoch=5, loss='softmax', verbose=0)
# text_model.save_model(base_path+r'text_model.bin')

In [None]:
# hashtag_model = fasttext.train_supervised(base_path+r'InHashtagData.txt', lr=0.05, dim=100, ws=5, epoch=5, loss='softmax', verbose=0)
# hashtag_model.save_model(base_path+r'hashtag_model.bin')

In [None]:
in_clean = '#AI & #machinelearning let us discover solutions in a faster and more agile way than ever before:'
in_clean = clean_tweets(str(in_clean), keep_list=['#'])
print (in_clean)
clean_result = clean_model.predict([in_clean], N_TAGS)
print (clean_result)


In [None]:
# df_combo.sample(5)

In [2]:
import pandas as pd

In [5]:
pd.read_json('Tweets_Combo.json')

Unnamed: 0,index,id,ScreenName,TweetFulltext,Sectors
0,100000,874696,NealSchaffer,‚ÄòDo brands ever check on the welfare of influe...,"[Advertising, Application Software]"
1,100006,874702,NealSchaffer,"Microinfluencers do endorsements right, and he...",[Advertising]
10,100088,874784,NealSchaffer,Six Tips for Using Promoted Tweets for Your Ec...,"[Advertising, Ecommerce]"
100,1003910,1778605,jetcitystar,#NasaSocial #NasaSocial #NASA747 #NASAAmes #NA...,[Aerospace]
1000,1018644,1793339,pinstripedline,RT @RoyalNavy: The last Royal Navy helicopter ...,[Maritime]
10000,128748,903444,evankirstel,Are the Marines getting night-vision drones? h...,"[Aerospace, Maritime, Weaponry]"
100000,1543159,1543158,pietrosd,RT @RathfinnyEstate: Georgia heads up our wine...,[Intermediaries]
100001,1543161,1543160,pietrosd,RT @fabienlaine: Office for the afternoon ü§©üç∑ye...,[Alcoholic Beverages]
100002,1543166,1543165,pietrosd,RT @wineworldnews: This is the #wine you shoul...,[Alcoholic Beverages]
100003,1543168,1543167,pietrosd,RT @DemiCassiani: Take Life One Sip At A Time ...,[Alcoholic Beverages]


In [4]:
"""Model training"""
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

df_combo = pd.read_json(r'Tweets_Input.json')
# df_combo_2 = df_combo.join(pd.DataFrame(mlb.fit_transform(df_combo['Sectors']), columns=mlb.classes_, index=df_combo.index))

y = mlb.fit_transform(df_combo.pop('Sectors'))
X = df_combo['CleanText']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = OneVsRestClassifier(MultinomialNB())

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

print (accuracy_score(y_test, y_pred))


KeyError: 'Sectors'

In [None]:
print (df['Sectors'].shape[0])
print (df['Sectors'].isna().sum())
print (df['Sectors'].nunique())

In [None]:
# x=list(df['Themes'].values)

In [None]:
# x.sort()

In [None]:
df.drop(['Locations','Companies','Sectors','Tweethistoryid','ScreenName'],axis=1,inplace=True)

In [None]:
df.drop(['Themes'],axis=1,inplace=True)

In [None]:
# from sklearn import preprocessing
# le = preprocessing.LabelEncoder()
# le.fit(df['Themes'])
# le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
# print(le_name_mapping)

In [None]:
# y=['3D Printing', '5G', 'Artificial Intelligence', 'Cloud', 'Digital Assistants', 'E-commerce', 'Internet of Things', 'Mobile']

In [None]:
Insurance
ADS
Automotive
Banking_Payments
Construction
Consumer
Food_Service
Medical_Devices
Mining
Packaging
Pharma
Power
Retail
Technology
Travel_Tourism
Upstream

In [None]:
df['Sector']=''

In [None]:
df.loc[df['TweetFulltext'].str.contains('|'.join(Upstream))==True,'Sector']='Upstream'

In [None]:
df.head(200)

In [None]:
df.to_excel('Apr_Training_data.xlsx')

In [None]:
df3=pd.read_excel(r'C:\Users\kasandeep\Desktop\Projects\Twitter_Sector_Classification\Sector View Keywords\Consolidated IC and Sector Screeners & Keywords.xlsx',sheet_name=12)

In [None]:
df3.head()

In [None]:
x=list(df3['Screener/Filter word'])

In [None]:
x=[x.strip("#|@|'|\xa0|\u200f") for x in x]

In [None]:
x=[x.replace(" ","") for x in x]

In [None]:
x=set([x.replace("-","") for x in x])

In [None]:
x=set([x.replace("'","") for x in x])

In [None]:
x=set([x.lower() for x in x])

In [None]:
set(x)

In [None]:
# y=[]
# from nltk.stem import PorterStemmer
# stemmer= PorterStemmer()
# for i in x:
#     y.append(stemmer.stem(i))

In [None]:
# set(y)

In [None]:
z=[]
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
for i in x:
    z.append(wordnet_lemmatizer.lemmatize(i))

In [None]:
set(z)

In [None]:
with open(r'C:\Users\kasandeep\Desktop\Projects\Twitter_Sector_Classification\Sector View Keywords\Travel_Tourism_words.txt', mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(str(line) for line in set(Travel_Tourism)))
    myfile.close()


In [None]:
Insurance
ADS
Automotive
Banking_Payments
Construction
Consumer
Food_Service
Medical_Devices
Mining
Packaging
Pharma
Power
Retail
Technology
Travel_Tourism
Upstream

In [None]:
f = open(r"C:\Users\kasandeep\Desktop\Projects\Twitter_Sector_Classification\Sector View Keywords\Upstream_words.txt", "r",encoding="utf8")
z=Upstream
for x in f:
      z.append(x.strip('\n'))

In [None]:
z

In [None]:
Upstream=[]
for i in set(z):
    if sum(i in s for s in df['TweetFulltext'])>50:
        Upstream.append(i)
        print(i,sum(i in s for s in df['TweetFulltext']))

In [None]:
f = open(r"C:\Users\kasandeep\Desktop\Projects\Twitter_Sector_Classification\Sector View Keywords\Packaging.txt", "r",encoding="utf8")
z=[]
for x in f:
      z.append(x.strip('\n'))

In [None]:
Consumer=[]
for i in set(z):
    if sum(i in s for s in df['TweetFulltext'])>50:
        Consumer.append(i)
        print(i,sum(i in s for s in df['TweetFulltext']))

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from gensim.models import CoherenceModel
#np.random.seed(2018)
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [None]:
processed_docs = df['TweetFulltext'].map(preprocess)

In [None]:
processed_docs[:10]

In [None]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [None]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

In [None]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score

coherence_model_lda = CoherenceModel(model=lda_model, dictionary=dictionary, texts=processed_docs, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize t# Visualize the topics
# import pyLDAvis.gensim
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
# vis

In [None]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [None]:
corpus_tfidf = tfidf[bow_corpus]

In [None]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

In [None]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        #model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model=gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


In [None]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=bow_corpus, texts=processed_docs, start=2, limit=40, step=6)

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()