In [1]:
import pandas as pd
import spacy
import numpy as np
nlp_parser = spacy.load('en_core_web_sm', disable=["attribute_ruler", "tagger", "lemmatizer", "ner"])
nlp = spacy.load('en_core_web_sm')
spacy_stopwords = nlp.Defaults.stop_words



In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.2/103.2 KB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24


In [None]:
# read the ULR for contractions https://www.educative.io/answers/how-to-deal-with-contractions-in-nlp
# this package helps to handle words like aren't, wasn't, ...etc.
import contractions

contractions.fix("haven't, aren't")
# We use contractions.fix() to expand the shortened words, and append them to the expanded_text in a loop

'have not, are not'

In [None]:
def preprocess_sent(text_string):
    
    # text_string which may contain multiple sentences
    
    # detect sentences using nlp_parser
    sents = []
    doc = nlp_parser(text_string)
    for sent in doc.sents:
        
        # lowercase sentence and resolve contractions like ain't, don't cant
        sent = contractions.fix(sent.text)
        sent = sent.lower()
        sents.append(sent)
    
    # preprocess each sentence as a batch
    docs = nlp.pipe(sents)
    new_sents = ' '
    
    for doc in docs:
    
        # lemmatise sent and filter out punctuations
        sent = ' '.join([token.lemma_ for token in doc if not token.is_punct])

        new_sents =  new_sents + sent + ' '
        
    
    if new_sents == '  ':
        return np.nan
        
    return new_sents

def remove_stopwords(text_string):    
    # remove stopwords
    sent = ' '.join([word for word in text_string.split() if not word in spacy_stopwords])
    return sent
    
def remove_stopwords_keepnegpos(text_string):
    # remove stopwords
    # I kept words in positive and negative set
    sent = ' '.join([word for word in text_string.split() if (word not in spacy_stopwords) or (word in neg_words) or (word in pos_words)])
    return sent

In [None]:
# remove duplicated rows after preprocessing
df_neg_words = pd.read_csv('LM2018N.csv', header=None).iloc[:,0].apply(preprocess_sent)
df_neg_words = df_neg_words.drop_duplicates().dropna()
neg_words = df_neg_words.values.tolist()
df_pos_words = pd.read_csv('LM2018P.csv', header=None).iloc[:,0].apply(preprocess_sent)
# remove duplicated rows after preprocessing
df_pos_words = df_pos_words.drop_duplicates().dropna() 
pos_words = df_pos_words.values.tolist()

## Load and preprocess text

In [None]:
df_zacks = pd.read_csv('zacks_arguments.csv', encoding='latin1')

#### Create a column with the text processed but still has stopwords

In [None]:
%time df_zacks['arguments_clean_processed_w_stopwords'] =  df_zacks['arguments_clean'].apply(preprocess_sent)

CPU times: user 55.8 s, sys: 151 ms, total: 55.9 s
Wall time: 56.4 s


#### Create another column with the text processed and removing stopwords

In [None]:
%time df_zacks['arguments_clean_processed_no_stopwords'] =  df_zacks['arguments_clean_processed_w_stopwords'].apply(remove_stopwords_keepnegpos)

CPU times: user 1.92 s, sys: 4.73 ms, total: 1.92 s
Wall time: 1.95 s


In [None]:
df_zacks

Unnamed: 0,ID,report_name,ticker,report_date,arguments_clean,label,arguments_clean_processed_w_stopwords,arguments_clean_processed_no_stopwords
0,1,"Ambarella, Inc._Attachment1(2).pdf",AMBA,1/30/2018,Ambarella is well known for its market leading...,sell,ambarella be well know for its market lead hi...,ambarella know market lead high performance vi...
1,2,"Ambarella, Inc._Attachment1(2).pdf",AMBA,6/11/2018,Ambarella is well known for its market leading...,sell,ambarella be well know for its market lead hi...,ambarella know market lead high performance vi...
2,3,"Ambarella, Inc._Attachment1(3).pdf",AMBA,10/30/2018,Ambarella is making steady progress on the dev...,buy,ambarella be make steady progress on the deve...,ambarella steady progress development delivery...
3,4,"Ambarella, Inc._Attachment1(3).pdf",AMBA,11/26/2020,Ambarella is making steady progress on the dev...,sell,ambarella be make steady progress on the deve...,ambarella steady progress development delivery...
4,5,"Ambarella, Inc._Attachment1(4).pdf",AMBA,12/3/2018,Ambarella is making steady progress on the dev...,buy,ambarella be make steady progress on the deve...,ambarella steady progress development delivery...
...,...,...,...,...,...,...,...,...
438,439,Yelp Inc._Attachment1.pdf,YELP,11/17/2020,Yelp is benefiting from increasing Advertising...,buy,yelp be benefit from increase advertising rev...,yelp benefit increase advertising revenue driv...
439,440,Yelp Inc_Attachment1(5).pdf,YELP,7/8/2020,Yelp is benefiting from increasing Advertising...,sell,yelp be benefit from increase advertising rev...,yelp benefit increase advertising revenue driv...
440,441,Yelp Inc_Attachment1.pdf,YELP,10/12/2020,Yelp is benefiting from increasing Advertising...,buy,yelp be benefit from increase advertising rev...,yelp benefit increase advertising revenue driv...
441,442,Zynga Inc_Attachment1(5).pdf,ZNGA,8/7/2020,Zynga??s growth is primarily driven by streng...,sell,zynga??s growth be primarily drive by streng...,zynga??s growth primarily drive strength dive...


## Compute sentiment scores using dictionary approach

#### 3 formulas with Lemmatization, without negate:
#### (1) P/D.
#### (2) P/(P+N+1).
#### (3) (P-N)/(P+N+1).
#### D is the number of words in your document, P is the number of positive words, N is the number of Negative words.

In [None]:
# construct regular expression used for detecting the neg and pos words
pat_negwords = '|'.join(neg_words)
pat_poswords = '|'.join(pos_words)

In [None]:
# use str.count method to count occurences of words
df_zacks['num_neg_words'] = df_zacks['arguments_clean_processed_w_stopwords'].str.count(pat_negwords)
df_zacks['num_pos_words'] = df_zacks['arguments_clean_processed_w_stopwords'].str.count(pat_poswords)
df_zacks['doc_len'] = df_zacks['arguments_clean_processed_no_stopwords'].apply(lambda x: len(x.split()))

## Please Fill in the 3 Scoring Formulas and Compute the Output Requested in A2.

In [None]:
df_zacks['score_1'] = df_zacks['num_pos_words'] / df_zacks['doc_len']
df_zacks['score_2'] = df_zacks['num_pos_words'] / (df_zacks['num_pos_words'] + df_zacks['num_neg_words'] + 1)
df_zacks['score_3'] = (df_zacks['num_pos_words'] - df_zacks['num_neg_words']) / (df_zacks['num_pos_words'] + df_zacks['num_neg_words'] + 1)
#print(df_zacks['score_1'],df_zacks['score_2'],df_zacks['score_3'])

## Compute sentiment scores using dictionary approach (with negation)

#### 3 formulas with Lemmatization, with negattion:

#### (1) P/D, D is the number of words in your document. 
#### (2) P/(P+N+1).
#### (3) (P-N)/(P+N+1).

In [None]:
negate = ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt", "ain't", "aren't", "can't",
          "couldn't", "daren't", "didn't", "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt",
          "neither", "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't",
          "never", "none", "nope", "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "wasnt",
          "werent", "oughtn't", "shan't", "shouldn't", "wasn't", "weren't", "without", "wont", "wouldnt", "won't",
          "wouldn't", "rarely", "seldom", "despite", "no", "nobody"]

In [None]:
df_negate = pd.DataFrame(negate)
df_negate = df_negate.iloc[:,0].apply(preprocess_sent)
df_negate = df_negate.drop_duplicates().dropna()
negate = df_negate.iloc[:].values.tolist()
negate

[' be not ',
 ' can not ',
 ' could not ',
 ' dare not ',
 ' do not ',
 ' have not ',
 ' might not ',
 ' must not ',
 ' neither ',
 ' need not ',
 ' never ',
 ' none ',
 ' nope ',
 ' nor ',
 ' not ',
 ' nothing ',
 ' nowhere ',
 ' ought not ',
 ' shall not ',
 ' should not ',
 ' without ',
 ' will not ',
 ' would not ',
 ' rarely ',
 ' seldom ',
 ' despite ',
 ' no ',
 ' nobody ']

In [None]:
# construct regular expression used for detecting the neg and pos words with negation words in front

# for neg words
lst_negated_neg = []
for word_negation in negate:
    for word_neg in neg_words:
        
        # remove empty leading and trailing spaces
        lst_negated_neg.append(word_negation.strip() + word_neg)
        
# for pos words
lst_negated_pos = []
for word_negation in negate:
    for word_pos in pos_words:
        
        # remove empty leading and trailing spaces
        lst_negated_pos.append(word_negation.strip() + word_pos)        

pat_negated_negwords = '|'.join(lst_negated_neg)
pat_negated_poswords = '|'.join(lst_negated_pos)

In [None]:
pat_negated_poswords

'be not able |be not abundance |be not abundant |be not acclaim |be not accomplish |be not accomplishment |be not achieve |be not achievement |be not adequately |be not advancement |be not advance |be not advantage |be not advantageous |be not advantageously |be not alliance |be not assure |be not attain |be not attainment |be not attractive |be not attractiveness |be not beautiful |be not beautifully |be not beneficially |be not benefit |be not good |be not well |be not bolster |be not boom |be not boost |be not breakthrough |be not brilliant |be not charitable |be not collaborate |be not collaboration |be not collaborative |be not collaborator |be not compliment |be not complimentary |be not conclusive |be not conclusively |be not conducive |be not confident |be not constructive |be not constructively |be not courteous |be not creative |be not creatively |be not creativeness |be not creativity |be not delight |be not delightful |be not delightfully |be not dependability |be not depen

In [None]:
# use str.count method to count occurences of words
df_zacks['num_negated_neg_words'] = df_zacks['arguments_clean_processed_w_stopwords'].str.count(pat_negated_negwords)
df_zacks['num_negated_pos_words'] = df_zacks['arguments_clean_processed_w_stopwords'].str.count(pat_negated_poswords)
df_zacks['doc_len'] = df_zacks['arguments_clean_processed_no_stopwords'].apply(lambda x: len(x.split()))

In [None]:
df_zacks['num_negated_pos_words'].value_counts()

0    439
1      4
Name: num_negated_pos_words, dtype: int64

In [None]:
df_zacks['num_negated_neg_words'].value_counts()

0    443
Name: num_negated_neg_words, dtype: int64

In [None]:
# when one word is detected negated, it means the opposite should +1, while itself should - 1
df_zacks['num_pos_words_negated'] = df_zacks['num_pos_words'] + df_zacks['num_negated_neg_words'] - df_zacks['num_negated_pos_words']
df_zacks['num_neg_words_negated'] = df_zacks['num_neg_words'] + df_zacks['num_negated_pos_words'] - df_zacks['num_negated_neg_words']

In [None]:
df_zacks['score_1_negated'] = (df_zacks['num_pos_words_negated']) / df_zacks['doc_len']
df_zacks['score_2_negated'] = (df_zacks['num_pos_words_negated']) / (df_zacks['num_pos_words_negated'] + df_zacks['num_neg_words_negated'] + 1)
df_zacks['score_3_negated'] = (df_zacks['num_pos_words_negated']- df_zacks['num_neg_words_negated']) / (df_zacks['num_pos_words_negated'] + df_zacks['num_neg_words_negated'] + 1)

In [None]:
df_sortedBy_score_1 = df_zacks.sort_values('score_1', ascending=False, ignore_index=False).reset_index()
print(f'Sorting by score1, the label counts of top 100 positive arguments:')
print(df_sortedBy_score_1['label'][:100].value_counts())
df_sortedBy_score_2 = df_zacks.sort_values('score_2', ascending=False, ignore_index=False).reset_index()
print(f'Sorting by score2, the label counts of top 100 positive arguments:')
print(df_sortedBy_score_2['label'][:100].value_counts())
df_sortedBy_score_3 = df_zacks.sort_values('score_3', ascending=False, ignore_index=False).reset_index()
print(f'Sorting by score3, the label counts of top 100 positive arguments:')
print(df_sortedBy_score_3['label'][:100].value_counts())
df_sortedBy_score_1_negated = df_zacks.sort_values('score_1_negated', ascending=False, ignore_index=False).reset_index()
print(f'Sorting by score1, the label counts of top 100 positive arguments:')
print(df_sortedBy_score_1_negated['label'][:100].value_counts())
df_sortedBy_score_2_negated = df_zacks.sort_values('score_2_negated', ascending=False, ignore_index=False).reset_index()
print(f'Sorting by score2_negated, the label counts of top 100 positive arguments:')
print(df_sortedBy_score_2_negated['label'][:100].value_counts())
df_sortedBy_score_3_negated = df_zacks.sort_values('score_3_negated', ascending=False, ignore_index=False).reset_index()
print(f'Sorting by score3_negated, the label counts of top 100 positive arguments:')
print(df_sortedBy_score_3_negated['label'][:100].value_counts())

Sorting by score1, the label counts of top 100 positive arguments:
buy     70
sell    30
Name: label, dtype: int64
Sorting by score2, the label counts of top 100 positive arguments:
buy     76
sell    24
Name: label, dtype: int64
Sorting by score3, the label counts of top 100 positive arguments:
buy     75
sell    25
Name: label, dtype: int64
Sorting by score1, the label counts of top 100 positive arguments:
buy     70
sell    30
Name: label, dtype: int64
Sorting by score2_negated, the label counts of top 100 positive arguments:
buy     76
sell    24
Name: label, dtype: int64
Sorting by score3_negated, the label counts of top 100 positive arguments:
buy     75
sell    25
Name: label, dtype: int64


# **Q1 Summary**
From the result above, we can see score2 has the best performance. While we comparing scores with negated scores, in this case, there is no difference, that is because few negated word is detected (at most one negated word in each comments, and most of the comments have no negated words), so the influence is too slight to be observed. In practical, I think negated process makes the result more accurate, while it consumes more time. It is hard to draw a conclusion whether score2 performs better than score3, because using score3, it has only one less "buy" than score2, which could be easily affected by the quality, volume of the dataset.

# **Q2**

In [None]:
df_startups = pd.read_excel('startups.xlsx',sheet_name=1)
df_startups

Unnamed: 0,ID,Company Name,Date,Description,Industry,Industry2
0,1,Enclarity Inc,2005-01-01,"Enclarity, Inc. is a United States-based healt...",Information Technology,Computer Software and Services
1,2,Ocean Entertainment Inc,2014-01-16,Ocean Entertainment Inc. is introducing the fi...,Non-High Technology,Consumer Related
2,3,Ocean Entertainment Inc,2014-01-16,Ocean Entertainment Inc. is introducing the fi...,Non-High Technology,Consumer Related
3,4,Hengyang Jinzeli Special Allop Co Ltd,1999-12-01,"Hengyang Jinzeli Special Alloy Co., Ltd. is a ...",Non-High Technology,Industrial/Energy
4,5,Verge Solutions LLC,2001-01-01,Verge Solutions LLC is a United States-based c...,Information Technology,Computer Software and Services
...,...,...,...,...,...,...
60084,60085,Knox Holding Corp Co,2018-07-11,Knox Co Holding Corp is a United States-based ...,Information Technology,Internet Specific
60085,60086,Koda Uinta Holdings LLC,2017-02-27,Koda Uinta Holdings LLC is a United States-bas...,Non-High Technology,Industrial/Energy
60086,60087,SolarWorks! Trading BV,2018-01-01,"SolarWorks! develops, manufactures and markets...",Non-High Technology,Industrial/Energy
60087,60088,Wuhan Qizhong Meiwei Technology Co Ltd,2018-06-04,"Wuhan Qizhong Meiwei Technology Co., Ltd. is a...",Information Technology,Internet Specific


In [None]:
df_startups['Description_processed_w_stopwords'] =  df_startups['Description'].apply(preprocess_sent)
df_startups['Description_processed_no_stopwords'] =  df_startups['Description_processed_w_stopwords'].apply(remove_stopwords)
# To save time, I saved the processed .csv
#df_startups.to_csv('startups_processed.csv')
#df_startups = pd.read_csv('startups_processed.csv')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import time

start_time = time.time() # for tracknig time

vectorizer = TfidfVectorizer() # This is a powerful function
# see documentation https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# it includes various default data preprocessing as well
# e.g., tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')

# main step to compute TF-IDF from a list of documents
df_tfidf_desc = vectorizer.fit_transform(df_startups['Description_processed_no_stopwords'])

# create output data frame
#df_tfidf_desc_matrix = pd.DataFrame(response.toarray(),columns=vectorizer.get_feature_names())

#df_tfidf_sklearn.head()

print("--- %s seconds ---" % (time.time() - start_time))

--- 3.2794744968414307 seconds ---


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
df_tf_desc = vectorizer.fit_transform(df_startups['Description_processed_no_stopwords'])

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from lightgbm import LGBMClassifier
start_time = time.time() # for tracknig time
param_grid = {
    'n_estimators': [20,60,100,140,180],
    'max_depth': [4, 5, 6, 7],
    'learning_rate':[0.05,0.1,0.2]
}

# Perform grid search with 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tfidf_grid_search = GridSearchCV(LGBMClassifier(objective='multiclass', random_state=42), param_grid, cv=kf, scoring='accuracy',verbose=1)
tfidf_grid_search.fit(df_tfidf_desc, df_startups['Industry'])
print("--- %s seconds ---" % (time.time() - start_time))

Fitting 5 folds for each of 60 candidates, totalling 300 fits
--- 12170.401759386063 seconds ---


In [None]:
print('Report the best parameters and rmse for best result:')
print('##############################################################################################')
print('Grid Search Best Parameters: ', tfidf_grid_search.best_params_)
print('Grid Search Best accuracy: ', tfidf_grid_search.best_score_)
print('##############################################################################################')

Report the best parameters and rmse for best result:
##############################################################################################
Grid Search Best Parameters:  {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 180}
Grid Search Best Accuracy: 0.8718734372436518
##############################################################################################


In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from lightgbm import LGBMClassifier
import time
start_time = time.time() # for tracknig time
# I found that the data volume is quite huge, the parameters like n_estimators and max_depth
# perform better while setting them larger, to verify whether my finding is correct, here I 
# tune the parameters a little bigger
param_grid = {
    'n_estimators': [100,150,200,250],
    'max_depth': [5, 6, 7, 8],
    'learning_rate':[0.05, 0.1, 0.2,0.3]
}
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tf_grid_search = GridSearchCV(LGBMClassifier(objective='multiclass', random_state=42), param_grid, cv=kf, scoring='accuracy', verbose=2)
tf_grid_search.fit(df_tfidf_desc, df_startups['Industry'])
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
print('Report the best parameters and rmse for best result:')
print('##############################################################################################')
print('Grid Search Best Parameters: ', tf_grid_search.best_params_)
print('Grid Search Best Accuracy: ', tf_grid_search.best_score_)
print('##############################################################################################')

Report the best parameters and rmse for best result:
##############################################################################################
Grid Search Best Parameters:  {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 250}
Grid Search Best Accuracy:  0.8728220393051929
##############################################################################################


# **Q2 Summary**

I found that the data volume is quite huge, the parameters like n_estimators, max_depth and learning_rate all perfer to be larger. So I guess the best parameter could be a little larger than I used to set. To verify whether my guessing, in the second grid search, I tuned the parameters a little bigger. From above we can see, the learning rate keeps at 0.2, the max_depth keeps at 7, which shows they are likely to be the best or local best, as for n_estimators, it seems that it can still go bigger. Due to the limited time (as it took me **3 hours** for a grid search), I didn't make further attempts.

# **Q3**

In [None]:
from collections import Counter
# I tried to use functions below to find those words that are not that usefule for clustering
# Those words appear in half of the topic's top 15 words. I used tfidf as vector, tried
# with limitation max_features = 1000 and without limitation, then picked uselesswords as below:
'''
uselesskeywords = ['commercialize', 'server', 'protocol', 'banking', 'company', 'service',\
                'provide', 'base', 'product', 'technology', 'carbon', 'interest',\
                '10', 'play', 'player', 'identity', 'profile', 'produce',\
                'card', 'east', 'switch']
'''
uselesskeywords = ['company',
                  'product',
                  'offer',
                  'include',
                  'base',
                  'united',
                  'service',
                  'provide',]
# Function is as below
def common_words(sorted_words, nTop):
# **parameters** sorted_words: sorted_words is the most frequent words of the result of LDA 
# **parameters** nTop: Select how many top words to detect, for example, if I select 15, and the length
# of the sorted_words is 30, then it judges whether the word is the top 15 words of other topics.
# **return** : A common wordslist that (moer than)half of the topic have the words as nTop words.
  ntopics = len(sorted_words)
  nwords = len(sorted_words[0])
  wordslist = []
  for i in sorted_words:
    if nwords > nTop:
      wordslist = wordslist + i[:nTop]
    else:
      wordslist = wordslist + i
  counts = Counter(wordslist)
  common_wordslist = []
  for k,v in counts.items():
    if v > ntopics // 2:
      common_wordslist.append(k)
  return common_wordslist

def remove_uselesskeywords(text_string, uselesskeywords=uselesskeywords):    
    # Function to remove uselesskeywords
   
    sent = ' '.join([word for word in text_string.split() if not word in uselesskeywords])
    return sent
df_startups['Description_processed_no_stopwords_rm_uselesskeywords'] = df_startups['Description_processed_no_stopwords'].apply(remove_uselesskeywords)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import time

def topic2Label(numtopic, topics, indices):
  # This function is used for checking what label the topic most likely correspond to.
  # It firstly counts how many documents of a fixed lable are allocated to topic[i], then
  # it regards the topic which owns the most documents as the label.
  # **parameters** numtopic: How many topics are there.
  # **parameters** topics: topics list, topic[i] means document i is topic[i]
  # **parameters** indices: the origin index documents with aimed label. For example, the indices of documents
  # that are 'Information Technology' in Industry1.[0,4,...]
  # **return**: returns the topic No. that the most likely correspond to the label
  res = [0 for _ in range(numtopic)]
  for i in indices:
    res[topics[i]] += 1
  for i in range(len(res)):
    print(f'There are {res[i]} documents in topic[{i}].')
  return np.argmax(np.array(res))

def LDAprocess(n_components, max_features, documents, indusrtry, industryLabel):
  # This is the whole process, including tfidf matix generating, lda model fiting, and calculate precision and recall.
  # **parameters** n_components: number of topics
  # **parameters** max_features: how many features of tf-idf you want to keep
  # **parameters** documents: the input text for text modeling
  # **parameters** industry: Industry1 or Industry2
  # **parameters** industryLabel: What exact is the industry that we want to learn
  
  start_time = time.time()

  vect =TfidfVectorizer(max_features=max_features) 
  vect_text=vect.fit_transform(documents)

  # the following is the key function for the basic LDA
  lda_model=LatentDirichletAllocation(n_components=n_components,random_state=88,n_jobs=3) # n_jobs could be important now because LDA is slow
  # parameters:
  # n_components is the number of topics
  # doc_topic_priorfloat, default=None => you can specify the prior probability of topics
  # topic_word_priorfloat, default=None => you can specify the prior probability of words
  # learning_method{‘batch’, ‘online’}, default=’batch’: if the data size is large, the online update will be much faster than the batch update.

  # the training step 
  lda_top=lda_model.fit_transform(vect_text)


  vocab = vect.get_feature_names_out()

  for i, comp in enumerate(lda_model.components_):
      vocab_comp = zip(vocab, comp)
      sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10] # this is where you change the top 10 to topX
      print("Topic "+str(i)+": ")
      for t in sorted_words:
          print(t[0],end=" ")
      print("\n")

  # Create Document-Topic Matrix that is much easier to browse in excel
  lda_output = lda_model.transform(vect_text)

  # column names
  topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]

  # index names
  docnames = ["Doc" + str(i) for i in range(len(df_startups))]

  # Make the pandas dataframe
  df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

  # Get dominant topic for each document
  dominant_topic = np.argmax(df_document_topic.values, axis=1)
  df_document_topic['dominant_topic'] = dominant_topic

  Industry_indices = df_startups.loc[df_startups[indusrtry] == industryLabel].index.tolist()

  topicNo = topic2Label(n_components, dominant_topic.tolist(), Industry_indices) #计算各个topic有多少个（该选中的industry的）documents中，最多的就对应这个industry
  print(f'The most likely topic is topic[{topicNo}]')
  tp = 0
  fp = 0
  fn = 0

  for i in range(df_startups.shape[0]):
    if dominant_topic[i] == topicNo:
      if df_startups[indusrtry][i] == industryLabel:
        tp += 1
      else:
        fp += 1
    elif df_startups[indusrtry][i] == industryLabel:
      fn += 1
  print(f'tp:{tp},fp:{fp},fn:{fn}')
  lda_precision = tp/(tp+fp+1)
  lda_recall = tp / (tp+fn+1)

  print(f'{n_components} Topics: precision = {lda_precision}, recall = {lda_recall}.')

  print("\n--- Function LDAprocess takes %s seconds ---" % (time.time() - start_time))

In [None]:
# 3 topics, tf-idf max_features = 1000, text without removing useless keywords
LDAprocess(3,1000,df_startups['Description_processed_no_stopwords'], 'Industry', 'Information Technology')

Topic 0: 
company medical develop product health care treatment disease drug patient 

Topic 1: 
company service product engage co ltd china base include technology 

Topic 2: 
company service software platform provide solution application datum user mobile 

There are 2182 documents in topic[0].
There are 7488 documents in topic[1].
There are 26268 documents in topic[2].
The most likely topic is topic[2]
tp:26268,fp:2803,fn:9670
3 Topics: precision = 0.9035498073747936, recall = 0.7309051448287376.

--- Function LDAprocess takes 361.04467940330505 seconds ---


In [None]:
# 3 topics, tf-idf max_features = 1000, text  removes useless keywords
LDAprocess(3,1000, df_startups['Description_processed_no_stopwords_rm_uselesskeywords'], 'Industry', 'Information Technology')

Topic 0: 
engage ltd co china operate manufacture energy equipment mainly business 

Topic 1: 
develop medical health disease care treatment drug patient game inc 

Topic 2: 
software solution management platform datum application mobile business user network 

There are 7445 documents in topic[0].
There are 5496 documents in topic[1].
There are 22997 documents in topic[2].
The most likely topic is topic[2]
tp:22997,fp:3189,fn:12941
3 Topics: precision = 0.8781838316722038, recall = 0.6398898132947495.

--- Function LDAprocess takes 353.20115208625793 seconds ---


In [None]:
# 3 topics, tf-idf max_features = None, text without moving useless keywords
LDAprocess(3,None,df_startups['Description_processed_no_stopwords'], 'Industry', 'Information Technology')

Topic 0: 
company service provide base platform solution business product software offer 

Topic 1: 
company develop medical technology system product base health device solution 

Topic 2: 
election kubernete payday pino lunar nosql elasticsearch fs chitosan pantheon 

There are 29586 documents in topic[0].
There are 6339 documents in topic[1].
There are 13 documents in topic[2].
The most likely topic is topic[0]
tp:29586,fp:15240,fn:6352
3 Topics: precision = 0.660004015437125, recall = 0.8232282478644369.

--- Function LDAprocess takes 365.9847950935364 seconds ---


In [None]:
# 3 topics, tf-idf max_features = None, text removes useless keywords
LDAprocess(3,None, df_startups['Description_processed_no_stopwords_rm_uselesskeywords'], 'Industry', 'Information Technology')

Topic 0: 
engage co ltd china energy operate manufacture food gas equipment 

Topic 1: 
drug disease medical develop treatment cancer clinical cell development research 

Topic 2: 
solution software platform management business application mobile datum system user 

There are 4255 documents in topic[0].
There are 596 documents in topic[1].
There are 31087 documents in topic[2].
The most likely topic is topic[2]
tp:31087,fp:6838,fn:4851
3 Topics: precision = 0.8196751568844592, recall = 0.8649934611424914.

--- Function LDAprocess takes 344.4106903076172 seconds ---


In [None]:
# 10 topics, tf-idf max_features = 1000, text without moving useless keywords
LDAprocess(10,1000,df_startups['Description_processed_no_stopwords'], 'Industry2', 'Computer Software and Services')

Topic 0: 
company product food offer include brand operate online base store 

Topic 1: 
china co ltd engage mainly principally company technology service development 

Topic 2: 
wireless network company system power energy communication solution service solar 

Topic 3: 
company game video mobile user content online medium social platform 

Topic 4: 
company disease drug develop treatment medical cancer patient clinical therapeutic 

Topic 5: 
company service provide health payment insurance platform offer care base 

Topic 6: 
datum company software solution security platform provide intelligence management cloud 

Topic 7: 
service software company management solution marketing customer business provide application 

Topic 8: 
company product gas manufacture material equipment water system oil base 

Topic 9: 
company service investment property management estate operate provide financial asset 

There are 219 documents in topic[0].
There are 956 documents in topic[1].
There are 501

In [None]:
# 10 topics, tf-idf max_features = 1000, text removes useless keywords
LDAprocess(10,1000,df_startups['Description_processed_no_stopwords_rm_uselesskeywords'], 'Industry2', 'Computer Software and Services')

Topic 0: 
online operate accessory student brand school travel home car france 

Topic 1: 
drug disease develop cancer treatment cell therapeutic development pharmaceutical clinical 

Topic 2: 
network wireless solution communication internet mobile software system develop application 

Topic 3: 
mobile game video content user online medium platform social application 

Topic 4: 
software management solution datum platform cloud application states customer business 

Topic 5: 
china co ltd engage mainly principally business provision development beijing 

Topic 6: 
manufacture system equipment material industrial power use design industry manufacturing 

Topic 7: 
food loan restaurant credit online financial card operate business payment 

Topic 8: 
medical health care patient healthcare device hospital treatment system states 

Topic 9: 
energy gas investment oil project property operate management asset estate 

There are 454 documents in topic[0].
There are 106 documents in topic[1]

In [None]:
# 10 topics, tf-idf max_features = None, text without moving useless keywords
LDAprocess(10,None,df_startups['Description_processed_no_stopwords'], 'Industry2', 'Computer Software and Services')

Topic 0: 
company product gas oil energy include water base manufacture engage 

Topic 1: 
drug disease therapeutic develop company cancer treatment cell development clinical 

Topic 2: 
sdh mram microstructure sc woom kaishan gre kiln ringback crowdsourcing 

Topic 3: 
chocolate cookie snack flavor butter coconut potato sweet nut chip 

Topic 4: 
chicken food meal restaurant salad beef meat recipe menu pork 

Topic 5: 
company system technology product network wireless solution service develop mobile 

Topic 6: 
reviewer craftsman rollover nudge recordkeeping payphone lubricate bookmarking afs aldehyde 

Topic 7: 
company service provide platform software solution management base business offer 

Topic 8: 
medical company care health patient product base device diagnostic technology 

Topic 9: 
gan tft backlight sic accidental oled boat hud manganese cue 

There are 157 documents in topic[0].
There are 47 documents in topic[1].
There are 2 documents in topic[2].
There are 4 documents 

In [None]:
# 10 topics, tf-idf max_features = 1000, text removes useless keywords
LDAprocess(10,None,df_startups['Description_processed_no_stopwords_rm_uselesskeywords'], 'Industry2', 'Computer Software and Services')

Topic 0: 
gas operate oil energy property investment home project estate engage 

Topic 1: 
datum software platform security solution cloud management application intelligence customer 

Topic 2: 
solution management software business network system platform internet application financial 

Topic 3: 
online mobile platform game video content user medium application social 

Topic 4: 
intracellular ivf infertility li pet fertility fertilization hpv egg aptamer 

Topic 5: 
potash cashless creamer lunar jive clot kanban plywood visualisation spiral 

Topic 6: 
drug disease develop medical treatment cancer development cell therapeutic research 

Topic 7: 
health care patient healthcare medical insurance management solution platform software 

Topic 8: 
manufacture food equipment engage material ltd power co energy system 

Topic 9: 
mmorpg racing mmo massively oculus korean multiplayer tel tonic expressway 

There are 216 documents in topic[0].
There are 4399 documents in topic[1].
There a

In [6]:
res_columns = ["No. of topics","max_features", "remove useless keywords","precision", "recall"]
res_Matrix = [[3,1000,'No', 0.8519653743810839, 0.70380366732519],
              [3,1000,'Yes', 0.870909757887014, 0.6605915579175826],
              [3,'None','No', 0.7719800034819807, 0.8636578647152119],
              [3,'None','Yes', 0.6460593140635776, 0.5734160661120231],
              [10,1000,'No', 0.5061099796334012, 0.2896270396270396],
              [10,1000,'Yes', 0.6994556765163297, 0.34945609945609946],
              [10,'None','No',0.3763283438927733, 0.9653587153587153],
              [10,'None','Yes', 0.3507372492144066, 0.46976171976171976],
              ]
res_Matrix = pd.DataFrame(res_Matrix,columns=res_columns)
print("Table 1: LDA model Random_state = 42:")
res_Matrix

Table 1: LDA model Random_state = 42:


Unnamed: 0,No. of topics,max_features,remove useless keywords,precision,recall
0,3,1000.0,No,0.851965,0.703804
1,3,1000.0,Yes,0.87091,0.660592
2,3,,No,0.77198,0.863658
3,3,,Yes,0.646059,0.573416
4,10,1000.0,No,0.50611,0.289627
5,10,1000.0,Yes,0.699456,0.349456
6,10,,No,0.376328,0.965359
7,10,,Yes,0.350737,0.469762


# **Q3 Summary**

In this question, I used tf-idf as input text vectors. While I was setting the number of max_features, firstly I followed the sample code (max_features=1000), then I tried using all features to see which one is better. 


After that, I also attempted to remove some useless key words, I regard those key words that appeared in more than half of the  topics (I defined 'appear' as a word is in the top 15 of a topic's key words.) as useless key words because if most topic contains those words, that means it have few contribution helping with recognizing the topic. 

In addition, I have tried two random_states of LDA model, my research is based on the random_state=42(**Table 1**), but finally, because the performance of random_state=88 is better, I kept the result of randoms_state=88(**Table 2**).

Now let's take a look at the summary table above.

We can see while we use 3 topics, the precision is higher than 10 topics, that is because while we have 10 topics, some topics may close to other topics. If we only have 3 topics, it is much harder to give the wrong label as there is only limited but various labels.

In both 3 and 10 topics modeling, when we only use limited features, the precision is much better. That is because if we only need to build these topics, 1000 features is enough, the less important features may perform like noise, as a result, they have some sort of negative effect on the model.

Does remove the useless key words help? My answer is "When the precision is not good enough, removing key words is helpful to some extend(See line 5 and line 6 of **Table 1**). However, if the performance is good enough, then it doesn' help much(See line 1 and line 2 of **Table 1**), and it can even make negative contribution since the original performance is very good(See line 1 and line 2, line 5 and line 6 of **Table 2**)". 

We can see, in the 3-topic modeling, remove the useless keywords does not help. The reason is that the original precision is good enough in 3-topic modeling, it is hard to improve via simply removing some keywords, most of the documents have been labeled correctly, the wrong labled documents themselves may be harder to be classified. And for some documents, if we remove some of the keywords, then it may lose some features, then be misclassified. However, for the 10-topic modeling, it has more topics, so the result is more sensitive to the accuracy of the topic description, or in other words, they need the key words to be more identifiable, removing those common keywords does help, especially when the original precision is not good enough(line 5 and line 6 of **Table 1**). Additionally, the my method of selecting useless keywords may need to be improved, the quality of the useless keywords may have some infulence.

As for recall, I found that with unlimited features and do not remove useless keywords, the recall goes extremely high(line 7 of both **Table** 1 and **Table 2**). I checked the data of **Table 1**, and found that in these cases, most documents are allocated to one topic, that makes such a strange phenomenon happen. My deduction is that those extreme topic contains many common and frequent words. For example, let's take look at the classification result of line7 of **Table 1**:

Actually, there are 36 documents in topic[0].

Actually, there are 4 documents in topic[1].

Actually, there are 2 documents in topic[2].

Actually, there are 4 documents in topic[3].

Actually, there are 4 documents in topic[4].

Actually, there are 14909 documents in topic[5].

Actually, there are 1 documents in topic[6].

Actually, there are 63 documents in topic[7].

Actually, there are 73 documents in topic[8].

Actually, there are 347 documents in topic[9].

Almost all 'Computer Software and Services' documents are sent to topic[5]. However, when we take a look at the contents:


Topic 5: 
10 stage east carbon card play player identity professional point 

We found that those are the common words, which I detected and later removed in the extra processing. Although the recall rate is quite high, I regarded it as unreasonable, and after I removed the keywords, the recall rate drops to 0.47 which is more reasonable.

**In summary, using 1000 max_features of tf-idf is enough for modling, the unlimited featrues could introduce significant noise that has negative effect on the performance. And if the original performance is not good enough, we can try remove useless keywords to help improve the performance. It does not always help, and the quality of the useless keywords list will affect the final result.**







In [5]:
res_columns = ["No. of topics","max_features", "remove useless keywords","precision", "recall"]
res_Matrix_randomstate88 = [
              [3,1000,'No', 0.9035498073747936, 0.7309051448287376],
              [3,1000,'Yes', 0.8781838316722038, 0.6398898132947495],
              [3,'None','No', 0.660004015437125, 0.8232282478644369],
              [3,'None','Yes', 0.8196751568844592, 0.8649934611424914],
              [10,1000,'No', 0.6800712649529143, 0.346024346024346],
              [10,1000,'Yes', 0.6612659503022162, 0.51003626003626],
              [10,'None','No',0.39804227049589935, 0.8767806267806267],
              [10,'None','Yes', 0.30591304347826087, 0.3416860916860917],
              ]
              

res_Matrix_randomstate88 = pd.DataFrame(res_Matrix_randomstate88,columns=res_columns)
print("Table 2: LDA model Random_state = 88:")
res_Matrix_randomstate88

Table 2: LDA model Random_state = 88:


Unnamed: 0,No. of topics,max_features,remove useless keywords,precision,recall
0,3,1000.0,No,0.90355,0.730905
1,3,1000.0,Yes,0.878184,0.63989
2,3,,No,0.660004,0.823228
3,3,,Yes,0.819675,0.864993
4,10,1000.0,No,0.680071,0.346024
5,10,1000.0,Yes,0.661266,0.510036
6,10,,No,0.398042,0.876781
7,10,,Yes,0.305913,0.341686
