In [1]:
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import os
from nltk.corpus import stopwords

# lda
import lda

# Gsdmm
from gsdmm import MovieGroupProcess


# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize
from pprint import pprint

# gsdmm
from gsdmm import MovieGroupProcess


import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocessing(directory='comments1k'):
    """Load the comment files and return them as a cleaned DataFrame.

    Every ``.txt`` file found in ``directory`` becomes one row. The text is
    cleaned in this order: HTML entities replaced by a space, HTML tags
    removed, lower-cased, punctuation replaced by a space, and NLTK English
    stop words dropped.

    Parameters
    ----------
    directory : str, default 'comments1k'
        Folder that holds the ``.txt`` comment files.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``Filename`` (name of the source file) and ``comments``
        (the cleaned text, tokens joined by single spaces).
    """
    comments = []
    filenames = []
    # os.listdir gives no ordering guarantee; sort so that row positions are
    # the same on every platform and every run.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            # Decode explicitly instead of relying on the platform default
            # encoding; undecodable bytes are replaced rather than aborting.
            path = os.path.join(directory, filename)
            with open(path, encoding='utf-8', errors='replace') as file:
                comments.append(file.read().strip())
                filenames.append(filename)
    df = pd.DataFrame({'Filename': filenames, 'comments': comments})

    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently skip the cleaning.
    df['comments'] = df['comments'].str.replace(r'&\w+;', " ", regex=True)
    df['comments'] = df['comments'].str.replace(r'<.*?>', '', regex=True)

    df['comments'] = df['comments'].str.lower()

    df['comments'] = df['comments'].str.replace(r'[^\w\s]', ' ', regex=True)

    # Load the stop words from NLTK
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    df['comments'] = df['comments'].apply(
        lambda text: " ".join(w for w in text.split() if w not in stop_words)
    )

    return df


In [3]:
def matrix_1(df):
    """Return the dense bag-of-words count matrix for ``df['comments']``.

    A default ``CountVectorizer`` is fitted on the comments and the resulting
    sparse document-term matrix is converted to a dense array.
    """
    corpus = df['comments'].values.tolist()
    counts = CountVectorizer().fit_transform(corpus)
    return counts.toarray()

In [4]:
def create_vocab(df):
    """Return the feature names a default ``CountVectorizer`` learns from
    ``df['comments']``."""
    corpus = df['comments'].values.tolist()
    fitted = CountVectorizer().fit(corpus)
    return fitted.get_feature_names_out()

In [5]:
def lda(df, n_topics=10, n_iter=1500, n_top_words=8, random_state=1):
    """Fit an LDA topic model on ``df['comments']`` and label each document.

    The top words of every topic are printed, one line per topic, in the form
    ``Topic <i>: <word> <word> ...``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Filename`` and a ``comments`` column.
    n_topics : int, default 10
        Number of topics to fit.
    n_iter : int, default 1500
        Number of sampling iterations passed to ``lda.LDA``.
    n_top_words : int, default 8
        How many highest-weighted words to print per topic.
    random_state : int, default 1
        Seed passed to ``lda.LDA``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``filename``, ``Title`` (the
        comment text) and ``Top Topic`` (index of the highest-weighted topic).
    """
    # This function's own name shadows the top-level ``lda`` module, so the
    # package is imported locally under a different name.
    import lda as lda_lib

    matrix = matrix_1(df)
    vocab = np.asarray(create_vocab(df))

    model = lda_lib.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_state)
    model.fit(matrix)

    for i, topic_dist in enumerate(model.components_):
        # argsort is ascending, so walk the tail backwards for the top words.
        topic_words = vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # Build the result once from whole columns: this avoids re-creating the
    # frame on every iteration, works for any index on ``df`` (no label-based
    # ``df['Filename'][i]`` lookups) and is defined even for zero documents.
    final_df = pd.DataFrame({
        'filename': df['Filename'].tolist(),
        'Title': df['comments'].tolist(),
        'Top Topic': model.doc_topic_.argmax(axis=1),
    })
    return final_df

In [None]:
# TODO: this cell held only a dangling `def` keyword, which raises a
# SyntaxError under "Restart & Run All". Write the intended function here or
# delete the cell.

<b> Question 1.1 - Use  Latent  Dirichlet  Allocation  (LDA)  method  to  discover  latent  topics  in  the  dataset  with  the number  of  topics  as  10.  Output  the  top  8  words  for  each  topic.  For  the  document  “0_9.txt”  and “1_7.txt”, what topics are assigned to them? Do they make sense?</b>

In [7]:
# Clean the corpus, fit LDA and save the per-document topic assignments.
df = preprocessing()
X = lda(df)
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('output', exist_ok=True)
X.to_csv('output/lda.csv', index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saima_x4lzx52\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:lda:n_documents: 996
INFO:lda:vocab_size: 15901
INFO:lda:n_words: 121324
INFO:lda:n_topics: 10
INFO:lda:n_iter: 1500
INFO:lda:<0> log likelihood: -1439176
INFO:lda:<10> log likelihood: -1163684
INFO:lda:<20> log likelihood: -1138386
INFO:lda:<30> log likelihood: -1125362
INFO:lda:<40> log likelihood: -1116881
INFO:lda:<50> log likelihood: -1111641
INFO:lda:<60> log likelihood: -1107484
INFO:lda:<70> log likelihood: -1103336
INFO:lda:<80> log likelihood: -1100923
INFO:lda:<90> log likelihood: -1097638
INFO:lda:<100> log likelihood: -1095897
INFO:lda:<110> log likelihood: -1093709
INFO:lda:<120> log likelihood: -1091735
INFO:lda:<130> log likelihood: -1090065
INFO:lda:<140> log likelihood: -1087694
INFO:lda:<150> log likelihood: -1087355
INFO:lda:<160> log likelihood: -1085475
INFO:lda:<170> log likelihood: -1083549
I

Topic 0: brosnan man david robert life brother river fantasy
Topic 1: stewart jeff ned james gannon kelly western john
Topic 2: film one story two life man well character
Topic 3: war world young miike yokai kids film school
Topic 4: game carla chess paul french luzhin alexandre read
Topic 5: star series show luke wars episode new battle
Topic 6: school high ramones matthau burns rock best comedy
Topic 7: christmas scrooge one scott von version europa trier
Topic 8: movie one like good see film great really
Topic 9: davies great show comedy people marion star price


In [8]:
# Display the per-document result table returned by lda().
X

Unnamed: 0,filename,Title,Top Topic
0,0_9.txt,bromwell high cartoon comedy ran time programs...,6
1,100_7.txt,scott bartlett offon nine minutes pure crazine...,2
2,101_8.txt,imdb lists 1972 reason sources seen including ...,8
3,102_10.txt,first heard film 20 years ago kid grade school...,8
4,103_7.txt,read comment decided watch movie first cast sp...,8
...,...,...,...
991,997_7.txt,agree posts comedy drama leaned little much to...,8
992,998_7.txt,really interesting movie action movie comedy m...,8
993,999_10.txt,amazed movie others average 5 stars lower crap...,8
994,99_8.txt,christmas together actually came time raised j...,8


In [9]:
# Look the document up by filename rather than by row position, which depends
# on the order in which the files were listed.
X.loc[X['filename'] == '0_9.txt'].iloc[0]

filename                                               0_9.txt
Title        bromwell high cartoon comedy ran time programs...
Top Topic                                                    6
Name: 0, dtype: object

In [10]:
# Look the document up by filename instead of the magic row number 111, which
# only holds for one particular file ordering.
X.loc[X['filename'] == '1_7.txt'].iloc[0]

filename                                               1_7.txt
Title        like adult comedy cartoons like south park nea...
Top Topic                                                    8
Name: 111, dtype: object

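1.1 Answer:

- Document "0_9.txt" is assigned Topic 6 and document "1_7.txt" is assigned Topic 8 (see the two outputs above). Topic 6 ("school", "high", "comedy") fits "0_9.txt", whose text begins "bromwell high cartoon comedy"; Topic 8 is the generic movie-opinion topic, so it is a plausible but not very informative match for "1_7.txt".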

- Topic 0 seems to be about Pierce Brosnan, since "brosnan" is its top word. The other words ("man", "david", "robert", "life", "brother", "river") appear to relate to the characters and stories of the films being reviewed, and "fantasy" points to an imaginary or fantastical element.
- Topic 1 seems to be related to actors and westerns because it is dominated by names such as "stewart", "jeff", "ned", "james", "gannon", "kelly", and "john", together with the genre word "western".
- Topic 2 appears to be about films and storytelling in general because it contains words such as "film", "one", "story", and "two". It also includes words related to characters and how they are portrayed, such as "life", "man", "well", and "character".
- Topic 3 seems to be related to Japanese cinema because it contains words such as "miike", "yokai", "kids", "film", and "school". It also includes the more general words "war", "world", and "young".
- Topic 4 appears to be about chess because it contains game-related words such as "game" and "chess", together with "carla", "paul", "french", and "read". It also contains names that appear to belong to the players or characters, such as "luzhin" and "alexandre".
- Topic 5 seems to be centered around the Star Wars series, as it includes words related to characters and plots such as "Luke," "Wars," "Episode," and "Battle."
- Topic 6 seems to revolve around high school, as it contains "school" and "high" together with words such as "ramones", "matthau", "burns", "rock", and "best". It also contains the genre word "comedy".
- Topic 7 seems to center around the movie A Christmas Carol, as it contains words related to the story such as "christmas", "scrooge", "scott", and "version". The words "von", "trier", and "europa" are also included; they do not fit the Christmas theme and appear to come from reviews of a different film.
- Topic 8 seems to be about general opinions of movies because it contains words about movie quality, such as "one", "like", "good", and "see". It also includes words related to entertainment, such as "movie", "film", "great", and "really".
- Topic 9 appears to be centered around comedy involving Davies, as it contains words such as "davies", "marion", "great", "show", "comedy", "people", and "star". Words that look related to the cast, e.g. "price", are also included.




**Question 1.2: Because of the data sparsity, short text may not provide enough context to adequately inform topic modeling. Try Biterm, GSDMM or another short text topic model for our dataset. Compare the topic modelling results with LDA; is there any improvement?**

#### Using GSDMM

In [13]:
# Re-run the cleaning, fit the short-text topic model and export its result.
# NOTE(review): biterm() is not defined in the visible part of this notebook.
# The recorded run ended with "AttributeError: 'NoneType' object has no
# attribute 'to_csv'", i.e. biterm(df) returned None; it has to return the
# result DataFrame before the export below can work.
df = preprocessing()
Y = biterm(df)
Y.to_csv('output/biterm.csv', index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saima_x4lzx52\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In stage 0: transferred 888 clusters with 3 clusters populated
In stage 1: transferred 12 clusters with 2 clusters populated
In stage 2: transferred 0 clusters with 2 clusters populated
In stage 3: transferred 0 clusters with 2 clusters populated
In stage 4: transferred 0 clusters with 2 clusters populated
Number of documents per topic : [702   0   0   0   0   0   0   0   0 294]
Most important clusters (by number of docs inside): [0 9 8 7 6 5 4 3 2 1]

Cluster 0 : [(0, 11118343), (1, 39049), (2, 3847), (3, 829), (4, 271), (5, 86), (6, 35), (7, 26), (8, 8), (9, 5), (11, 2), (10, 1)]

Cluster 9 : [(0, 4620527), (1, 45284), (2, 6027), (3, 1687), (4, 615), (5, 299), (6, 171), (7, 93), (8, 65), (9, 36), (10, 27), (11, 18), (13, 10), (12, 8), (14, 5), (15, 4), (17, 4), (18, 3), (20, 3), (16, 2)]

Cluster 8 : []

Cluster 7 : []

Cluster 6 : []

Cluster 5 : []

Cluster 4 : []

Cluster 3 : []

Cluster 2 : []

Cluster 1 : []


AttributeError: 'NoneType' object has no attribute 'to_csv'

In [None]:
# Display the value returned by biterm(df).
# NOTE(review): in the recorded run the preceding cell failed because Y was
# None, so nothing is shown here until biterm() returns a DataFrame.
Y

<b> Question 2.1 When there is no (enough) labelled corpus to train a machine learning based NLP model, we need to 
create a training text dataset as golden standard through manual annotation. Choose a text annotation 
tool to finish the following two text annotation tasks: 
 
<i> Entity  Annotation:  “Barack  Obama  was  the  44th  President  of  the  United  States.  He  was  born  in 
Hawaii and studied law at Harvard University.”  
Annotation Results:  
      Barack Obama PERSON 
      44th CARDINAL 
      the United States GPE 
      Hawaii GPE 
      Harvard University ORG </i>
 
<i> Sentiment Annotation: “De Niro has the ability to make every role he portrays into acting gold. He 
gives a great performance in this film and there is a great scene where he has to take his father to a 
home for elderly people because he can't care for him anymore that will break your heart.  I will say 
you won't see much bette acting anywhere.”  
Annotation Results: Positive </i>
 </b>
 

**Answer** 
#### For this task, I used label studio to perform annotation.

- Label Studio: It is an open-source application for data annotation that supports a variety of annotation kinds, including object identification, named entity recognition, and text classification.
- Source: https://github.com/heartexlabs/label-studio/
- Label Studio can be installed in an Anaconda environment with the following steps:

- step 1: conda create --name label-studio
- step 2: conda activate label-studio
- step 3: pip install label-studio

<i>After installation, open Label Studio in a browser (e.g. Google Chrome) and sign up for a new account.</i>
<i>Click <b>New Project</b> on the landing page.</i>

<img src="output/2-1/1.png"/>


<i>A popup window like the one below appears; enter the project name.</i>

<img src = "output/2-1/2.png"/>

<i>Now click on <b>Data Import</b> and import the desired text file.</i>

<img src ="output/2-1/3.png"/>

<i>Now click on <b>Labeling Setup</b>, select <b>Natural Language Processing</b>, and then select the <b>Named Entity Recognition</b> tab.</i>

<img src = "output/2-1/entity.png"/>

 - <i>You will be taken to a new page; remove all the pre-existing labels and <b>add the desired labels</b>.</i>
 - <i>In the <b>Configure data</b> dropdown, select the imported file.</i>
 - <i>After completing these two steps, click the <b>Save</b> button.</i>

<img src= "output/2-1/4.png"/>

<i>After clicking the Save button, you will be redirected to a new page for labeling the text.</i>
<img src = "output/2-1/new.png">

<i>Select the ID of the task to label; you will be redirected to a new page for labeling.</i>
<b>Follow these steps for labeling:
    select the label, highlight the desired text, and press Enter.</b>

<img src = "output/2-1/k.png">
<b>Now repeat all the steps for the remaining labels</b>
<img src = "output/2-1/5.png">

**Repeat the same process for sentiment annotation, up to and including the data import step.**

<img src ="output/2-1/6.png">

<img src = 'output/2-1/7.png'>

<i>Now, in the labeling setup, select <b>Natural Language Processing</b> and then <b>Text Classification</b>.</i>

<img src= 'output/2-1/text-cla.png'>

<i> After selecting, you will be redirected to the next page. </i> 
<i> Click on the <b>Save</b> button. </i>

<img src="output/2-1/8.png">

<i>You will be redirected to a new page. Select the task ID and choose the type of <b>sentiment</b>.</i>
<i>Click the <b>Submit</b> button to save the annotation.</i>

<img src = "output/2-1/9.png">

<b> Thus, Entity and Sentiment Annotation is done by label-studio. </b>