# preprocessing

## 1.0 Create tokenizer

In [14]:
from nltk import pos_tag, PorterStemmer, NaiveBayesClassifier
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet

# def get_wordnet_pos(treebank_tag):
# 
#     if treebank_tag.startswith('J'):
#         return wordnet.ADJ
#     elif treebank_tag.startswith('V'):
#         return wordnet.VERB
#     elif treebank_tag.startswith('N'):
#         return wordnet.NOUN
#     elif treebank_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN

# tokenize words
# remove stop words
# stemming
stop_words = set(stopwords.words('english'))
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = PorterStemmer()
        self.tokenizer = RegexpTokenizer(r'\b(?:(?!www\.|http|\.com)[a-z]{2,})\b')
    def __call__(self, articles):
        articles = articles.lower()
        tokens = [self.wnl.stem(x) for x in self.tokenizer.tokenize(articles) if x not in stop_words]
        return tokens

tfidf_vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None)
bow_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## 2.0 Word2Vec

In [15]:
from nltk import sent_tokenize
import time
import json
import glob
import os

# Read each JSONL file and append to the list
def create_sentences():
    for file in glob.glob('data/word2vec/*.jsonl'):

        print("Processing file: " + file)
        sentences = set()
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line)
                    if 'timestamp' in data and 'text' in data:
                        timestamp = pd.to_datetime(data['timestamp'], unit='ms')
                        if timestamp < pd.Timestamp('2019-01-01'):
                            sentences.update(set(sent_tokenize(data['text'])))
                except json.JSONDecodeError:
                    print(f"Skipping bad line in file {file}")
    
        with open('data/word2vec_processed/sentences.txt', 'a', encoding='utf-8') as f:
            for sentence in sentences:
                if len(sentence) > 2:
                    f.write(sentence + '\n')

if not os.path.exists('data/word2vec_processed/sentences.txt'):
    create_sentences()

Train Word2Vec model

In [16]:
from gensim.models import Word2Vec
# Skip-gram model (sg = 1)
size = 500
window = 3
min_count = 10
workers = 14
sg = 1
# ========= Train the Word2Vec model =========
word2vec_model_file =  'models/word2vec_' + str(size) + '.model'
print("Training Word2Vec Model")
start_time = time.time()

# Stream the reviews one by one
class MyCorpus:
    def __iter__(self):
        with open('data/word2vec_processed/sentences.csv', 'r') as f:
            for line in f:
                print(line)
                print(LemmaTokenizer()(line))
                yield LemmaTokenizer()(line)
            
# Train the Word2Vec Model
sentences = MyCorpus()
w2v_model = Word2Vec(sentences, min_count = min_count, vector_size = size, workers = workers, window = window, sg = sg)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

Training Word2Vec Model


FileNotFoundError: [Errno 2] No such file or directory: 'data/word2vec_processed/sentences.csv'

## 3.0 W2VLDA

### Create embedding (both global and target domain)

In [None]:
blabla

### Create brown clusters

In [32]:
from brown_clustering import BigramCorpus, BrownClustering
from nltk import sent_tokenize
import pandas as pd

df = pd.read_csv('data/amazon_reviews.txt', sep='\t')

reviews = df['REVIEW_TEXT'].tolist()
sentences = [sent_tokenize(review) for review in reviews]
flat_sentences = [LemmaTokenizer()(sentence) for sublist in sentences for sentence in sublist]
# remove empty sentences
flat_sentences = [sentence for sentence in flat_sentences if sentence]

# create a corpus
corpus = BigramCorpus(flat_sentences, alpha=0.9, min_count=1)
corpus.ranks()
# create a clustering
clustering = BrownClustering(corpus, m=500) # m is the number of clusters

# train the clustering
clusters = clustering.train()

100%|██████████| 21351/21351 [10:16<00:00, 34.65it/s]


In [33]:
    print(*clusters, sep='\n')

['br']
['use']
['one']
['great']
['like']
['work']
['good']
['love']
['get']
['look']
['product']
['would']
['realli']
['well']
['time']
['qualiti']
['make']
['price']
['also']
['need']
['much']
['easi']
['recommend']
['nice']
['buy']
['littl']
['bought']
['want']
['go']
['even']
['fit']
['purchas']
['tri']
['got']
['thing']
['set']
['light']
['made']
['year']
['day']
['back']
['watch']
['take']
['better']
['first']
['size']
['put']
['bag']
['come']
['lot']
['perfect']
['still']
['way']
['feel']
['see']
['think']
['could']
['color']
['game']
['two']
['play']
['small']
['keep']
['best']
['sound']
['review']
['seem']
['order']
['help']
['new']
['book']
['long']
['know']
['give']
['say']
['right']
['differ']
['problem']
['expect']
['case']
['last']
['find']
['batteri']
['bit']
['tv']
['enough']
['old']
['pretti']
['mani']
['around']
['read']
['sinc']
['amazon']
['happi']
['hold']
['someth']
['month']
['wear']
['pictur']
['without']
['far']
['anoth']
['item']
['everi']
['start']
['comfort'

In [20]:
import json
codes = clustering.codes()

with open('data/brown-clusters.txt', 'w') as file:
    for key, value in codes.items():
        file.write(f'{value} {key}\n')

### Find topics with LDA

In [3]:
lda

NameError: name 'lda' is not defined

### Combine all in W2VLDA

In [4]:
asdasd

NameError: name 'asdasd' is not defined