In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Load the cleaned question dataset and preview its first rows.
data_path = 'Data/cleaned_data_2.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Head,Body,Tags,Text,Tags Count,Text_Cleaned
0,brain segmentation to 3d model,my goal is to take a dataset of brain tumor s...,"['computer-vision', 'python']",brain segmentation to 3d model my goal is to ...,2,brain segmentation 3d model goal dataset brain...
1,active learning regression with random forest,i have a dataset of about 8k points and i am ...,"['machine-learning', 'regression', 'uncertaint...",active learning regression with random forest ...,5,active learn regression random forest dataset ...
2,comparing reinforcement learning models,i am currently completing my thesis on optimi...,"['reinforcement-learning', 'policy-gradients',...",comparing reinforcement learning models i am ...,4,compare reinforcement learning model currently...
3,why good model that performs great on holdout ...,i have this binary regression model that has ...,"['deep-learning', 'deep-neural-networks', 'pre...",why good model that performs great on holdout ...,5,good model perform great holdout validation da...
4,what are reservoir computers used for today,reservoir computers were very popular in the ...,"['machine-learning', 'recurrent-neural-network...",what are reservoir computers used for today ...,3,reservoir computer today reservoir computer po...


In [None]:
# The CSV stores each question's tags as the string repr of a Python list
# (e.g. "['computer-vision', 'python']"), so parse them back into real lists.
# The isinstance guard keeps this cell re-runnable: literal_eval raises
# ValueError when handed a value that has already been parsed into a list.
df['Tags'] = df['Tags'].apply(lambda x: literal_eval(x) if isinstance(x, str) else x)

# Flatten the per-question tag lists into one list holding every tag occurrence.
all_tags = [item for sublist in df['Tags'].values for item in sublist]
print(len(all_tags))

In [None]:
from collections import Counter

# Distinct tags first, then how often each tag occurs across all questions.
my_set = set(all_tags)
unique_tags = [*my_set]
print(len(unique_tags))

counts = Counter(all_tags)
counts.most_common(20)

In [None]:
# Keep only the names of the 20 most frequent tags; these form the label set.
frequencies_words = counts.most_common(20)
tags_features = [tag for tag, _count in frequencies_words]

In [None]:
def most_common(tags, allowed=None):
    """Return the entries of ``tags`` that belong to the allowed label set.

    Order and duplicates of ``tags`` are preserved.

    Parameters
    ----------
    tags : iterable of str
        Tags attached to one question.
    allowed : collection of str, optional
        Tags to keep. When omitted, the notebook-level ``tags_features``
        list is used, which matches the original single-argument behaviour.

    Returns
    -------
    list of str
        The retained tags; empty when none of ``tags`` is allowed.
    """
    if allowed is None:
        # Resolved at call time so the function picks up the current global.
        allowed = tags_features
    return [tag for tag in tags if tag in allowed]

# Restrict every question to the retained tags; a question left with no tag
# gets None so that it can be dropped with dropna afterwards.
df['Tags'] = (
    df['Tags']
    .apply(most_common)
    .apply(lambda kept: kept if kept else None)
)

In [None]:
# Discard the questions whose Tags value is missing, then report the new size.
df = df.dropna(subset=['Tags'])
df.shape

# We are losing 10k rows of data, but it is for the greater good

In [None]:
# Features: the cleaned question text. Targets: the filtered tag lists.
X, y = df['Text_Cleaned'], df['Tags']

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode the tag lists as a multi-label indicator matrix.
# NOTE(review): the name `bin` shadows Python's built-in bin() for the rest of
# the notebook; it is kept here because later cells may refer to it.
bin = MultiLabelBinarizer().fit(y)
y_bin = bin.transform(y)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the text BEFORE fitting the vectorizer, so the vocabulary and IDF
# weights are learned from the training questions only. Fitting on the whole
# corpus first leaks test-set statistics into the features.
# NOTE(review): this vectorizes the `Text` column rather than `Text_Cleaned`
# (which the earlier `X` pointed at) - confirm that this is intended.
text_train, text_test, y_train, y_test = train_test_split(
    df['Text'], y_bin, test_size=0.3, random_state=0
)

tfidf = TfidfVectorizer(analyzer='word', max_features=3000, ngram_range=(1, 3), stop_words='english')
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Full-corpus matrix under the train-fitted vectorizer, kept so that `X` and
# the shape printout remain available as before.
X = tfidf.transform(df['Text'])
print(X.shape, y_bin.shape)

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score, f1_score

# Baseline estimators; each one is wrapped in a one-vs-rest classifier, fitted
# on the training split and scored on the test split.
lr, svc = LogisticRegression(), LinearSVC()
mnb, bnb = MultinomialNB(), BernoulliNB()
sgd = SGDClassifier()

for classifier in (lr, svc, sgd, mnb, bnb):
    clf = OneVsRestClassifier(classifier).fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    model_name = type(classifier).__name__
    print(model_name, accuracy_score(y_test, y_pred))
    print(model_name, f1_score(y_test, y_pred, average='micro'))

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the Keras word index from the cleaned question text.
token = Tokenizer()
token.fit_on_texts(list(df['Text_Cleaned']))

In [None]:
# Report the number of counted words, then derive the embedding vocabulary
# size as one more than the number of indexed words.
# NOTE(review): the extra slot is presumably for index 0 used by padding - confirm.
print(len(token.word_counts))
vocab_size = 1 + len(token.word_index)
print(vocab_size)

In [None]:
# Map every cleaned question to its sequence of integer word ids.
cleaned_texts = df['Text_Cleaned'].to_list()
encoded_text = token.texts_to_sequences(cleaned_texts)

In [None]:
# Bring every encoded question to a fixed length of max_length ids;
# shorter sequences are padded at the end.
max_length = 250
X = pad_sequences(sequences=encoded_text, maxlen=max_length, padding='post')

In [None]:
# Check that the padded inputs and the label matrix have matching row counts.
(X.shape, y_bin.shape)

In [None]:
# Hold out 30% of the padded sequences for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bin,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Embedding -> bidirectional LSTM -> dense head with one sigmoid output per tag.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(100, activation='relu'),
    Dropout(0.2),
    Dense(y_bin.shape[1], activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled network.
model.summary()

In [None]:
# Stop once val_loss has not improved for 5 epochs, and keep the best
# checkpoint (by val_loss) on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
checkpoint = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, checkpoint]

In [None]:
# Train with 10% of the training split held out for validation; the
# callbacks monitor val_loss.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=20,
    validation_split=0.1,
    callbacks=callbacks,
)

# Word2Vec Model

In [5]:
# Load spaCy's 'en_core_web_trf' pipeline, which preprocess_text below uses
# to lemmatize text and drop stop words.
# NOTE(review): preprocess_text is not called in any visible cell - confirm
# that loading this pipeline is still needed.
import spacy
nlp = spacy.load('en_core_web_trf')

def preprocess_text(text):
    """Return the lemmas of the non-stop-word tokens spaCy finds in ``text``.

    Relies on the notebook-level ``nlp`` pipeline.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [8]:
# Whitespace-tokenize every cleaned question; Word2Vec expects lists of tokens.
list_of_sent = [cleaned.split() for cleaned in df['Text_Cleaned'].tolist()]

In [13]:
# Train Word2Vec word embeddings (100-dimensional, context window of 3) on
# the tokenized questions.
model_w2v = Word2Vec(
    sentences=list_of_sent,
    vector_size=100,
    window=3,
    min_count=1,
    sg=1,
    workers=4,
)

In [15]:
# Sanity check: nearest neighbours of 'pip' in the learned embedding space.
model_w2v.wv.most_similar('pip', topn=10)

[('install', 0.8860217332839966),
 ('spyder', 0.8797866106033325),
 ('cli', 0.8775292038917542),
 ('python37', 0.8757234811782837),
 ('rstudio', 0.8720542788505554),
 ('sudo', 0.8712289333343506),
 ('installation', 0.8690873980522156),
 ('rpy2', 0.8649067878723145),
 ('baselines3', 0.8648353219032288),
 ('installer', 0.8644696474075317)]

In [19]:
import string
from nltk import word_tokenize
from nltk.corpus import stopwords


def preprocess_and_tokenize(text):
    """Lower-case ``text``, strip punctuation and return its non-stop-word tokens.

    Punctuation (``string.punctuation``) is removed before tokenizing with
    NLTK's ``word_tokenize``; English stop words are then filtered out.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    normalized = text.translate(punctuation_stripper).lower()

    candidates = word_tokenize(normalized)

    english_stopwords = set(stopwords.words('english'))
    return [tok for tok in candidates if tok not in english_stopwords]

def tag_question(question, model, topn=5):
    """Return the vocabulary entries closest to a question's mean embedding.

    The question is tokenized with ``preprocess_and_tokenize``; the vectors of
    the tokens known to ``model`` are averaged, and the ``topn`` nearest
    entries of ``model.wv`` are looked up with ``most_similar``.

    Note that the candidates are whatever keys ``model.wv`` holds: the lookup
    is not restricted to the tag set, so the results are similar *words*.

    Parameters
    ----------
    question : str
        Raw question text.
    model : object exposing ``.wv``
        Trained embedding model, e.g. a gensim ``Word2Vec``.
    topn : int, default 5
        Number of neighbours to return.

    Returns
    -------
    list of (str, float)
        ``(key, similarity)`` pairs as returned by ``most_similar``; an empty
        list when no token of the question is in the model's vocabulary.
    """
    question_tokens = preprocess_and_tokenize(question)

    # Only tokens the model knows can contribute a vector.
    vectors = [model.wv[word] for word in question_tokens if word in model.wv]
    if not vectors:
        # Nothing to average. The original code raised ZeroDivisionError for
        # an empty token list and passed a scalar 0.0 to most_similar when
        # every token was out of vocabulary.
        return []

    # Divide by the number of vectors actually summed (not by the total token
    # count), so that out-of-vocabulary tokens do not shrink the mean.
    question_vector = sum(vectors) / len(vectors)

    similar_tags = model.wv.most_similar(positive=[question_vector], topn=topn)

    return similar_tags

In [18]:
question = "How to train a neural network for image classification?"
tags = tag_question(question, model_w2v,3)
print(tags)

[('siamese', 0.9015004634857178), ('ffnn', 0.8987703323364258), ('layered', 0.8956176042556763)]
