In [None]:
import pandas as pd
from ast import literal_eval

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so any deprecation or convergence
# warnings the libraries used below might emit are hidden as well.
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Load the dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Data/cleaned_data_2.csv')
df.head(n=5)

In [None]:
# Each 'Tags' cell is a string holding a Python literal; literal_eval turns it
# back into an object (iterated below as a collection of tags).
df['Tags'] = df['Tags'].apply(literal_eval)

# Flatten the per-row tag collections into one list, duplicates included.
all_tags = [tag for row_tags in df['Tags'].values for tag in row_tags]
print(len(all_tags))

In [None]:
from collections import Counter

# Distinct tags across the whole dataset, and how many there are.
my_set = {*all_tags}
unique_tags = [*my_set]
print(len(unique_tags))

# Occurrence count per tag; the cell displays the 20 most frequent ones.
counts = Counter()
counts.update(all_tags)
counts.most_common(20)

In [None]:
# The 20 most frequent (tag, count) pairs ...
frequencies_words = counts.most_common(20)
# ... and just the tag names, used as the label set to keep.
tags_features = [tag for tag, _ in frequencies_words]

In [None]:
def most_common(tags, allowed=None):
    """Keep only the tags that belong to the retained tag set.

    Parameters
    ----------
    tags : iterable
        Tags attached to a single row.
    allowed : container, optional
        Tags to keep. When omitted, the notebook-level ``tags_features``
        is used (looked up at call time, as before).

    Returns
    -------
    list
        The members of ``tags`` that are in ``allowed``, in their original
        order and with duplicates preserved. Empty if none match.
    """
    if allowed is None:
        # Fall back to the global so existing one-argument calls keep working.
        allowed = tags_features
    return [tag for tag in tags if tag in allowed]

# Restrict each row to the retained tags, then replace rows left with no tag
# by None so that they show up as missing values.
df['Tags'] = (
    df['Tags']
    .apply(most_common)
    .apply(lambda kept: kept or None)
)

In [None]:
# Remove, in place, the rows whose 'Tags' value is missing (the rows set to
# None by the filtering cell above), then display the remaining shape.
df.dropna(subset=['Tags'], inplace=True)
df.shape

# Original author's note: this step loses about 10k rows of data, which is
# accepted as a deliberate trade-off for keeping only the most common tags.

In [None]:
# Model input (cleaned text) and target (filtered tag lists).
# X is reassigned by the vectorizing cells further down.
X, y = df['Text_Cleaned'], df['Tags']

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Encode the per-row tag lists in y as a binary indicator matrix with one
# column per distinct tag.
# NOTE(review): the name `bin` shadows Python's built-in bin(); it is left
# unchanged because cells outside this view may refer to it.
bin = MultiLabelBinarizer()
y_bin = bin.fit_transform(y)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features for the classical baselines: word-level TF-IDF with the vocabulary
# capped at 1000 terms.
# The cleaned text column is vectorized here -- the same column selected as X
# earlier and fed to the Keras tokenizer later -- instead of the raw 'Text'
# column, so both model families see the same preprocessed input.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its vocabulary and IDF weights are also learned from rows that end
# up in the test set; consider fitting on the training portion only.
tfidf = TfidfVectorizer(analyzer='word', max_features=1000)
X = tfidf.fit_transform(df['Text_Cleaned'])

In [None]:
# Sanity check: feature matrix and label matrix must have the same row count.
(X.shape, y_bin.shape)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bin,
    random_state=0,
    test_size=0.3,
)

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

# Baseline linear models. LinearSVC and SGDClassifier use randomness while
# fitting, so they are seeded (same seed as the train/test split) to make the
# printed scores reproducible from one run to the next.
lr = LogisticRegression()
svc = LinearSVC(random_state=0)
sgd = SGDClassifier(random_state=0)

for classifier in [lr, svc, sgd]:
    # One independent binary classifier is trained per column of y_train.
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    model_name = classifier.__class__.__name__
    # First line: accuracy_score, which on a multilabel indicator matrix only
    # counts rows whose whole label set is predicted exactly.
    print(model_name, accuracy_score(y_test, y_pred))
    # Second line: micro-averaged F1 over all (row, tag) decisions.
    print(model_name, f1_score(y_test, y_pred, average='micro'))

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Build the word index from the cleaned text of every remaining row.
token = Tokenizer()
token.fit_on_texts(df['Text_Cleaned'].tolist())

In [None]:
# Number of distinct words the tokenizer has counted.
print(len(token.word_counts))

# One more than the number of indexed words; passed to the Embedding layer
# as its input dimension when the model is built.
vocab_size = 1 + len(token.word_index)
print(vocab_size)

In [None]:
# Convert every cleaned text into its sequence of integer word indices.
encoded_text = token.texts_to_sequences(df['Text_Cleaned'].tolist())

In [None]:
# Fixed sequence length expected by the network.
max_length = 250

# Bring every sequence to max_length, padding short ones at the end.
# Truncation of longer sequences is left at the library default.
X = pad_sequences(
    encoded_text,
    padding='post',
    maxlen=max_length,
)

In [None]:
# Sanity check: padded sequences and label matrix must have the same row count.
(X.shape, y_bin.shape)

In [None]:
# 70/30 train/test split of the padded sequences (seed 42 for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_bin,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Network: embedding -> bidirectional LSTM -> dense hidden layer with dropout
# -> one sigmoid output per column of y_bin.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)),
    Dense(100, activation='relu'),
    Dropout(0.2),
    Dense(y_bin.shape[1], activation='sigmoid'),
])

# Each output is scored as its own binary decision.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Both callbacks watch the validation loss: one stops training after 5 epochs
# without improvement, the other writes the best model so far to disk.
callbacks = [
    EarlyStopping(patience=5, monitor='val_loss'),
    ModelCheckpoint(
        filepath='best_model.h5',
        save_best_only=True,
        monitor='val_loss',
    ),
]

In [None]:
# Train for at most 20 epochs with 10% of the training rows held out for
# validation; the callbacks may end training earlier.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.1,
    batch_size=64,
    epochs=20,
    callbacks=callbacks,
)