In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Mount Google Drive at /content/drive/ (Colab-only API) so that the data files
# referenced by later cells under /content/drive/MyDrive/ can be read.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Read the raw training file as a single text column (one record per line),
# then break every record into its ' ::: '-delimited fields.
train_data_path = '/content/drive/MyDrive/tf-idf/train_data.txt'
train_data = pd.read_csv(train_data_path, header=None, sep="\t")

split_data = train_data[0].str.split(' ::: ', expand=True)

# Field 2 is used as the class label and field 3 as the text to classify;
# the remaining fields are not used in this notebook.
df = pd.DataFrame({'label': split_data[2], 'text': split_data[3]})

df.head()

Unnamed: 0,label,text
0,drama,Listening in to a conversation between his doc...
1,thriller,A brother and sister with a past incestuous re...
2,adult,As the bus empties the students for their fiel...
3,drama,To help their unemployed father make ends meet...
4,drama,The film's title refers not only to the un-rec...


In [None]:

# Load the stop-word list: one entry per line.
stop_words_path = '/content/drive/MyDrive/tf-idf/stop_words_english.txt'

# Explicit encoding so the result does not depend on the platform default.
# Entries are lower-cased because the filter compares lower-cased tokens, and
# blank lines are skipped so the set never contains the empty string.
with open(stop_words_path, 'r', encoding='utf-8') as file:
    stop_words = {line.strip().lower() for line in file if line.strip()}

def remove_stop_words(text, vocabulary=None):
    """Return ``text`` with stop words removed.

    The text is split on whitespace. A token is dropped when its lower-cased
    form is a stop word, either as written or after stripping leading/trailing
    ASCII punctuation (so "the," is removed just like "the"). Kept tokens are
    returned unchanged, joined by single spaces.

    Parameters
    ----------
    text : str
        Text to filter. Any non-string value (e.g. None or NaN coming from a
        row with missing fields) yields an empty string instead of raising.
    vocabulary : collection of str, optional
        Lower-cased stop words to remove. Defaults to the notebook-level
        ``stop_words`` set.

    Returns
    -------
    str
        The filtered text.
    """
    import string

    # Only fall back to the notebook-level set when no vocabulary is supplied.
    vocabulary = stop_words if vocabulary is None else vocabulary

    if not isinstance(text, str):
        return ''

    kept_tokens = []
    for token in text.split():
        lowered = token.lower()
        core = lowered.strip(string.punctuation)
        # `core and ...` keeps punctuation-only tokens even if the vocabulary
        # happens to contain an empty string.
        if lowered in vocabulary or (core and core in vocabulary):
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Filter every description through remove_stop_words and preview the result.
df['text'] = df['text'].map(remove_stop_words)
df.head()


Unnamed: 0,label,text
0,drama,"Listening conversation doctor parents, 10-year..."
1,thriller,brother sister incestuous relationship current...
2,adult,bus empties students field trip Museum Natural...
3,drama,"unemployed father ends meet, Edith twin sister..."
4,drama,film's title refers un-recovered bodies ground...


In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m61.4/68.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199771 sha256=d94ee2de850282c17603b67a4d643d6be528084345d1fcc61c3150a7f317af80
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fa

In [None]:


import fasttext
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# fastText's supervised format is one example per line: "__label__<class> <text>".
train_fasttext = pd.concat([y_train, X_train], axis=1)
train_fasttext['label'] = '__label__' + train_fasttext['label'].astype(str)

# Write the lines by hand instead of DataFrame.to_csv(sep=' '): the CSV writer
# wraps any field containing the separator in double quotes (and doubles
# embedded quotes), which would leak quote characters into the training tokens.
train_fasttext_path = '/content/drive/MyDrive/tf-idf/train_fasttext.txt'
with open(train_fasttext_path, 'w', encoding='utf-8') as train_file:
    for row_label, row_text in zip(train_fasttext['label'], train_fasttext['text']):
        # Collapse all whitespace so each example stays on exactly one line.
        single_line = ' '.join(str(row_text).split())
        train_file.write(f'{row_label} {single_line}\n')

model = fasttext.train_supervised(train_fasttext_path,
                                  lr=0.2,
                                  dim=100,
                                  ws=5,
                                  epoch=30,
                                  minCount=1,
                                  wordNgrams=5,
                                  loss='softmax',
                                  verbose=2)

# predict() returns (labels, probabilities); keep the top label without its prefix.
y_pred = [model.predict(text)[0][0].replace('__label__', '') for text in X_test]
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.4981093793230656


# Takes too much time


In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Concatenate, Embedding, Layer
from tensorflow.keras.optimizers import Adam
import numpy as np
import tensorflow.keras.backend as K

EMBEDDING_DIM = 100
# Sequences are padded/truncated to this percentile of the training lengths
# rather than to the single longest text, which keeps the LSTM input (and the
# Flatten -> Dense layer) from being sized by one outlier.
MAX_LEN_PERCENTILE = 95

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# One-hot encode the labels. The test frame is re-indexed onto the training
# columns so both share the same class order and width even when a class is
# missing from one split (a class seen only in the test split becomes an
# all-zero row). Cast to float because get_dummies may return booleans.
y_train_encoded = pd.get_dummies(y_train).astype('float32')
y_test_encoded = (
    pd.get_dummies(y_test)
    .reindex(columns=y_train_encoded.columns, fill_value=0)
    .astype('float32')
)

# Tokenization (vocabulary is fitted on the training texts only)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences
train_lengths = [len(seq) for seq in X_train_seq]
max_len = int(np.percentile(train_lengths, MAX_LEN_PERCENTILE))
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Building Word2Vec model on the tokenized training texts. Training on the
# bare vocabulary list (one artificial "sentence") gives the model no real
# word contexts to learn from.
train_sentences = [[tokenizer.index_word[idx] for idx in seq] for seq in X_train_seq]
word2vec = Word2Vec(sentences=train_sentences, vector_size=EMBEDDING_DIM, window=5, min_count=1)

# Row i of the matrix holds the vector of the word with tokenizer index i;
# row 0 stays zero because index 0 is the padding value.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if word in word2vec.wv:
        embedding_matrix[i] = word2vec.wv[word]

# Building TextRCNN Model
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(len(tokenizer.word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], trainable=False)(input_layer)
lstm_layer = LSTM(100, return_sequences=True)(embedding_layer)
conv_layer = Conv1D(100, 3, activation='relu')(lstm_layer)
pooling_layer = MaxPooling1D(pool_size=2)(conv_layer)
flatten_layer = Flatten()(pooling_layer)
# One output unit per one-hot column.
output_layer = Dense(y_train_encoded.shape[1], activation='softmax')(flatten_layer)
model_rcnn = Model(inputs=input_layer, outputs=output_layer)
model_rcnn.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

model_rcnn.fit(X_train_pad, y_train_encoded, batch_size=64, epochs=5, validation_split=0.2)
# Evaluate the model trained above (the original referenced an undefined `model_rnn`).
loss, accuracy = model_rcnn.evaluate(X_test_pad, y_test_encoded)
print(f'Accuracy: {accuracy}')

Epoch 1/5

KeyboardInterrupt: ignored

1
2

1
2
