In [22]:
import pandas as pd
import numpy as np
from ast import literal_eval

import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    hamming_loss,
)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

In [23]:
# Load the CSV file cleaned_data_2 (path relative to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Data/cleaned_data_2.csv")

In [24]:
# Parse each "Tags" value with literal_eval (presumably the CSV holds the tag
# list as its string repr — confirm against the data file).
# Values that are already containers are passed through unchanged so this
# cell can be re-run without literal_eval raising on parsed data; anything
# else still goes through literal_eval and fails loudly if malformed.
df["Tags"] = df["Tags"].apply(
    lambda tags: tags if isinstance(tags, (list, tuple, set)) else literal_eval(tags)
)

In [25]:
# Encode the per-row tag collections in df["Tags"] as a binary indicator
# matrix: one row per document, one column per distinct tag.
multi_label = MultiLabelBinarizer()
y = multi_label.fit_transform(df["Tags"])

In [26]:
# Turn the cleaned text into TF-IDF features: terms present in more than 80%
# of the documents are ignored and the vocabulary is capped at 1000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split that follows, so its statistics also reflect the held-out documents —
# confirm this is intended.
tfidf = TfidfVectorizer(
    max_features=1000,
    max_df=0.8,
)
X_tf = tfidf.fit_transform(df["Text_Cleaned"])

In [27]:
# Hold out 20% of the TF-IDF rows (with their labels) for evaluation; the
# fixed seed keeps the partition reproducible.
tfidf_split = train_test_split(X_tf, y, random_state=42, test_size=0.2)
X_train_tf, X_test_tf, y_train_tf, y_test_tf = tfidf_split

In [28]:
# Baseline multi-label model: one logistic-regression classifier per tag,
# fitted on the TF-IDF training split.
lr = LogisticRegression()
ovr = OneVsRestClassifier(lr).fit(X_train_tf, y_train_tf)

# Per-tag probabilities for the held-out rows.
y_pred_ovr = ovr.predict_proba(X_test_tf)



In [61]:
# Probability cut-off at or above which a tag counts as predicted.
t = 0.5

# Binarise the predicted probabilities. The result has its own name so the
# label matrix `y` is not overwritten by predictions.
y_pred_labels = (y_pred_ovr >= t).astype(int)

# Micro-averaged F1 pools the per-tag decisions over all tags.
print(f1_score(y_test_tf, y_pred_labels, average="micro"))

# On multi-label input accuracy_score is subset accuracy: a row only counts
# as correct when every one of its tags matches.
print("Accuracy:", accuracy_score(y_test_tf, y_pred_labels) * 100, "%")

0.2683554948231461
Accuracy: 3.155414404261858 %


In [63]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [68]:
# Build the word -> integer index from the cleaned text column.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=df["Text_Cleaned"])

In [69]:
# Number of entries in the tokenizer's word index after fitting.
len(tokenizer.word_index)

131482

In [70]:
# Vocabulary size: number of indexed words plus one.
# NOTE(review): presumably the extra slot accounts for index 0, which the
# Keras tokenizer does not assign to any word — confirm.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

131483

In [71]:
# Encode every cleaned document as a list of integer word indices using the
# tokenizer fitted on the same column.
sequences = tokenizer.texts_to_sequences(df["Text_Cleaned"])

In [72]:
# Show the first document next to its encoded form. The prints are separate
# statements: joined with a comma they form a tuple, which the notebook echoes
# as `(None, None)`.
# NOTE(review): this prints the "Text" column while the sequences come from
# "Text_Cleaned" — confirm that comparison is intended.
print(df["Text"][0], "\n")
print(sequences[0])

brain segmentation to 3d model  my goal is to take a dataset of brain tumor segmentations use computer vision to locate and highlight the tumor in each slice then combine each slice almost like stacking to make a 3d model in this 3d model you can see exactly where the tumor is located i am able to use cv to highlight tumors in individual layers but the challenge comes in stacking how would i go about stacking images to make 3d models with this i would have to remove empty outside space in individual images and be able to give each image some depth how would i go about doing this is there any existing library that can do this  

[1737, 1472, 1244, 4, 500, 29, 1737, 5086, 1472, 9, 805, 1738, 2369, 2712, 5086, 2388, 590, 2388, 7152, 1244, 4, 1244, 4, 418, 5086, 2369, 218, 9, 586, 2712, 5086, 370, 47, 1641, 168, 1118, 1118, 55, 1244, 4, 526, 1417, 208, 370, 55, 218, 55, 782, 364, 324]


(None, None)

In [73]:
# Token count of every encoded document.
seq_lengths = [len(encoded_doc) for encoded_doc in sequences]

In [74]:
# Summarise the distribution of document lengths. The Series is built once and
# reused for every percentile.
seq_length_series = pd.Series(seq_lengths)
for pct in (30, 40, 50, 60, 70, 80, 90, 95, 99):
    print(f"{pct}th percentile: ", seq_length_series.quantile(pct / 100))

30th percentile:  43.0
40th percentile:  53.0
50th percentile:  64.0
60th percentile:  77.0
70th percentile:  96.0
80th percentile:  125.0
90th percentile:  185.0
95th percentile:  268.0
99th percentile:  570.9800000000032


In [75]:
# Fixed sequence length fed to the network; 125 equals the 80th percentile of
# document lengths printed by the previous cell.
max_length = 125

# Bring every encoded document to exactly max_length entries.
# NOTE(review): relies on pad_sequences' default padding/truncating side —
# confirm the defaults suit this model.
padded_seq = pad_sequences(sequences, maxlen=max_length)

In [76]:
# Build the tag indicator matrix used as the neural model's target: one row
# per document, one column per distinct tag.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(df["Tags"])

In [77]:
# Display feature and label shapes side by side; the row counts should match.
padded_seq.shape, y.shape

((48803, 125), (48803, 2686))

In [78]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences and their labels; the seed is fixed so
# the partition is reproducible.
sequence_split = train_test_split(padded_seq, y, random_state=9, test_size=0.2)
x_train, x_val, y_train, y_val = sequence_split

In [79]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [81]:
# 1-D convolutional text classifier with one sigmoid output per tag.
# The output width is read from the training labels instead of being
# hard-coded, so the network stays in step with the binarizer if the tag set
# changes.
n_labels = y_train.shape[1]

model = Sequential()
# NOTE(review): vocab_size is already len(word_index) + 1, so the additional
# +1 here allocates one embedding row that is never looked up. It is harmless
# and kept so the layer shape (and any saved weights) stay unchanged.
model.add(Embedding(vocab_size + 1, 128, input_length=max_length))
model.add(Dropout(0.15))
# 300 filters, each spanning 5 consecutive token embeddings.
model.add(Conv1D(300, 5, padding="valid", activation="relu", strides=1))
# Reduce each filter to its maximum response over the sequence.
model.add(GlobalMaxPool1D())
# Sigmoid rather than softmax: each tag is an independent yes/no output.
model.add(Dense(n_labels, activation="sigmoid"))

In [82]:
# Binary cross-entropy scores each sigmoid output as its own yes/no decision.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 125, 128)          16829952  
                                                                 
 dropout (Dropout)           (None, 125, 128)          0         
                                                                 
 conv1d (Conv1D)             (None, 121, 300)          192300    
                                                                 
 global_max_pooling1d (Glob  (None, 300)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 2686)              808486    
                                                                 
Total params: 17830738 (68.02 MB)
Trainable params: 17830738 (68.02 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [83]:
# Stop training after 3 epochs without improvement in the monitored metric,
# and write the model to disk only when it is the best seen so far.
early_stopping = EarlyStopping(patience=3)
checkpoint = ModelCheckpoint(filepath="model-conv1d_v1.h5", save_best_only=True)
callbacks = [early_stopping, checkpoint]

In [84]:
# Train for at most 15 epochs in batches of 128, with 10% of the training rows
# set aside by Keras for validation.
# NOTE(review): x_val / y_val from the earlier split are not used here;
# validation data comes from validation_split instead.
fit_options = dict(
    epochs=15,
    batch_size=128,
    validation_split=0.1,
    callbacks=callbacks,
)
history = model.fit(x_train, y_train, **fit_options)

Epoch 1/15

  saving_api.save_model(


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15