In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import nltk

nltk.download('punkt')  # Download NLTK tokenizer data (only needed once)
data = pd.read_csv('C:\\Users\\Ankita\\Downloads\\Python_test.csv')
print(data.head())

  ICD_Codes                                    Description
0       E08  Diabetes mellitus due to underlying condition
1       E09     Drug or chemical induced diabetes mellitus
2       E10                       Type 1 diabetes mellitus
3       E11                       Type 2 diabetes mellitus
4       E13              Other specified diabetes mellitus


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ankita\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def preprocess_and_tokenize(description):
    # Lowercase the text
    description = description.lower()
    
    # Remove special characters and punctuation
    description = re.sub(r'[^a-zA-Z0-9\s]', '', description)
    
    # Tokenize the text
    tokens = word_tokenize(description)
    
    return tokens

In [4]:
# Apply tokenization to the "Description" column
data['Tokenized_Description'] = data['Description'].apply(preprocess_and_tokenize)

# Check the updated data
print(data[['Description', 'Tokenized_Description']].head())

                                     Description  \
0  Diabetes mellitus due to underlying condition   
1     Drug or chemical induced diabetes mellitus   
2                       Type 1 diabetes mellitus   
3                       Type 2 diabetes mellitus   
4              Other specified diabetes mellitus   

                               Tokenized_Description  
0  [diabetes, mellitus, due, to, underlying, cond...  
1  [drug, or, chemical, induced, diabetes, mellitus]  
2                      [type, 1, diabetes, mellitus]  
3                      [type, 2, diabetes, mellitus]  
4             [other, specified, diabetes, mellitus]  


In [5]:
print(data)

   ICD_Codes                                    Description  \
0        E08  Diabetes mellitus due to underlying condition   
1        E09     Drug or chemical induced diabetes mellitus   
2        E10                       Type 1 diabetes mellitus   
3        E11                       Type 2 diabetes mellitus   
4        E13              Other specified diabetes mellitus   
5       N181                Chronic kidney disease, stage 1   
6       N182                Chronic kidney disease, stage 2   
7       N183                Chronic kidney disease, stage 3   
8       N184                Chronic kidney disease, stage 4   
9       N185                Chronic kidney disease, stage 5   
10      N186                        End-stage renal disease   
11      N189            Chronic kidney disease, unspecified   

                                Tokenized_Description  
0   [diabetes, mellitus, due, to, underlying, cond...  
1   [drug, or, chemical, induced, diabetes, mellitus]  
2           

In [6]:
data = pd.DataFrame({
    'ICD_Code': ['E08', 'E09', 'E10', 'E11', 'E13', 'N181', 'N182', 'N183', 'N184', 'N185', 'N186', 'N189'],
    'Description': [
        'Diabetes mellitus due to underlying condition',
        'Drug or chemical induced diabetes mellitus',
        'Type 1 diabetes mellitus',
        'Type 2 diabetes mellitus',
        'Other specified diabetes mellitus',
        'Chronic kidney disease, stage 1',
        'Chronic kidney disease, stage 2',
        'Chronic kidney disease, stage 3',
        'Chronic kidney disease, stage 4',
        'Chronic kidney disease, stage 5',
        'End-stage renal disease',
        'Chronic kidney disease, unspecified'
    ]
})

# Define the mapping of diagnosis codes to numerical labels
code_to_label = {
    'E08': 1,
    'E09': 2,
    'E10': 3,
    'E11': 4,
    'E13': 5,
    'N181': 6,
    'N182': 7,
    'N183': 8,
    'N184': 9,
    'N185': 10,
    'N186': 11,
    'N189': 12,
}

# Create a new column 'Label' by mapping the 'ICD_Code' column to numerical labels
data['Label'] = data['ICD_Code'].map(code_to_label)

# Print the updated data
print(data)

   ICD_Code                                    Description  Label
0       E08  Diabetes mellitus due to underlying condition      1
1       E09     Drug or chemical induced diabetes mellitus      2
2       E10                       Type 1 diabetes mellitus      3
3       E11                       Type 2 diabetes mellitus      4
4       E13              Other specified diabetes mellitus      5
5      N181                Chronic kidney disease, stage 1      6
6      N182                Chronic kidney disease, stage 2      7
7      N183                Chronic kidney disease, stage 3      8
8      N184                Chronic kidney disease, stage 4      9
9      N185                Chronic kidney disease, stage 5     10
10     N186                        End-stage renal disease     11
11     N189            Chronic kidney disease, unspecified     12


In [17]:
pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/59/ed/548f6f686845d386a727a51a3daa411d95fc599649a2d54705f6773ac259/scikit_learn-1.3.1-cp310-cp310-win_amd64.whl.metadata
  Downloading scikit_learn-1.3.1-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.5.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.5.0 from https://files.pythonhosted.org/packages/70/03/485f73046134400ea25d3cb178c5e6728f9b165f79d09638ecb44ee0e9b1/scipy-1.11.2-cp310-cp310-win_amd64.whl.metadata
  Downloading scipy-1.11.2-cp310-cp310-win_amd64.whl.metadata (59 kB)
     ---------------------------------------- 0.0/59.1 kB ? eta -:--:--
     ---------------------------------------- 59.1/59.1 kB 3.3 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b

In [18]:
import sklearn
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets (80% training, 20% testing)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Print the sizes of the training and testing sets
print("Training set size:", len(train_data))
print("Testing set size:", len(test_data))

Training set size: 9
Testing set size: 3


In [42]:
import gensim
from gensim.models import Word2Vec

# Sample data
tokenized_descriptions = data['Description'].apply(preprocess_and_tokenize)

# Train a Word2Vec model
model = Word2Vec(sentences=tokenized_descriptions, vector_size=100, window=5, min_count=1, sg=0)

# Convert tokenized text data into Word2Vec vectors
x = [model.wv[token] for tokens in tokenized_descriptions for token in tokens]
y = data['Label']
# Print the Word2Vec vectors
print(word2vec_vectors)

[array([-8.2441056e-03,  9.3003456e-03, -2.0066013e-04, -1.9677025e-03,
        4.6060821e-03, -4.0952927e-03,  2.7430535e-03,  6.9450000e-03,
        6.0620159e-03, -7.5101568e-03,  9.3816333e-03,  4.6673468e-03,
        3.9637010e-03, -6.2418417e-03,  8.4569082e-03, -2.1504730e-03,
        8.8273818e-03, -5.3586191e-03, -8.1297318e-03,  6.8183513e-03,
        1.6704592e-03, -2.1981017e-03,  9.5139602e-03,  9.4931209e-03,
       -9.7737210e-03,  2.5046577e-03,  6.1588003e-03,  3.8749112e-03,
        2.0201779e-03,  4.2857855e-04,  6.7672366e-04, -3.8228012e-03,
       -7.1409047e-03, -2.0901493e-03,  3.9192075e-03,  8.8188276e-03,
        9.2627294e-03, -5.9755058e-03, -9.4045643e-03,  9.7657172e-03,
        3.4328592e-03,  5.1638312e-03,  6.2802024e-03, -2.8005317e-03,
        7.3276819e-03,  2.8342444e-03,  2.8679157e-03, -2.3831653e-03,
       -3.1270799e-03, -2.3709587e-03,  4.2774454e-03,  7.5728371e-05,
       -9.5832301e-03, -9.6661691e-03, -6.1524236e-03, -1.3351504e-04,
     

In [44]:
from sklearn.preprocessing import LabelEncoder

In [46]:
# Prepare input data and labels
# Convert labels to one-hot encoding
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_one_hot = tf.keras.utils.to_categorical(y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y_one_hot, test_size=0.2, random_state=42)


ValueError: Found input variables with inconsistent numbers of samples: [56, 12]

In [20]:
pip install -U gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/ab/b0/d58dc405fd60ab546ca714321235dc2d455b2dc06bfb4fc1092940c749fc/gensim-4.3.2-cp310-cp310-win_amd64.whl.metadata
  Downloading gensim-4.3.2-cp310-cp310-win_amd64.whl.metadata (8.5 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages/fc/d9/d97f1db64b09278aba64e8c81b5d322d436132df5741c518f3823824fae0/smart_open-6.4.0-py3-none-any.whl.metadata
  Downloading smart_open-6.4.0-py3-none-any.whl.metadata (21 kB)
Downloading gensim-4.3.2-cp310-cp310-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.2/24.0 MB 2.1 MB/s eta 0:00:12
   ---------------------------------------- 0.2/24.0 MB 2.0 MB/s eta 0:00:13
    --------------------------

In [37]:
# Define the neural network model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(100,)))  # Adjust input shape based on your embeddings
model.add(Dense(32, activation='relu'))
model.add(Dense(12, activation='softmax'))  # Output layer with 12 classes (labels)

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(np.array(X_test), np.array(y_test))
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


NameError: name 'X_train' is not defined

In [25]:
pip install keras

Collecting keras
  Obtaining dependency information for keras from https://files.pythonhosted.org/packages/fe/58/34d4d8f1aa11120c2d36d7ad27d0526164b1a8ae45990a2fede31d0e59bf/keras-2.14.0-py3-none-any.whl.metadata
  Downloading keras-2.14.0-py3-none-any.whl.metadata (2.4 kB)
Downloading keras-2.14.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.7 MB 15.4 MB/s eta 0:00:01
   ------- -------------------------------- 0.3/1.7 MB 4.1 MB/s eta 0:00:01
   ----------------------------- ---------- 1.3/1.7 MB 10.0 MB/s eta 0:00:01
   -------------------------------- ------- 1.4/1.7 MB 9.9 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 7.8 MB/s eta 0:00:00
Installing collected packages: keras
Successfully installed keras-2.14.0
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Prepare input data and labels
X = [model.wv[token] for tokens in tokenized_descriptions for token in tokens]
y = data['Label']

# Convert labels to one-hot encoding
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_one_hot = tf.keras.utils.to_categorical(y_encoded)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)


In [28]:
pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/ba/7c/b971f2485155917ecdcebb210e021e36a6b65457394590be01cc61515310/tensorflow-2.13.0-cp310-cp310-win_amd64.whl.metadata
  Downloading tensorflow-2.13.0-cp310-cp310-win_amd64.whl.metadata (2.6 kB)
Collecting tensorflow-intel==2.13.0 (from tensorflow)
  Obtaining dependency information for tensorflow-intel==2.13.0 from https://files.pythonhosted.org/packages/40/fa/98115f6fe4d92e1962f549917be2dc8e369853b7e404191996fedaaf4dd6/tensorflow_intel-2.13.0-cp310-cp310-win_amd64.whl.metadata
  Downloading tensorflow_intel-2.13.0-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.13.0->tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Downloading absl_py-2.0.0-py3-none-any

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Ankita\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~umpy.libs\\libopenblas64__v0.3.23-293-gc2f4bdbb-gcc_10_3_0-65e29aac85b9409a6008e2dc84b1cc09.dll'
Consider using the `--user` option or check the permissions.

