#**Mount Google Drive & Load JSON Dataset**

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Import necessary libraries for Google Drive and file handling
import json

# Specify the path to your JSON file on Google Drive
file_path = '/content/drive/MyDrive/Data_Analysis_using_python/reuters_word_index.json'  # Replace with your actual file path

# Load the JSON dataset
with open(file_path, 'r') as file:
    data = json.load(file)

# Display the first few entries to verify loading
print("Sample data:", list(data.items())[:500])

Sample data: [('mdbl', 10996), ('fawc', 16260), ('degussa', 12089), ('woods', 8803), ('hanging', 13796), ('localized', 20672), ('sation', 20673), ('chanthaburi', 20675), ('refunding', 10997), ('hermann', 8804), ('passsengers', 20676), ('stipulate', 20677), ('heublein', 8352), ('screaming', 20713), ('tcby', 16261), ('four', 185), ('grains', 1642), ('broiler', 20680), ('wooden', 12090), ('wednesday', 1220), ('highveld', 13797), ('duffour', 7593), ('0053', 20681), ('elections', 3914), ('270', 2563), ('271', 3551), ('272', 5113), ('273', 3552), ('274', 3400), ('rudman', 7975), ('276', 3401), ('277', 3478), ('278', 3632), ('279', 4309), ('dormancy', 9381), ('errors', 7247), ('deferred', 3086), ('sptnd', 20683), ('cooking', 8805), ('stratabit', 20684), ('designing', 16262), ('metalurgicos', 20685), ('databank', 13798), ('300er', 20686), ('shocks', 20687), ('nawg', 7972), ('tnta', 20688), ('perforations', 20689), ('affiliates', 2891), ('27p', 20690), ('ching', 16263), ('china', 595), ('wagyu'

#**Preprocess Vocabulary (Tokenize & Clean Words)**

In [4]:
# Text cleaning and tokenization
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

nltk.download('stopwords')

words = list(data.keys())

tokenizer = RegexpTokenizer(r'\b[a-zA-Z]{2,}\b')
stop_words = set(stopwords.words('english'))

def preprocess(word):
    word = word.lower()
    return [t for t in tokenizer.tokenize(word) if t not in stop_words]

tokens = [preprocess(w) for w in words]
tokens = [t for sub in tokens for t in sub]  # Flatten
print("Clean tokens:", tokens[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Clean tokens: ['mdbl', 'fawc', 'degussa', 'woods', 'hanging', 'localized', 'sation', 'chanthaburi', 'refunding', 'hermann']


#**Install Gensim & Fix Compatibility**

In [5]:
!pip install gensim



In [8]:
# 💥 Force fresh installs
!pip uninstall -y numpy scipy gensim
!pip install -U numpy==1.24.3 scipy==1.11.3 gensim==4.3.1 --force-reinstall --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.7/61.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.2/83.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.2.1 requires numpy>=1.26, but you have numpy 1.24.3 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 1.24.3 which is incompatible.
albucore 0.0.23 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.24.3 which is incompatible.
albumentations 2.0.5 requires numpy>=1.24.4, but you have numpy 1.24.3 which is incompatible.
scikit-image 0.25.2 requires scipy>=1.11.4, but you have scipy 1.11.3 which is incompatible.
pymc 5.21.2 requires numpy>=1.25.0, 

#**Train Word2Vec Model on Cleaned Tokens**

In [9]:
from gensim.models import Word2Vec
print("Gensim import successful!")

Gensim import successful!


In [10]:
from gensim.models import Word2Vec

# Train Word2Vec on single-token sentences
sentences = [[token] for token in tokens]
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
model.save("reuters_word2vec.model")

# Show vector for a word
sample_word = "china"
if sample_word in model.wv:
    print(f"Vector for '{sample_word}':", model.wv[sample_word][:10])
else:
    print(f"'{sample_word}' not in vocabulary.")

Vector for 'china': [ 7.1148876e-05  4.1307653e-03  5.5061770e-03  4.2398069e-03
 -6.1513605e-03  6.8574082e-03 -3.8503288e-04  2.7118577e-03
  6.7851674e-03  9.3285581e-03]


#**Convert Tokens to Sequences & Prepare Labels**

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split

# Build vocabulary index
vocab = {word: idx + 1 for idx, word in enumerate(set(tokens))}
vocab_size = len(vocab) + 1

# Convert words into 1D integer array (since you're using 1-word sequences)
indexed_tokens = np.array([vocab[word] for word in tokens]).reshape(-1, 1)

# Create synthetic binary labels (0 or 1)
labels = np.random.randint(0, 2, size=(len(indexed_tokens), ))

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(indexed_tokens, labels, test_size=0.2, random_state=42)

#**Build Embedding Matrix from Word2Vec Vectors**

In [12]:
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, idx in vocab.items():
    if word in model.wv:
        embedding_matrix[idx] = model.wv[word]

#**Install PyTorch & Prepare Datasets**

In [13]:
!pip install torch --quiet
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m75.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [14]:
# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1)

X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test).unsqueeze(1)

# Create loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

#**Define & Train PyTorch Neural Network Model**

In [15]:
class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(1, 64)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.dropout(self.relu(self.fc1(x)))
        return self.sigmoid(self.fc2(x))

model = SimpleNet()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [16]:
# Training loop
for epoch in range(10):
    model.train()
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/10 - Loss: {loss.item():.4f}")

Epoch 1/10 - Loss: 47.3684
Epoch 2/10 - Loss: 57.8947
Epoch 3/10 - Loss: 73.6842
Epoch 4/10 - Loss: 52.6316
Epoch 5/10 - Loss: 52.6316
Epoch 6/10 - Loss: 63.1579
Epoch 7/10 - Loss: 36.8421
Epoch 8/10 - Loss: 31.5789
Epoch 9/10 - Loss: 47.3684
Epoch 10/10 - Loss: 52.6316


#**Evaluate Model Accuracy**

In [17]:
# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        predictions = model(batch_X).round()
        correct += (predictions == batch_y).sum().item()
        total += batch_y.size(0)

print(f"\n✅ PyTorch Test Accuracy: {correct / total:.4f}")


✅ PyTorch Test Accuracy: 0.4992


#**Check Prediction on New Input Token**



In [18]:
# Pick a word
word = "china"

# Check if it exists in vocab
if word in vocab:
    word_index = torch.FloatTensor([[vocab[word]]])  # shape [1, 1]

    model.eval()
    with torch.no_grad():
        prediction = model(word_index)
        predicted_class = int(prediction.round().item())
        print(f"Input word: '{word}'")
        print(f"Predicted Class: {predicted_class}")
else:
    print(f"'{word}' not in vocabulary.")

Input word: 'china'
Predicted Class: 1


#**Try Multiple Words at Once**

In [19]:
test_words = ["oil", "bank", "market", "trade", "currency"]

model.eval()
for word in test_words:
    if word in vocab:
        x = torch.FloatTensor([[vocab[word]]])
        with torch.no_grad():
            y_pred = model(x).item()
            print(f"{word:>10} → Confidence: {y_pred:.4f} → Class: {int(round(y_pred))}")
    else:
        print(f"{word:>10} → Not in vocab")

       oil → Confidence: 1.0000 → Class: 1
      bank → Confidence: 1.0000 → Class: 1
    market → Confidence: 1.0000 → Class: 1
     trade → Confidence: 1.0000 → Class: 1
  currency → Confidence: 1.0000 → Class: 1


#**Test the Word2Vec Model**

In [20]:
# After Word2Vec training
word2vec_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save("reuters_word2vec.model")



# Word similarity test
if "oil" in word2vec_model.wv:
    similar = word2vec_model.wv.most_similar("oil", topn=5)
    print("Words most similar to 'oil':")
    for word, score in similar:
        print(f"{word}: {score:.4f}")
else:
    print("'oil' not found in Word2Vec vocabulary.")

Words most similar to 'oil':
hubert: 0.4006
onto: 0.3917
manly: 0.3883
wild: 0.3862
westcoast: 0.3819


#**Visual Check: Accuracy Was Actually Trained**

In [21]:
print(f"Final model test accuracy: {correct / total:.4f}")

Final model test accuracy: 0.4992
