In [None]:
!pip install numpy pandas tensorflow scikit-learn



In [None]:
import ast

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, Reshape
from tensorflow.keras.models import Sequential

# Step 1: Load the dataset
data = pd.read_csv('/content/parent-synonym-embedding.csv')  # Replace with your dataset path

# Step 2: Convert embeddings from string to numpy arrays
# Each embedding cell holds the text of a Python list. ast.literal_eval parses
# literals only, so a crafted CSV cell cannot execute code the way eval() would.
synonym_columns = [f'Synonym-{i} Embedding' for i in range(1, 6)]
for column in ['Parent Embedding', *synonym_columns]:
    data[column] = data[column].apply(lambda text: np.array(ast.literal_eval(text)))

# Step 3: Prepare input (X) and output (y) data
# X: (samples, parent_dim)
X = np.array(data['Parent Embedding'].tolist())
# y: one (samples, synonym_dim) matrix per synonym column, stacked on a new
# axis 1 -> (samples, 5, synonym_dim). Column-wise stacking avoids iterrows().
y = np.stack([np.array(data[column].tolist()) for column in synonym_columns], axis=1)

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Define the FFNN model
parent_dim = X_train.shape[1]
# Size the output from y itself so the layer widths always match the targets.
num_synonyms, synonym_dim = y_train.shape[1:]
model = Sequential([
    Input(shape=(parent_dim,)),  # Explicit Input layer instead of the deprecated input_dim argument
    Dense(128, activation='relu'),  # First hidden layer
    Dense(64, activation='relu'),  # Second hidden layer
    Dense(num_synonyms * synonym_dim, activation='linear'),  # Flat output, one value per target component
    Reshape((num_synonyms, synonym_dim)),  # Reshape to (batch_size, num_synonyms, synonym_dim)
])

# Step 6: Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Step 7: Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Step 8: Evaluate the model on the test set
loss, mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test MAE: {mae}')

# Step 9: Save the trained model
# The .h5 extension already selects the HDF5 format; the former save_format='tf'
# argument contradicted the file name, so it is dropped.
model.save('paraphrase_model.h5')
print("Model saved as 'paraphrase_model.h5'")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 0.0513 - mae: 0.1963 - val_loss: 0.0474 - val_mae: 0.1901
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0465 - mae: 0.1881 - val_loss: 0.0447 - val_mae: 0.1846
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.0435 - mae: 0.1817 - val_loss: 0.0421 - val_mae: 0.1783
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0408 - mae: 0.1746 - val_loss: 0.0394 - val_mae: 0.1708
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0376 - mae: 0.1659 - val_loss: 0.0365 - val_mae: 0.1621
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0336 - mae: 0.1542 - val_loss: 0.0334 - val_mae: 0.1526
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 0.0306 - mae: 0.



Test Loss: 0.006528886500746012, Test MAE: 0.05760196968913078
Model saved as 'paraphrase_model.h5'


In [None]:
import numpy as np
import pandas as pd
import sys
from tensorflow.keras.models import load_model
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load the saved model
print("Loading model...", flush=True)
model = load_model('paraphrase_model.h5', compile=False)  # Load without compiling
model.compile(optimizer='adam', loss='mse', metrics=['mae'])  # Recompile
print("Model loaded successfully.", flush=True)

# Step 2: Load word_vectors.csv (parent words and their embeddings)
print("Loading word_vectors.csv...", flush=True)
# The header names only two columns ('word', 'vector'), but judging by the
# recorded output each data row holds the word followed by one field per
# embedding component. Parsing with that header made pandas treat the surplus
# leading fields as an index, so the resulting dict mapped float -> float and
# no word could ever be looked up. Skip the header and read by position instead.
word_vectors_df = pd.read_csv('/content/word_vectors.csv', header=None, skiprows=1)
word_vectors = dict(zip(
    word_vectors_df.iloc[:, 0],                       # first field: the word
    word_vectors_df.iloc[:, 1:].to_numpy(dtype=float),  # remaining fields: its embedding
))
# Report a count rather than dumping the whole dictionary into the output.
print(f"Loaded {len(word_vectors)} word vectors.", flush=True)

# Step 3: Load synonyms.csv (complete vocabulary of synonyms and their embeddings)
print("Loading synonyms.csv...", flush=True)
synonyms_df = pd.read_csv('/content/synonyms.csv', header=None).rename(columns={0: 'Synonym'})

# Every column after the first is an embedding component; the explicit dtype
# makes a malformed (non-numeric) file fail here rather than during prediction.
synonym_embeddings = synonyms_df.iloc[:, 1:].to_numpy(dtype=float)
synonyms = dict(zip(synonyms_df['Synonym'], synonym_embeddings))
print(f"Loaded {len(synonyms)} synonyms.", flush=True)

# Step 4: Function to find the closest synonym word
def find_closest_synonym(embedding, synonyms):
    """Return the word in ``synonyms`` whose vector is most cosine-similar to ``embedding``.

    Args:
        embedding: 1-D sequence/array of floats (the query vector).
        synonyms: mapping of word -> vector with the same length as ``embedding``.

    Returns:
        The best-matching word (the first one on ties), or ``None`` when
        ``synonyms`` is empty.

    Raises:
        ValueError: if the vector lengths do not match or any value is NaN/inf.
    """
    closest_word = None
    max_similarity = -1

    if synonyms:
        words = list(synonyms)
        # One (n_words, dim) matrix lets a single matrix product score every
        # candidate instead of one similarity call per word.
        candidates = np.asarray([synonyms[w] for w in words], dtype=float).reshape(len(words), -1)
        query = np.asarray(embedding, dtype=float).reshape(-1)

        scale = np.linalg.norm(candidates, axis=1) * np.linalg.norm(query)
        scale[scale == 0] = 1.0  # a zero vector scores 0 instead of dividing by zero
        similarities = (candidates @ query) / scale
        if not np.all(np.isfinite(similarities)):
            raise ValueError("Embeddings must not contain NaN or infinite values.")

        # argmax always selects a candidate, even when the best score is exactly -1
        # (the previous `> -1` sentinel returned None in that case).
        best = int(np.argmax(similarities))
        closest_word = words[best]
        max_similarity = float(similarities[best])

    print(f"Closest synonym found: {closest_word} (Similarity: {max_similarity})", flush=True)
    return closest_word

# Step 5: Function to paraphrase a sentence
def paraphrase_sentence(sentence, word_vectors, synonyms, model):
    """Replace each known word of ``sentence`` with its predicted closest synonym.

    Args:
        sentence: whitespace-separated text to paraphrase.
        word_vectors: mapping of word -> parent embedding vector.
        synonyms: mapping of candidate word -> embedding, searched with
            ``find_closest_synonym``.
        model: object whose ``predict`` takes a batch of parent embeddings and
            returns predicted synonym embeddings.

    Returns:
        The paraphrased sentence. Words without an embedding, and words whose
        prediction fails, are kept unchanged.
    """
    print(f"Paraphrasing sentence: {sentence}", flush=True)
    paraphrased_sentence = []

    for word in sentence.split():
        # Prefer the exact spelling, then fall back to lowercase so capitalised
        # words ("The") can still match a lowercase vocabulary.
        key = word if word in word_vectors else word.lower()

        if key not in word_vectors:
            print(f"Word '{word}' not found in word vectors, keeping it unchanged.", flush=True)
            paraphrased_sentence.append(word)
            continue

        # Step 1: Get the parent embedding
        parent_embedding = word_vectors[key]
        print(f"Embedding for '{word}': {parent_embedding}", flush=True)

        try:
            # Step 2: Predict the synonym embedding
            predicted_synonym_embedding = np.asarray(model.predict(np.array([parent_embedding])))[0]
            # The trained model emits several synonym embeddings per input
            # (a matrix); the similarity search needs a single vector, so use
            # the first one. Passing the whole matrix used to raise every time.
            if predicted_synonym_embedding.ndim > 1:
                predicted_synonym_embedding = predicted_synonym_embedding[0]
            print(f"Predicted embedding for '{word}': {predicted_synonym_embedding}", flush=True)

            # Step 3: Find the closest synonym word
            closest_synonym = find_closest_synonym(predicted_synonym_embedding, synonyms)
            # Fall back to the original word so a None never reaches ' '.join.
            paraphrased_sentence.append(closest_synonym if closest_synonym else word)
        except Exception as e:
            # Deliberate best effort: report the problem and keep the original word.
            print(f"Error predicting synonym for '{word}': {e}", flush=True)
            paraphrased_sentence.append(word)

    paraphrased_text = ' '.join(paraphrased_sentence)
    print(f"Final paraphrased sentence: {paraphrased_text}", flush=True)
    return paraphrased_text

# Step 6: Example usage
if __name__ == "__main__":
    print("Starting paraphrasing process...", flush=True)

    # Demo input: run the example sentence through the paraphraser.
    sentence = "The quick brown fox jumped over the huge wall"
    paraphrased_sentence = paraphrase_sentence(sentence, word_vectors, synonyms, model)

    # Report the input and the result on consecutive lines.
    print(
        f'Original Sentence: {sentence}',
        f'Paraphrased Sentence: {paraphrased_sentence}',
        sep='\n',
        flush=True,
    )

    # Force output if running in buffered environments.
    sys.stdout.flush()

Loading model...
Model loaded successfully.
Loading word_vectors.csv...
Columns in word_vectors.csv: Index(['word', 'vector'], dtype='object')
{-0.32571685: 0.2370348, -0.2928181: 0.114224516, -0.23087312: 0.03726288, 0.079329096: -0.063414566, 0.07696159: -0.29203844, -0.25135127: 0.22717102, -0.19286314: 0.2383442, -0.19125912: 0.23251452, -0.1575665: 0.29443133, -0.34171999: -0.2839549, -0.24350148: -0.19983734, -0.25435737: -0.26340753, -0.03231681: 0.2888182, 0.04287957: 0.2638361, 0.29722205: -0.30206951, -0.06936145: 0.2859176, -0.10753195: -0.28023738, 0.25185424: -0.19522798, -0.29206997: 0.12095589, -0.22663373: -0.23525415, 0.21334283: 0.22564988, -0.31323645: -0.20591897, 0.17965874: 0.17990425, 0.40626368: -0.30814192, 0.15974215: 0.31327423, 0.06825441: 0.14610286, 0.2754313: -0.3585307, -0.26235715: -0.27208456, -0.022279005: 0.28042865, 0.36751276: -0.27252883, 0.1389837: 0.25283426, 0.34448788: -0.26373684, 0.039179407: 0.24362923, -0.14345916: -0.088127986, 0.1473631:

In [None]:
import numpy as np
import pandas as pd
import csv
from tensorflow.keras.models import load_model
from sklearn.metrics.pairwise import cosine_similarity

# Load trained model (compile=False skips restoring the training configuration; it is recompiled below)
model = load_model('paraphrase_model.h5', compile=False)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Load word embeddings: each row is a word followed by its embedding components
file_path = "/content/word_vectors.csv"
existing_vectors = {}
# newline='' hands line-ending handling to the csv module, as its documentation
# requires; without it, quoted fields containing newlines are split incorrectly.
with open(file_path, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip the header row
    for row in reader:
        if len(row) < 2:
            continue  # Skip rows that have no embedding values
        word = row[0].lower()  # Lowercase keys so lookups are case-insensitive
        vector = np.array([float(value) for value in row[1:]])
        existing_vectors[word] = vector

# Load synonyms.csv (no header row: first column is the word, the rest its embedding)
synonyms_df = pd.read_csv('/content/synonyms (2).csv', header=None, encoding='utf-8').rename(columns={0: 'Synonym'})

# Extract synonym embeddings into a dictionary keyed by the lowercased synonym
synonym_embeddings = np.array(synonyms_df.iloc[:, 1:].values, dtype=np.float32)
synonyms = dict(zip(synonyms_df['Synonym'].str.lower(), synonym_embeddings))

# Function to find the closest synonym
def find_closest_synonym(predicted_embedding, synonyms):
    """Return the word in ``synonyms`` most cosine-similar to ``predicted_embedding``.

    Args:
        predicted_embedding: 1-D sequence/array of floats (the query vector).
        synonyms: mapping of word -> vector with the same length as the query.

    Returns:
        The best-matching word (the first one on ties), or ``None`` when
        ``synonyms`` is empty.

    Raises:
        ValueError: if the vector lengths do not match or any value is NaN/inf.
    """
    closest_word = None
    max_similarity = -1

    if synonyms:
        words = list(synonyms)
        # Score every candidate with one matrix product rather than calling a
        # similarity routine once per word.
        candidates = np.asarray([synonyms[w] for w in words], dtype=float).reshape(len(words), -1)
        query = np.asarray(predicted_embedding, dtype=float).reshape(-1)

        scale = np.linalg.norm(candidates, axis=1) * np.linalg.norm(query)
        scale[scale == 0] = 1.0  # a zero vector scores 0 instead of dividing by zero
        similarities = (candidates @ query) / scale
        if not np.all(np.isfinite(similarities)):
            raise ValueError("Embeddings must not contain NaN or infinite values.")

        # argmax always picks a candidate; the old `> -1` sentinel returned None
        # when the best available score was exactly -1.
        best = int(np.argmax(similarities))
        closest_word = words[best]
        max_similarity = float(similarities[best])

    print(f"Closest synonym found: {closest_word} (Similarity: {max_similarity})", flush=True)
    return closest_word

# Function to paraphrase a sentence
def paraphrase_sentence(sentence, word_vectors, synonyms, model):
    """Swap every known word of ``sentence`` for its predicted closest synonym.

    Words are matched against ``word_vectors`` in lowercase. For a match, the
    first row of the model's prediction for that word is handed to
    ``find_closest_synonym``. Unknown words, words with no synonym found, and
    words whose prediction raises are left as they were typed.

    Returns the rebuilt sentence, joined with single spaces.
    """
    print(f"Paraphrasing sentence: {sentence}", flush=True)
    output_tokens = []

    for token in sentence.split():
        key = token.lower()  # lookups are case-insensitive

        # Guard clause: nothing to predict for words without an embedding.
        if key not in word_vectors:
            print(f"Word '{key}' not found in word vectors, keeping it unchanged.", flush=True)
            output_tokens.append(token)
            continue

        source_vector = word_vectors[key]
        print(f"Embedding for '{key}': {source_vector}", flush=True)

        try:
            # Batch of one; keep only the first predicted synonym embedding.
            batch = np.array([source_vector])
            first_candidate = model.predict(batch)[0][0]
            print(f"Predicted embedding for '{key}': {first_candidate}", flush=True)

            replacement = find_closest_synonym(first_candidate, synonyms)

            # An empty/None result falls back to the original token.
            output_tokens.append(replacement or token)
        except Exception as e:
            print(f"Error predicting synonym for '{key}': {e}", flush=True)
            output_tokens.append(token)  # best effort: keep the original word

    paraphrased_text = ' '.join(output_tokens)
    print(f"Final paraphrased sentence: {paraphrased_text}", flush=True)
    return paraphrased_text

# Example usage
if __name__ == "__main__":
    print("Starting paraphrasing process...", flush=True)

    # Demo input: the same word ("bank") appears twice.
    sentence = "The bank is near the bank of a river"
    paraphrased_sentence = paraphrase_sentence(sentence, existing_vectors, synonyms, model)

    # Report the input and the result on consecutive lines.
    print(
        f'Original Sentence: {sentence}',
        f'Paraphrased Sentence: {paraphrased_sentence}',
        sep='\n',
        flush=True,
    )


Starting paraphrasing process...
Paraphrasing sentence: The bank is near the bank of a river
Word 'the' not found in word vectors, keeping it unchanged.
Embedding for 'bank': [ 0.31178635  0.055164    0.31884974  0.319938    0.07515985 -0.19541635
  0.3035846  -0.30206776 -0.30270803  0.29795843 -0.03657262 -0.27057752
  0.1260939  -0.0144297   0.16070908 -0.29281753 -0.16562846  0.055658
  0.17965874  0.17990425]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Predicted embedding for 'bank': [ 0.21307668  0.0778587   0.22855344  0.20203245  0.04104276 -0.15749526
  0.21100959 -0.20532925 -0.26424992  0.26136678 -0.0378166  -0.29118901
  0.10810449 -0.00463709  0.1972131  -0.31828505 -0.15027335  0.04805079
  0.14756846  0.2062249 ]
Closest synonym found: banking (Similarity: 0.9457366466522217)
Word 'is' not found in word vectors, keeping it unchanged.
Embedding for 'near': [ 0.11912069  0.22036478  0.1379675   0.23451321 -0.28102046 -0.17728224
  0.29865184 -0