In [11]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt
import fasttext
import gzip
import shutil

In [12]:
#!pip install fasttext-wheel

In [13]:

# Paths to the .gz file and the output .bin file
#gz_file_path = r"C:\Users\Rizvi\Desktop\FastText_BiLSTM\cc.bn.300.bin.gz"
# NOTE: machine-specific absolute path; adjust it when running on another machine.
bin_file_path = r"E:\Bangla-Sentiment-Analysis\Word Embeddings\cc.bn.300.bin"

# One-time setup (left disabled): extract the .bin file from the .gz archive.
# Re-enable together with gz_file_path above if the .bin file is not on disk yet.
# with gzip.open(gz_file_path, 'rb') as f_in:
#     with open(bin_file_path, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

In [14]:


# Load the extracted FastText model from bin_file_path (defined in the
# previous cell) and keep it in `fasttext_model` for use by later cells.
fasttext_model = fasttext.load_model(bin_file_path)


In [15]:

# --- Disabled: earlier construction of the dataset from two Excel files. ---
# Kept for reference only; the live code below reads a single, already
# preprocessed file instead.
# # Load the preprocessed dataset
# df = pd.read_excel(r"C:\Users\Rizvi\Desktop\Bilstm_Bangla\data_fil_preprocessed.xlsx")

# df = df[['clean_sentence', 'Sentiment']]
# df2 = pd.read_excel(r"C:\Users\Rizvi\Desktop\Bilstm_Bangla\product_reviews_bn_translated.xlsx")
# df2.head()
# df2 = df2[['translated_sentence', 'Sentiment']]
# # Rename columns to have a common name for reviews
# df2 = df2.rename(columns={'translated_sentence': 'clean_sentence'})
# (the rename on the next line maps a column to its own name, i.e. a no-op)
# df = df.rename(columns={'clean_sentence': 'clean_sentence'})

# # Concatenate the DataFrames vertically
# all_reviews_df = pd.concat([df, df2], axis=0, ignore_index=True)

# # Print the shape of the concatenated DataFrame
# print("Shape of the concatenated DataFrame:", all_reviews_df.shape)
# df=all_reviews_df

# df.shape

# Load the final preprocessed dataset. Later cells read the 'clean_sentence'
# (review text) and 'Sentiment' (label) columns from this frame.
# NOTE: machine-specific absolute path; adjust it when running on another machine.
df = pd.read_excel(r"E:\Bangla-Sentiment-Analysis\Bangla_Dataset\final_preprocessed_dataset.xlsx")

In [16]:
# Preview the first rows to confirm the loaded columns before training.
df.head()

Unnamed: 0,clean_sentence,Sentiment
0,তেমন ভালো না কিন্তু চলার মত আছে কিন্তু এই বাজে...,Negative
1,পন্যটা মোটামুটি বেশ ভালো,Positive
2,প্রোডাকটি ভালো নয় চার্জ একেবারে কম যায় ব্যাট...,Negative
3,পোডাক্ট মোটামুটি ভালোই বলা চলে কিন্তু ক্লিপ লা...,Negative
4,খুবি ভালো মেশিন ব্যবহার করার পর রিভিউ দিলাম,Positive


In [17]:

# Function to convert DataFrame to FastText format and save to file
def dataframe_to_fasttext(df, output_file):
    """Write ``df`` to ``output_file`` in fastText supervised-training format.

    Every written line has the form ``__label__<Sentiment> <clean_sentence>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Sentiment'`` (label) and
        ``'clean_sentence'`` (text).
    output_file : str or path-like
        Destination file; it is overwritten and encoded as UTF-8.

    Notes
    -----
    * Runs of whitespace in the text (including newlines and tabs) are
      collapsed to a single space. fastText treats each line as one example,
      so an embedded newline would otherwise split a review into a labelled
      line and an unlabelled fragment.
    * Whitespace inside a label is replaced by ``_`` so the whole label stays
      one token instead of leaking its tail into the text.
    * Rows whose label or text is missing (NaN/None) or empty after
      normalisation are skipped instead of being written as ``nan``.
    """
    with open(output_file, 'w', encoding='utf-8') as f:  # Specify encoding
        # Iterate over the two needed columns directly; this avoids building a
        # Series per row as iterrows() does.
        for raw_label, raw_text in zip(df['Sentiment'], df['clean_sentence']):
            if pd.isna(raw_label) or pd.isna(raw_text):
                continue
            label = "_".join(str(raw_label).split())
            text = " ".join(str(raw_text).split())
            if not label or not text:
                continue
            f.write(f"__label__{label} {text}\n")

# Define file paths (relative, so they are created in the current working directory)
train_file = 'train.txt'
valid_file = 'valid.txt'

# Assuming df is already defined and contains your data
# Split your DataFrame into train and validation sets
train_df = df.sample(frac=0.8, random_state=42)  # 80% for training
valid_df = df.drop(train_df.index)  # Remaining 20% for validation

# Convert and save to text files
dataframe_to_fasttext(train_df, train_file)
dataframe_to_fasttext(valid_df, valid_file)

# Path of the pre-trained embeddings, kept for reference.
# The model is intentionally NOT loaded here: train_supervised() below builds
# a new classifier from train_file alone, so a model loaded into `model` at
# this point would be discarded immediately (after costing the time and memory
# of reading the file).
# NOTE(review): to actually initialise from pre-trained vectors, fastText's
# `pretrainedVectors` option expects a text-format .vec file rather than a
# .bin model - confirm against the fastText documentation before enabling.
pretrained_model_path = r"E:\Bangla-Sentiment-Analysis\Word Embeddings\cc.bn.300.bin"  

# Train a supervised classifier from scratch on the training file
model = fasttext.train_supervised(
    input=train_file,
    epoch=15,            # Number of epochs
    lr=0.75,              # Learning rate (consider trying lower values)
    wordNgrams=5,        # Use word n-grams up to length 5
    bucket=100000,       # Reduced bucket size for hashing
    dim=300,             # Vector dimension (same size as the cc.bn.300 vectors)
    loss='ova',      # Loss function (one-vs-all)
    verbose=2            # Verbosity level
)

# Save the trained classifier (relative path: written to the working directory)
model.save_model("fine_tuned_FastText_1.bin")

# Validate the model; test() returns (number of examples, precision, recall)
num_examples, precision, recall = model.test(valid_file)
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")

# Predict the sentiment of a new sentence
sentence = "খুবি ভাল মেশিন   ব্যবহার করার পর রিভিউ দিলাম	"
labels, probabilities = model.predict(sentence)
print(f"Predicted label: {labels[0]}")
print(f"Probability: {probabilities[0]:.4f}")


Validation Precision: 0.6714
Validation Recall: 0.6714
Predicted label: __label__Positive
Probability: 0.9914


In [18]:
# Sanity check: score a second, negatively worded review with the trained model
sentence = "খুবি ফালতু মেশিন   ব্যবহার করার পর রিভিউ দিলাম"
labels, probabilities = model.predict(sentence)

# Only the top-ranked prediction is reported
top_label, top_probability = labels[0], probabilities[0]
print(f"Predicted label: {top_label}")
print(f"Probability: {top_probability:.4f}")


Predicted label: __label__Negative
Probability: 0.8222


In [19]:
import os

# Print a fixed literal (the intended embeddings directory) followed by the
# actual current working directory. Only the second value is computed at
# runtime; the first is printed as-is for side-by-side comparison.
print(r"E:\Bangla-Sentiment-Analysis\Word Embeddings", os.getcwd())

E:\Bangla Sentiment Analysis Thesis\Word Embeddings C:\Users\Rizvi


In [20]:
# Save the trained classifier into the embeddings directory.
# `model` is the classifier returned by train_supervised(); `fasttext_model`
# is the unmodified pre-trained model loaded from cc.bn.300.bin. Saving the
# latter under this file name would store the original embeddings mislabelled
# as the fine-tuned model, so `model` is the object written here.
model_path = os.path.join(r"E:\Bangla-Sentiment-Analysis\Word Embeddings", "fine_tuned_FastText_1.bin")
model.save_model(model_path)