In [2]:
import pandas as pd
import numpy as np

# Import Plotting Libararies
import seaborn as sns
import matplotlib.pyplot as plt

# Import Data Preprocessing Libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


# Model Evaluation Libraries
from sklearn.metrics import classification_report, confusion_matrix 

In [3]:
df = pd.read_csv("../DataSets/IMDB_Urdu_Reviews/preprocessed.csv")

In [4]:
df.head()

Unnamed: 0,lemmatized_text,encoded_sentiments,tokens
0,دہائی وسط کیبل گائیڈ اسکائینجر ہنٹ پہلو اپیل ع...,1,"['دہائی', 'وسط', 'کیبل', 'گائیڈ', 'اسکائینجر',..."
1,دہائی انسپکٹر گیجٹ کارٹون پسند فلم دیکھنے پیسہ...,0,"['دہائی', 'انسپکٹر', 'گیجٹ', 'کارٹون', 'پسند',..."
2,معاشرے حالت تعجب والد پیدا البرٹ ٹی فٹزجیرالڈ ...,1,"['معاشرے', 'حالت', 'تعجب', 'والد', 'پیدا', 'ال..."
3,مفید البرٹ پیون ردی ٹوکری گریڈ زیڈ جلدی ٹم تھا...,0,"['مفید', 'البرٹ', 'پیون', 'ردی', 'ٹوکری', 'گری..."
4,کولمبو ہدایتکاری کیریئر ابتدائی وقت اسٹیون اسپ...,1,"['کولمبو', 'ہدایتکاری', 'کیریئر', 'ابتدائی', '..."


In [5]:
df[["lemmatized_text", "encoded_sentiments", "tokens"]].describe()

Unnamed: 0,encoded_sentiments
count,50000.0
mean,0.5
std,0.500005
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


# Vectorization

## Word2Vec Model

In [6]:
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [7]:
model_word2vec = Word2Vec(sentences=df["tokens"], vector_size=128, window=5, workers=10, min_count=1)

In [8]:
word_vectors = model_word2vec.wv
VOCAB_SIZE = len(word_vectors)
DIMENSIONS = word_vectors.vector_size

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.tokens)
encoded = tokenizer.texts_to_sequences(df.tokens)

In [13]:
MAX_LEN = max(map(len, encoded))
vectors = pad_sequences(encoded, padding='post', maxlen=MAX_LEN)

In [14]:
words2vec_matrix = np.zeros((VOCAB_SIZE, DIMENSIONS))
word_index = tokenizer.word_index
for word, index in word_index.items():
    if word in word_vectors:
        words2vec_matrix[index] = word_vectors[word]

In [29]:
vectors[746].shape

(1401,)

In [16]:
print(VOCAB_SIZE, DIMENSIONS, MAX_LEN)

197 128 1401


In [16]:
file = "word2vec-VS-"+str(VOCAB_SIZE)+"-D-"+str(DIMENSIONS)+"-ML-"+str(MAX_LEN)+".npz"

In [17]:
np.savez(file, sequences=vectors, labels=df.encoded_sentiments)

In [18]:
loaded_data = np.load(file)
X = loaded_data["sequences"]
Y = loaded_data["labels"]

In [19]:
X

array([[  127,   857,  1718, ...,     0,     0,     0],
       [  127,  3077,  4125, ...,     0,     0,     0],
       [  843,   892,  1732, ...,     0,     0,     0],
       ...,
       [    1,   472,   208, ...,     0,     0,     0],
       [   68,   298,     1, ...,     0,     0,     0],
       [15035,  3680,    12, ...,     0,     0,     0]])

## FastText

In [None]:
# import fasttext.util
# fasttext.util.download_model('en', if_exists='ignore')  # English
# ft = fasttext.load_model('cc.en.300.bin')
# sentences = [sentence_tokenizer(text) for text in df['Review']]

In [19]:
# ftModel = FastText(df.tokens, vector_size=100, window=5, min_count=5, epochs=10)
# from keras.preprocessing.text import Tokenizer
# token = Tokenizer()
# token.fit_on_texts(df["tokens"])
# encoded = token.texts_to_sequences(df["tokens"])

In [22]:
# VOCAB_SIZE = len(ftModel.wv.key_to_index)
# DIMENSIONS = 128
# MAX_LEN = max([len(x) for x in df["tokens"]])

In [20]:
# ft_matrix = np.zeros((VOCAB_SIZE+1,DIMENSIONS))
# for word, index in token.word_index.items():
#     try:
#         ft_matrix[index] = ftModel.wv[word]
#     except:
#         print(index, word)

In [21]:
# from tensorflow.keras import preprocessing 
# train_vectors = preprocessing.sequence.pad_sequences(encoded,padding='post',dtype=int)

In [22]:
# loaded_data = np.load("word2Vec.npz")
# X = loaded_data["sequences"]
# Y = loaded_data["labels"]

## TF-IDF

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
vectorizer = TfidfVectorizer()

In [38]:
try:
    # Fit the vectorizer on the text data
    tfidf_vectors = vectorizer.fit_transform(df["tokens"])

    # Handle any specific exception if required
except Exception as e:
    print("An error occurred during vectorization:", str(e))

In [60]:
arr = tfidf_vectors.toarray()

MemoryError: Unable to allocate 37.9 GiB for an array with shape (50000, 101752) and data type float64

In [44]:
import joblib

In [49]:
joblib.dump((tfidf_vectors, df.encoded_sentiments), 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [54]:
vectorizer, labels = joblib.load('tfidf_vectorizer.pkl')

## N-gram

In [153]:
from sklearn.feature_extraction.text import CountVectorizer

In [155]:
vectorizer = CountVectorizer(ngram_range=(1, 2))# bigram (2-gram)

# Fit and transform the data
X = vectorizer.fit_transform(tokenized_texts)

In [165]:
with open("ngram_vectors.pkl", "wb") as file:
    pickle.dump((X, sentiment_labels), file)

In [166]:
with open("ngram_vectors.pkl", "rb") as file:
    X_loaded, sentiment_labels = pickle.load(file)