<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/app_review_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Query the Dataset with curl and Upload It to the Working Directory**


In [None]:
# Request the first 100 rows (offset=0, length=100) of the train split of the
# app_reviews dataset from the Hugging Face datasets-server; the response is
# only printed, nothing is saved.
!curl -X GET \
     "https://datasets-server.huggingface.co/rows?dataset=app_reviews&config=default&split=train&offset=0&length=100"

In [None]:
# Ask the datasets-server "splits" endpoint about the app_reviews dataset; the
# response is only printed, nothing is saved.
!curl -X GET \
     "https://datasets-server.huggingface.co/splits?dataset=app_reviews"

In [None]:
# Query the Hub API parquet endpoint for the default config / train split of
# app_reviews; the response is only printed.
# NOTE(review): no cell in this notebook writes '/content/0000.parquet', the
# file read in the Exploratory Data Analysis section - it has to be downloaded
# or uploaded separately before running the rest of the notebook.
!curl -X GET \
     "https://huggingface.co/api/datasets/app_reviews/parquet/default/train"

In [None]:
# Import necessary libraries and functions
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.metrics import Precision
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Shared objects reused by the preprocessing and modeling cells
precision_metric = Precision()  # metric object handed to model.compile
stemmer = PorterStemmer()  # stemmer used by text_cleaner
# Keep the stopwords in a set: text_cleaner tests membership once per token,
# and a set lookup is O(1) whereas scanning the original list is O(n).
English_stopwords = set(stopwords.words('english'))

**Exploratory Data Analysis**

In [None]:
# Load the parquet shard of the reviews into a pandas DataFrame
parquet_path = '/content/0000.parquet'
df = pd.read_parquet(parquet_path)
# Preview the first rows of the data
df.head()


In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple
df.shape

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Class balance: number of reviews for each distinct value of the 'star' column
df['star'].value_counts()

**Data Preprocessing**

In [16]:
# Oversample the star classes: duplicate randomly chosen reviews of a class
# until it reaches its target count, then append the duplicates to the
# original rows.
# NOTE(review): the duplicates are created before the train/test split further
# down, so copies of the same review can end up on both sides of that split
# and inflate the validation metrics; consider splitting first and
# oversampling only the training part.
features = df['review'].values
stars = df['star'].values

# Target number of reviews per star class. A class that already has at least
# this many rows is left untouched (rows are never removed).
# The low target for class 5 presumably just disables oversampling for an
# already large class - TODO confirm against the value_counts output.
desired_instances_per_class = {
    1: 50000,
    2: 50000,
    3: 50000,
    4: 50000,
    5: 1743,
}

# Dedicated, seeded generator so the same duplicates are drawn on every run
oversampling_rng = np.random.default_rng(42)

# Row positions (into features/stars) of the reviews that get duplicated
duplicated_index_chunks = []
for current_class, desired_count in desired_instances_per_class.items():
    # Positions of the rows that belong to the current class
    indices = np.where(stars == current_class)[0]
    num_to_duplicate = desired_count - len(indices)

    # Skip classes that are already large enough, and classes without any row
    # (there is nothing to sample from and choice() would raise).
    if num_to_duplicate > 0 and len(indices) > 0:
        duplicated_index_chunks.append(
            oversampling_rng.choice(indices, size=num_to_duplicate, replace=True)
        )

if duplicated_index_chunks:
    duplicated_indices = np.concatenate(duplicated_index_chunks)
else:
    duplicated_indices = np.empty(0, dtype=np.intp)

# Index the source arrays directly: this keeps their dtypes, whereas copying
# the strings through a Python list and np.array() would build a fixed-width
# unicode array sized to the longest review.
oversampled_features = features[duplicated_indices]
oversampled_stars = stars[duplicated_indices]

# Duplicates first, then the original rows, with a fresh 0..n-1 index
df_new_samples = pd.DataFrame({'review': oversampled_features, 'star': oversampled_stars})
data = pd.concat([df_new_samples, df[['review', 'star']]], ignore_index=True)

In [20]:
# Define a function that normalizes a raw review text
def text_cleaner(text):
  """Return a lowercased, stopword-free, stemmed version of a review text.

  Steps: lowercase, replace every character that is not a letter or a digit
  with a space, tokenize, drop English stopwords, stem the remaining tokens
  and join them with single spaces.

  Args:
    text: Raw review text. Values that are not strings (e.g. None or NaN for
      a missing review) are treated as empty text.

  Returns:
    The cleaned text as one space-separated string ('' if nothing is left).
  """
  if not isinstance(text, str):
    return ''
  text = text.lower()
  # Letters and digits are kept; everything else becomes a space
  text_with_no_punctuations = re.sub(r'[^a-zA-Z0-9]', ' ', text)
  tokens = word_tokenize(text_with_no_punctuations)
  # Stopwords are removed BEFORE stemming: the stopword list holds unstemmed
  # words, and the Porter stemmer rewrites several of them (e.g. "this" ->
  # "thi", "was" -> "wa", "very" -> "veri"), so filtering after stemming
  # would let those stopwords through.
  tokens_with_no_stopwords = [word for word in tokens if word not in English_stopwords]
  stemmed_tokens = [stemmer.stem(word) for word in tokens_with_no_stopwords]
  return ' '.join(stemmed_tokens)


In [21]:
# Helper that measures a text by its number of words
def count_words(text):
  """Return how many whitespace-separated words the text contains."""
  words = text.split()
  return len(words)

In [22]:
# Run every raw review through text_cleaner and store the result in a new column
data['cleaned_review'] = data['review'].map(text_cleaner)

In [23]:
# Word count of each cleaned review, stored in a new column
data['sentence_length'] = data['cleaned_review'].map(count_words)

In [None]:
# Summary statistics of the sentence_length column (word counts of the cleaned reviews)
data['sentence_length'].describe()

In [25]:
# Create the tokenizer; num_words=10000 caps the word indices it will emit
# when texts are converted to sequences
tokenizer_object = Tokenizer(
    num_words=10000,
)
# Learn the word index from the cleaned reviews
tokenizer_object.fit_on_texts(data['cleaned_review'])

In [None]:
# Size of the vocabulary the embedding layer has to cover.
# word_index lists every word seen during fitting, but texts_to_sequences only
# emits indices below num_words. When that cap is set, the embedding therefore
# needs at most num_words rows; sizing it from the full word_index would
# allocate rows for words that can never reach the model.
full_vocab_size = len(tokenizer_object.word_index) + 1  # +1 because index 0 is reserved
num_words_cap = tokenizer_object.num_words
vocab_size = full_vocab_size if num_words_cap is None else min(num_words_cap, full_vocab_size)
print(vocab_size)

In [33]:
# Build the model inputs and the labels as arrays
x = tokenizer_object.texts_to_sequences(data['cleaned_review'])  # one list of word ids per review
y = data['star'].values  # raw star ratings
# Bring every sequence to exactly 50 ids: shorter ones are padded at the end,
# longer ones are cut off at the end
padded_x = pad_sequences(
    x,
    maxlen=50,
    padding='post',
    truncating='post',
)

In [34]:
# Turn the star ratings into one-hot vectors for the softmax output layer.
# The ratings are read from data['star'] rather than from the previous value
# of `y`: this cell overwrites `y` with the one-hot matrix, so feeding `y`
# back into the encoder would break as soon as the cell is executed twice.
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(data['star'].values)  # ratings -> 0-based class ids
y = to_categorical(encoded_y)  # class ids -> one-hot rows

In [35]:
# Hold out 20% of the samples for testing; random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    padded_x,
    y,
    test_size=0.2,
    random_state=42,
)

**Modeling**

In [None]:
# Build the network: embedding -> LSTM -> two dense layers -> 5-way softmax.
# This cell only defines the architecture. Compiling and fitting happen in the
# next cell; doing them here as well trained the same model for two
# consecutive 20-epoch runs.
model = Sequential([
    # Maps each word id to a 50-dimensional vector; input_length matches the
    # padded review length of 50
    Embedding(input_dim=vocab_size, output_dim=50, input_length=50),
    # return_sequences=False: only the output of the last time step is passed on
    LSTM(125, return_sequences=False),
    Dense(200, activation='relu'),
    Dense(45, activation='relu'),
    # One probability per rating class
    Dense(5, activation='softmax'),
])

In [None]:
# Configure the optimizer, loss and metrics, then train for 20 epochs while
# evaluating on the held-out split after each epoch
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["acc", precision_metric],
)
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=64,
    epochs=20,
)

In [56]:
# Demo helper: predict the rating class of a single review
def rate_function(text):
    """Predict the rating class of one raw review text.

    The text goes through the same steps as the training data: text_cleaner,
    the fitted tokenizer, and padding/truncation at the end to 50 ids.

    Args:
        text: Raw review text.

    Returns:
        The index of the largest model output. This is the label-encoded
        class, not the star value itself (presumably star = index + 1 when
        all ratings 1-5 are present - confirm against encoder.classes_).
    """
    sequences = tokenizer_object.texts_to_sequences([text_cleaner(text)])
    model_input = pad_sequences(sequences, maxlen=50, padding='post', truncating='post')
    class_probabilities = model.predict(model_input)
    return np.argmax(class_probabilities)


In [64]:
# Try the demo helper on a sample review; the displayed value is the class
# index returned by rate_function
text="very restricted app with poor resources."
rate_function(text)



0

In [None]:
# Persist the trained model to the file "model.h5" (relative path, so it is
# written to the current working directory)
model.save("model.h5")

In [66]:
import pickle
# Serialize the fitted tokenizer with pickle so the same word index can be
# reloaded together with the saved model
with open('tokenizer_object.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer_object, tokenizer_file)