In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
# Read the review corpus from a CSV file located in the working directory.
# NOTE(review): the file name and the Arabic stop-word list used further down
# suggest these are Arabic reviews rather than IMDB data; the download source
# is not recorded in this notebook — TODO add the link.
REVIEWS_CSV = "Ar_review10k1.csv"
data = pd.read_csv(REVIEWS_CSV)

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK resources this notebook relies on (stop-word lists, WordNet).
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Module-level helpers shared by preprocess_review.
tokenizer = RegexpTokenizer(r'\w+')  # a token is a run of word characters
# NOTE(review): WordNet is an English lexicon, so Arabic tokens presumably come
# back from lemmatize() unchanged — confirm, and consider an Arabic stemmer.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('arabic'))  # set for fast membership tests

def preprocess_review(review):
    """Normalize one review into a space-separated string of cleaned tokens.

    Steps: lowercase the text, tokenize it with the module-level
    ``tokenizer``, drop tokens present in ``stop_words``, lemmatize every
    remaining token with ``lemmatizer`` and join the result with single
    spaces.

    Args:
        review: Raw review text. Non-string values (for example the float NaN
            pandas produces for an empty CSV cell, or ``None``) are treated
            as an empty document.

    Returns:
        str: The cleaned review; ``''`` when the input is not a string or no
        token survives filtering.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); map them to an
    # empty document instead of letting .apply() die with AttributeError.
    if not isinstance(review, str):
        return ''

    tokens = tokenizer.tokenize(review.lower())

    # Filter stop words first so only the kept tokens are lemmatized.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    )

# Run every review in the 'text' column through preprocess_review and store
# the cleaned strings back into the same column.
cleaned_text = data['text'].apply(preprocess_review)
data['text'] = cleaned_text


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyError: 'review'

In [None]:

# Split the dataset into training and test sets (20% held out, fixed seed so
# the split is reproducible).
# Features come from the 'text' column, which is the one the preprocessing
# step cleans; the frame has no 'review' column (see the recorded
# "KeyError: 'review'" output).
# NOTE(review): 'sentiment' is assumed to be the label column — confirm
# against the CSV header.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [None]:

# Create a CountVectorizer to convert the text data into bag-of-words count
# vectors.
# No stop_words argument: this notebook filters stop words with NLTK's Arabic
# list during preprocessing, and scikit-learn's built-in 'english' list does
# not apply to that corpus.
vectorizer = CountVectorizer()

# Learn the vocabulary from the training split only, then encode the test
# split with that same vocabulary so no test-set information leaks into it.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [None]:

# Build a Multinomial Naive Bayes classifier and fit it on the vectorized
# training split in one step (fit() returns the estimator itself).
clf = MultinomialNB().fit(X_train_vectors, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
clf



In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Predict labels for the held-out test vectors.
y_pred = clf.predict(X_test_vectors)

# Score the predictions against the true test labels with each metric.
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

# Report the three scores, one per line.
for label, value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(label, value)
