In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Load the IMDb dataset or replace it with your own dataset
df = pd.read_csv('IMDB.csv')

In [2]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Split the dataset into features (X) and target (y)
X = df['review'].values
y = df['sentiment'].values

# Preprocess the text data using TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(X)


In [4]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train the RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf_clf.fit(X_train, y_train)
# Make predictions on the test set
y_pred = rf_clf.predict(X_test)

In [5]:
# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
print(classification_report(y_test, y_pred))


Accuracy: 0.6521739130434783
F1 Score: 0.6172360248447205
              precision    recall  f1-score   support

    negative       0.65      0.89      0.75        27
    positive       0.67      0.32      0.43        19

    accuracy                           0.65        46
   macro avg       0.66      0.60      0.59        46
weighted avg       0.66      0.65      0.62        46



In [6]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [8]:

# Load the IMDb dataset or replace it with your own dataset
df = pd.read_csv('IMDB.csv')

# Split the dataset into features (X) and target (y)
X = df['review'].values
y = df['sentiment'].values

# Encode the target variable into numerical labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Preprocess the text data using TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(X)


In [9]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train the XGBClassifier
xgb_clf = XGBClassifier(objective='binary:logistic', eval_metric='logloss', max_depth=6, learning_rate=0.1, n_estimators=100, n_jobs=-1, random_state=42)
xgb_clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_clf.predict(X_test)


In [10]:

# Decode the numerical labels back to original labels if needed
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 Score: {f1_score(y_test, y_pred, average='weighted')}")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.6739130434782609
F1 Score: 0.6685191238966981
              precision    recall  f1-score   support

    negative       0.70      0.78      0.74        27
    positive       0.62      0.53      0.57        19

    accuracy                           0.67        46
   macro avg       0.66      0.65      0.65        46
weighted avg       0.67      0.67      0.67        46



In [11]:
import numpy as np
from keras.utils import to_categorical
from keras import models
from keras import layers
from keras.datasets import imdb

# Load the IMDB dataset
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)

# Concatenate the training and testing data and targets
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

# Vectorize the data
def vectorize(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

data = vectorize(data)
targets = np.array(targets).astype("float32")

# Split the data into training and testing sets
test_x = data[:10000]
test_y = targets[:10000]
train_x = data[10000:]
train_y = targets[10000:]

# Create the model
model = models.Sequential()
model.add(layers.Dense(50, activation="relu", input_shape=(10000,)))
model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

# Compile the model
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train the model
results = model.fit(
    train_x,
    train_y,
    epochs=10,
    batch_size=500,
    validation_data=(test_x, test_y),
)

# Evaluate the model
print("accuracy:", np.mean(results.history['val_accuracy']))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
accuracy: 0.8859099924564362
