In [13]:
!pip install datasets



In [14]:
# Step 1: Data Loading
from datasets import load_dataset

# Fetch the IMDB movie-review corpus; the returned object is indexed by split
# name in the cells that follow.
dataset = load_dataset(path="imdb")

# Print a summary of the available splits, their features, and row counts.
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [15]:
# Split the dataset into training, validation, and test sets.
# The official train split is used unchanged for training.
train_data = dataset['train']

# Halve the official test split ONCE and take both halves from that single
# result. train_test_split shuffles by default, so calling it separately for
# each half draws two unrelated shuffles, which lets the same review land in
# both validation and test. The fixed seed keeps the partition identical
# across kernel restarts.
heldout_split = dataset['test'].train_test_split(test_size=0.5, seed=42)
validation_data = heldout_split['train']
test_data = heldout_split['test']


In [16]:
# Step 2: Data Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer and fit it on the training text only, producing
# the training feature matrix in the same call.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(train_data['text'])

# Reuse the fitted vectorizer (transform only, no refit) for the validation
# and test text.
X_val, X_test = (
    tfidf_vectorizer.transform(split['text'])
    for split in (validation_data, test_data)
)

# Collect the label column of each split.
y_train, y_val, y_test = (
    split['label']
    for split in (train_data, validation_data, test_data)
)


In [12]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Save the TF-IDF vectorizer that the preprocessing cell already fitted on the
# training text. Building and fitting a second vectorizer here would repeat
# the fit over the whole training set and rebind X_train for no benefit, so
# this cell only serializes the existing `tfidf_vectorizer` object.
# Requires the preprocessing cell to have been run first.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

In [17]:
# Step 3: Model Training
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with the library's default settings.
model = LogisticRegression()

# Fit on the TF-IDF training matrix and its labels; left as the cell's last
# expression so the fitted estimator is displayed.
model.fit(X=X_train, y=y_train)


In [18]:
# Step 4: Model Evaluation
from sklearn.metrics import accuracy_score, f1_score

# Run the trained classifier over the test feature matrix.
y_pred = model.predict(X_test)

# Score the predictions against the test labels: plain accuracy, plus F1
# averaged across classes with the 'weighted' scheme.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')

# Report both metrics.
print("Accuracy:", accuracy)
print("F1 Score:", f1)


Accuracy: 0.8816
F1 Score: 0.8816001636792146


In [19]:
# Step 5: Model Deployment with Pickle
import pickle

# Serialize the trained classifier to disk in binary mode so it can be
# reloaded later without retraining.
with open('text_classification_model.pkl', 'wb') as f:
    pickle.Pickler(f).dump(model)
