In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("sample_data.csv")

In [3]:
df = df.dropna()

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib

#lower casing the data
df['text'] = df['text'].str.lower()

#Converting text to vectors
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text'])

vectorizer_filename = "tfidf_vectorizer.pkl"
joblib.dump(vectorizer, vectorizer_filename)

#Encoding the labels to neumeric values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['label'])
encoder_filename = "l_encoder_vectorizer.pkl"
joblib.dump(label_encoder, encoder_filename)

#Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import joblib

# Defining different models to train
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(decision_function_shape='ovo'),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Training and evaluating each model
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {name}: {accuracy}")
    print(classification_report(y_test, y_pred))
    print("="*50)

    # Save the trained model using joblib
    model_filename = f"{name}_model.pkl"
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")

Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for Logistic Regression: 0.8065600215082672
              precision    recall  f1-score   support

           0       0.76      0.71      0.73       706
           1       0.84      0.72      0.77       513
           2       0.92      0.76      0.83      1022
           3       0.73      0.92      0.82      2281
           4       0.85      0.70      0.77      1009
           5       0.86      0.82      0.84      1908

    accuracy                           0.81      7439
   macro avg       0.83      0.77      0.79      7439
weighted avg       0.82      0.81      0.81      7439

Model saved as Logistic Regression_model.pkl
Training SVM...
Accuracy for SVM: 0.8204059685441591
              precision    recall  f1-score   support

           0       0.84      0.69      0.76       706
           1       0.85      0.78      0.81       513
           2       0.92      0.77      0.84      1022
           3       0.75      0.92      0.83      2281
           4       0.88      0.71  

In [4]:
import joblib
from sklearn.preprocessing import LabelEncoder


# Load your trained TF-IDF vectorizer
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Load your trained model
model = joblib.load('Gradient Boosting_model.pkl')

# Get input text from the user
text = input("Enter text: ")

# Transform the input text using the fitted vectorizer
new_text_features = vectorizer.transform([text])

# Make predictions
y_pred = model.predict(new_text_features)
encoder = joblib.load("l_encoder_vectorizer.pkl")
y_pred_text = encoder.inverse_transform(y_pred)
y_pred_text[0]

'mr'

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

# Assuming you have already preprocessed your dataset and split it into train and validation sets

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Define your training and validation datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_len=128)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_len=128)

# Define your BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

# Define training parameters
batch_size = 32
epochs = 5
learning_rate = 2e-5

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluation loop
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        correct_predictions = 0
        total_predictions = 0
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            _, predicted_labels = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(predicted_labels == labels).item()
            total_predictions += labels.size(0)

    # Calculate metrics
    val_loss /= len(val_loader)
    val_accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch+1}/{epochs}: Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")
