In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import tensorflow as tf

2024-08-05 17:04:28.572758: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load data from X and Y1 files
data = pd.read_csv("X.txt", sep="\t", header=None, names=["text"])
data["label"] = pd.read_csv("YL1.txt", sep="\t", header=None)

data = data[:200]

In [4]:
# Preprocess text data
import nltk
nltk.download('stopwords')
def preprocess_text(text):
  """
  Preprocesses text data for better feature extraction.
  """
  text = text.lower()  # Lowercase text
  text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
  stop_words = stopwords.words('english')  # Load stopwords
  text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
  stemmer = SnowballStemmer("english")  # Initialize stemmer
  text = stemmer.stem(text)  # Apply stemming
  return text

data["text"] = data["text"].apply(preprocess_text)  # Apply preprocessing to all text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["label"], test_size=0.2, random_state=42)

In [6]:
# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [7]:
# Train a Logistic Regression model
classifier = LogisticRegression(solver="lbfgs", C=1.0)
classifier.fit(X_train_features, y_train)

In [8]:
# Evaluate model performance on test set
y_pred = classifier.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")

Accuracy on test set: 0.5250


In [9]:
# Tune the regularization parameter (C)
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train_features, y_train)
print(f"Best parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_



Best parameters: {'C': 100}


In [10]:
# Evaluate performance with the best model
y_pred = best_model.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with best parameters: {accuracy:.4f}")

Accuracy with best parameters: 0.7250


In [11]:
from transformers import TFDistilBertModel

# Load the pre-trained model
model_name = "distilbert-base-uncased"
bert_model = TFDistilBertModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
2024-08-05 17:06:02.885805: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from t

In [12]:
# Define the classification head
from transformers import BertTokenizer, TFDistilBertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained(model_name)

num_labels = len(data['label'].unique())  # Assuming unique labels represent number of classes

model = TFDistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from 

In [13]:
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

# Training the model (fine-tuning)
optimizer = Adam(learning_rate=2e-5)
loss = CategoricalCrossentropy()

In [14]:
# Feature extraction using TF-IDF with max 500 features
vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [1]:
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.fit(X_train_features.toarray().astype(int), tf.keras.utils.to_categorical(y_train), batch_size=batch_size, epochs=epochs, verbose=1)

NameError: name 'model' is not defined

In [None]:
model.save('my_distilbert_model', save_format="tf")



2024-08-05 16:35:21.343254: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,768]
	 [[{{node inputs}}]]
2024-08-05 16:35:34.905644: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,768]
	 [[{{node inputs}}]]


In [91]:
# Evaluate performance on test set
encoded_test_input = tokenizer(X_test.tolist(), truncation=True, padding='max_length', return_tensors='tf')
y_pred = model.predict(encoded_test_input['input_ids'])




In [96]:
import numpy as np
y_pred_new = np.argmax(y_pred.logits, axis=1)

accuracy = accuracy_score(y_test, y_pred_new)
print(f"Accuracy on test set: {accuracy:.4f}")

print(num_labels)

Accuracy on test set: 0.0750
7


In [103]:
department_dict = {0: 'CS',1:'ECE',2:'Psychology',3:'MAE',4:'Civil',5:'Medical',6:'Biochemistry'}

# Predict labels for new documents
new_document = '''Social delves into the fascinating world of how our thoughts,
feelings, and behaviors are shaped by the presence of others. Imagine a group of friends discussing a controversial
movie. One person expresses a strong dislike, and others, eager for social approval, might subtly shift their
opinions to align with the initial critic. This phenomenon, known as conformity, highlights the powerful influence
of social pressure on individual judgment.'''

"""new_document = '''The rhythmic clang of a riveter echoed across the construction site as civil engineers
meticulously reviewed blueprints. Their keen eyes scanned the intricate network of steel beams destined 
to become a towering bridge. Precise calculations ensured the structure could withstand the weight of traffic
and the relentless push of wind. Below, a team meticulously poured concrete for the bridge's foundation,
their work a testament to the unseen yet crucial role civil engineering plays in shaping the world's 
infrastructure.'''"""

new_document_features = vectorizer.transform([new_document])
predicted_label = best_model.predict(new_document_features)[0]
print(f"Predicted label for the new document: {department_dict[predicted_label]}")

ValueError: X has 500 features, but LogisticRegression is expecting 10000 features as input.