<a href="https://colab.research.google.com/github/Jan-HeinKok/machinelearning2/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers datasets scikit-learn torch

import pandas as pd

# Read the car listings CSV into a DataFrame.
df = pd.read_csv('/content/quikr_car.csv')

# Sanity check: show the leading rows first, then the column index.
for preview in (df.head(), df.columns):
    print(preview)


                                     name   company  year          Price  \
0    Hyundai Santro Xing XO eRLX Euro III   Hyundai  2007         80,000   
1                 Mahindra Jeep CL550 MDI  Mahindra  2006       4,25,000   
2              Maruti Suzuki Alto 800 Vxi    Maruti  2018  Ask For Price   
3  Hyundai Grand i10 Magna 1.2 Kappa VTVT   Hyundai  2014       3,25,000   
4        Ford EcoSport Titanium 1.5L TDCi      Ford  2014       5,75,000   

   kms_driven fuel_type  
0  45,000 kms    Petrol  
1      40 kms    Diesel  
2  22,000 kms    Petrol  
3  28,000 kms    Petrol  
4  36,000 kms    Diesel  
Index(['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type'], dtype='object')


In [None]:
# Install necessary libraries
!pip install transformers datasets

# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Path of the listings CSV (presumably the Colab working directory -- confirm).
file_path = '/content/quikr_car.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Preprocess data
def preprocess_data(df):
    """Clean the raw car listings and attach a price-category label.

    Keeps the ``name``, ``company``, ``year``, ``kms_driven``, ``fuel_type``
    and ``Price`` columns, drops rows with missing values or a non-numeric
    price (for example ``'Ask For Price'``), converts ``Price`` to a number
    and bins it into ``'cheap'`` (< 300000), ``'affordable'``
    (300000 to < 600000) and ``'expensive'`` (>= 600000).

    Args:
        df: Raw listings DataFrame containing the six columns listed above.

    Returns:
        A new, randomly shuffled DataFrame with a fresh 0..n-1 index and an
        additional categorical ``label`` column. The input is not modified.
    """
    columns = ['name', 'company', 'year', 'kms_driven', 'fuel_type', 'Price']
    df = df[columns].dropna()

    # Strip thousands separators. astype(str) keeps this working when the
    # column was already parsed as numbers; anything still non-numeric
    # (such as 'Ask For Price') becomes NaN and is dropped instead of raising.
    price_text = df['Price'].astype(str).str.replace(',', '', regex=False)
    price = pd.to_numeric(price_text, errors='coerce').astype(float)
    # assign() returns a new frame, so no write ever lands on a slice of the
    # caller's data (the original triggered SettingWithCopyWarning here).
    df = df.assign(Price=price).dropna(subset=['Price'])

    bins = [0, 300000, 600000, float('inf')]
    labels = ['cheap', 'affordable', 'expensive']
    # right=False makes every bin closed on the left: [0, 300000), ...
    df = df.assign(label=pd.cut(df['Price'], bins=bins, labels=labels, right=False))
    # Prices outside all bins (negative values) have no label; drop them.
    df = df.dropna(subset=['label'])
    # Shuffle the rows; unseeded, so the order differs from run to run.
    df = df.sample(frac=1).reset_index(drop=True)

    return df

# Clean the raw listings and derive the price-category column.
df = preprocess_data(df)

# Swap the category names for the integer ids assigned by LabelEncoder.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])

# Hold out 20% of the rows for validation; the car name is the model input.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    df['name'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Pre-trained tokenizer plus a BERT classifier with a three-way output head.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

# Encode both splits with identical tokenizer options.
tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(list(train_texts), **tokenize_options)
val_encodings = tokenizer(list(val_texts), **tokenize_options)

# Wrap the encoded inputs and their labels as datasets.Dataset objects.
encoded_fields = ('input_ids', 'attention_mask')

train_columns = {field: train_encodings[field] for field in encoded_fields}
train_columns['labels'] = train_labels
train_dataset = Dataset.from_dict(train_columns)

val_columns = {field: val_encodings[field] for field in encoded_fields}
val_columns['labels'] = val_labels
val_dataset = Dataset.from_dict(val_columns)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Increase the number of epochs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Warm up over the first 10% of the optimizer steps. The previous fixed
    # warmup_steps=500 was longer than the whole run (roughly 410 steps,
    # judging by the logged evaluation size and batch size 8), so the
    # learning rate was still ramping up when training ended.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    # Log the training loss often enough to appear in the per-epoch table;
    # the default interval exceeded the run length ("No log" in the output).
    logging_steps=10,
    learning_rate=2e-5,  # Experiment with different learning rates
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy='epoch'
)

# Define data collator (pads each batch to the longest sequence in it)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define compute metrics function
def compute_metrics(p):
    """Compute evaluation metrics for the ``Trainer``.

    Args:
        p: Prediction object exposing ``predictions`` (per-class scores with
            the class axis last) and ``label_ids`` (true class ids).

    Returns:
        Dict with ``'accuracy'`` plus support-weighted ``'precision'``,
        ``'recall'`` and ``'f1'``.
    """
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting the UndefinedMetricWarning seen in the captured output; the
    # resulting numbers are the same as with the default setting.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Collect everything the Trainer needs, then build it in one call.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

# Fine-tune the classifier.
trainer.train()

# Score the fine-tuned model on the validation split and report the metrics.
eval_results = trainer.evaluate()
print(f"BERT Model Evaluation Results: {eval_results}")

# Define and train a baseline model for comparison
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Bag-of-words features: learn the vocabulary on the training names only,
# then encode both splits with it.
vectorizer = CountVectorizer()
vectorizer.fit(train_texts)
X_train = vectorizer.transform(train_texts)
X_val = vectorizer.transform(val_texts)

# Fit the Naive Bayes baseline (fit() returns the estimator itself).
baseline_model = MultinomialNB().fit(X_train, train_labels)

# Score the baseline on the validation split.
baseline_preds = baseline_model.predict(X_val)
baseline_accuracy = accuracy_score(val_labels, baseline_preds)
baseline_scores = precision_recall_fscore_support(val_labels, baseline_preds, average='weighted')
baseline_precision, baseline_recall, baseline_f1 = baseline_scores[:3]

print(f"Baseline Model Accuracy: {baseline_accuracy}")
print(f"Baseline Model Precision: {baseline_precision}")
print(f"Baseline Model Recall: {baseline_recall}")
print(f"Baseline Model F1 Score: {baseline_f1}")




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.068308,0.481707,0.370593,0.481707,0.418905
2,No log,0.947947,0.512195,0.597487,0.512195,0.425761
3,No log,0.829162,0.628049,0.713229,0.628049,0.616006
4,No log,0.76179,0.664634,0.684319,0.664634,0.671555
5,No log,0.720994,0.707317,0.730911,0.707317,0.711352


  _warn_prf(average, modifier, msg_start, len(result))


BERT Model Evaluation Results: {'eval_loss': 0.7209937572479248, 'eval_accuracy': 0.7073170731707317, 'eval_precision': 0.7309110641457703, 'eval_recall': 0.7073170731707317, 'eval_f1': 0.7113515415402207, 'eval_runtime': 0.2883, 'eval_samples_per_second': 568.833, 'eval_steps_per_second': 72.838, 'epoch': 5.0}
Baseline Model Accuracy: 0.6829268292682927
Baseline Model Precision: 0.687831110249377
Baseline Model Recall: 0.6829268292682927
Baseline Model F1 Score: 0.6787266640361045


In [None]:
# Install necessary libraries. This cell fine-tunes BERT again and trains a logistic-regression baseline model to compare with the fine-tuned BERT model above.
!pip install transformers datasets scikit-learn torch

import pandas as pd
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

# Read the car listings CSV into a DataFrame.
df = pd.read_csv('/content/quikr_car.csv')

# Show the column index first, then the leading rows.
for preview in (df.columns, df.head()):
    print(preview)

# Define preprocessing function
def preprocess_data(df):
    """Clean the raw car listings and attach a price-category label.

    Keeps the ``name``, ``company``, ``year``, ``kms_driven``, ``fuel_type``
    and ``Price`` columns, drops rows with missing values, strips the rupee
    sign and thousands separators from ``Price`` and converts it to a number.
    Rows whose price is not numeric (for example ``'Ask For Price'``) are
    dropped, so only the three classes below are ever produced:
    ``'cheap'`` (< 300000), ``'affordable'`` (300000 to < 600000) and
    ``'expensive'`` (>= 600000).

    Args:
        df: Raw listings DataFrame. Column names are normalised through
            ``column_mapping`` before the six columns are selected.

    Returns:
        A new, randomly shuffled DataFrame with a fresh 0..n-1 index and an
        additional categorical ``label`` column. The input is not modified.
    """
    # Adjust the keys of this mapping if the columns are named differently
    # in your CSV file; the values are the names used by the rest of the code.
    column_mapping = {
        'name': 'name',
        'company': 'company',
        'year': 'year',
        'kms_driven': 'kms_driven',
        'fuel_type': 'fuel_type',
        'Price': 'Price',
    }
    columns = ['name', 'company', 'year', 'kms_driven', 'fuel_type', 'Price']
    df = df.rename(columns=column_mapping)[columns].dropna()

    # astype(str) keeps the cleanup working when the column was already
    # parsed as numbers. Non-numeric prices become NaN below; the former
    # intermediate 'neutral' placeholder was coerced to NaN as well, so
    # removing it does not change the result.
    price_text = (
        df['Price']
        .astype(str)
        .str.replace('₹', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    # assign() returns a new frame, so no write lands on a slice of the
    # caller's data (avoids SettingWithCopyWarning).
    df = df.assign(Price=pd.to_numeric(price_text, errors='coerce'))

    # Drop rows where 'Price' could not be parsed
    df = df.dropna(subset=['Price'])

    bins = [0, 300000, 600000, float('inf')]
    labels = ['cheap', 'affordable', 'expensive']
    # right=False makes every bin closed on the left: [0, 300000), ...
    df = df.assign(label=pd.cut(df['Price'], bins=bins, labels=labels, right=False))
    df = df.dropna(subset=['label'])
    # Shuffle the rows; unseeded, so the order differs from run to run.
    df = df.sample(frac=1).reset_index(drop=True)
    return df

df = preprocess_data(df)

# Encode labels. preprocess_data drops every row without a numeric price, so
# only these three classes can occur; the former 'neutral' entry was never used.
label_mapping = {'cheap': 0, 'affordable': 1, 'expensive': 2}
# .map() on the categorical column can leave a categorical dtype; cast to
# plain integers so downstream libraries receive ordinary class ids.
df['label'] = df['label'].map(label_mapping).astype(int)

# Split the data (seeded so the BERT and baseline comparison is reproducible)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['name'], df['label'], test_size=0.2, random_state=42
)

# Fine-tune BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# padding=True pads every text of a split to that split's longest sequence.
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)

# Labels are passed as plain Python lists so the dataset does not depend on
# the pandas dtype or index of the label Series.
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels.tolist(),
})
val_dataset = Dataset.from_dict({
    'input_ids': val_encodings['input_ids'],
    'attention_mask': val_encodings['attention_mask'],
    'labels': val_labels.tolist(),
})

# preprocess_data drops every row without a numeric price, so only the three
# price classes (ids 0-2) ever reach the model; a fourth output unit would
# never receive a positive training example.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Training configuration for the BERT fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Warm up over the first 10% of the optimizer steps. The previous fixed
    # warmup_steps=500 was longer than the whole run (roughly 250 steps,
    # judging by the logged evaluation size and batch size 8), so the
    # learning rate never reached its configured value.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch"
)

def compute_metrics(pred):
    """Compute evaluation metrics for the ``Trainer``.

    Args:
        pred: Prediction object exposing ``predictions`` (per-class scores
            with the class axis last) and ``label_ids`` (true class ids).

    Returns:
        Dict with ``'accuracy'`` plus support-weighted ``'precision'``,
        ``'recall'`` and ``'f1'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 scores a class that is never predicted as 0 without
    # emitting the UndefinedMetricWarning seen in the captured output; the
    # resulting numbers are the same as with the default setting.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Collect everything the Trainer needs, then build it in one call.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

# Fine-tune, then persist the resulting model to disk.
trainer.train()
trainer.save_model('./fine_tuned_bert')

# Score the fine-tuned BERT model on the validation split.
results = trainer.evaluate()
print("BERT Model Evaluation Results:", results)

# Logistic-regression baseline on bag-of-words features: learn the vocabulary
# on the training names only, then encode both splits with it.
vectorizer = CountVectorizer()
vectorizer.fit(train_texts)
X_train = vectorizer.transform(train_texts)
X_val = vectorizer.transform(val_texts)

# fit() returns the estimator itself, so construction and fitting chain.
baseline_model = LogisticRegression(max_iter=1000).fit(X_train, train_labels)

# Score the baseline on the validation split.
baseline_preds = baseline_model.predict(X_val)
baseline_accuracy = accuracy_score(val_labels, baseline_preds)
baseline_scores = precision_recall_fscore_support(val_labels, baseline_preds, average='weighted')
baseline_precision, baseline_recall, baseline_f1 = baseline_scores[:3]

print(f"Baseline Model Accuracy: {baseline_accuracy}")
print(f"Baseline Model Precision: {baseline_precision}")
print(f"Baseline Model Recall: {baseline_recall}")
print(f"Baseline Model F1 Score: {baseline_f1}")


Index(['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type'], dtype='object')
                                     name   company  year          Price  \
0    Hyundai Santro Xing XO eRLX Euro III   Hyundai  2007         80,000   
1                 Mahindra Jeep CL550 MDI  Mahindra  2006       4,25,000   
2              Maruti Suzuki Alto 800 Vxi    Maruti  2018  Ask For Price   
3  Hyundai Grand i10 Magna 1.2 Kappa VTVT   Hyundai  2014       3,25,000   
4        Ford EcoSport Titanium 1.5L TDCi      Ford  2014       5,75,000   

   kms_driven fuel_type  
0  45,000 kms    Petrol  
1      40 kms    Diesel  
2  22,000 kms    Petrol  
3  28,000 kms    Petrol  
4  36,000 kms    Diesel  


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0478,1.051968,0.493902,0.440165,0.493902,0.369756
2,0.952,0.80839,0.695122,0.703148,0.695122,0.651291
3,0.6303,0.815598,0.70122,0.705927,0.70122,0.679311


  _warn_prf(average, modifier, msg_start, len(result))


BERT Model Evaluation Results: {'eval_loss': 0.8155977129936218, 'eval_accuracy': 0.7012195121951219, 'eval_precision': 0.7059267229397277, 'eval_recall': 0.7012195121951219, 'eval_f1': 0.6793107442933226, 'eval_runtime': 0.2964, 'eval_samples_per_second': 553.393, 'eval_steps_per_second': 37.118, 'epoch': 3.0}
Baseline Model Accuracy: 0.774390243902439
Baseline Model Precision: 0.7734409899044045
Baseline Model Recall: 0.774390243902439
Baseline Model F1 Score: 0.7678332788267895
