# Company Focus Classification (B2C or B2B)

In [1]:
import os

# Run from the project root so the relative "data/..." paths used by later
# cells resolve. The root can be overridden with the PROJECT_ROOT environment
# variable; when it is unset, the original hard-coded location is used.
os.chdir(os.environ.get(
    'PROJECT_ROOT',
    '/Users/janlinzner/Projects/Master-Thesis-Spatial-Proximity-Venture-Capital'
))

In [2]:
from datasets import load_dataset
from datasets import DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import pandas as pd

In [3]:
# Two CSV inputs:
#   "train"   - labeled rows; provides "Description" and "B2B Binary"
#   "predict" - rows to run prediction on; provides "Description"
data_files = dict(
    train="data/business_orientation/companies_business_focus_save.csv",
    predict="data/business_orientation/companies_business_focus.csv",
)

ds = load_dataset("csv", data_files=data_files)

# Rename to the generic "text" / "label" column names used by the
# preprocessing cells.
ds["train"] = (
    ds["train"]
    .rename_column("Description", "text")
    .rename_column("B2B Binary", "label")
)

ds["predict"] = ds["predict"].rename_column("Description", "text")

In [4]:
# Base checkpoint to fine-tune. The sequence-classification model is created
# with two output labels.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def preprocess_train(batch):
    """Tokenize a batch of labeled examples.

    Reads ``batch["text"]`` and ``batch["label"]``. The texts are passed to the
    module-level ``tokenizer`` with truncation and padding to a fixed length of
    128, and the labels are attached to the tokenizer output under ``"labels"``.

    Returns the tokenizer output with the added ``"labels"`` entry.
    """
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=128)
    encoded = tokenizer(batch["text"], **tokenizer_options)
    encoded["labels"] = batch["label"]
    return encoded

# Tokenize the labeled set and drop the raw CSV columns listed below.
raw_train_columns = ["Organization Name", "Organization Name URL", "text", "label"]
ds["train"] = ds["train"].map(
    preprocess_train,
    batched=True,
    remove_columns=raw_train_columns,
)

# Hold out 20% of the labeled rows for validation (fixed seed), and carry the
# prediction set over unchanged.
train_test_split = ds["train"].train_test_split(test_size=0.2, seed=42)
ds = DatasetDict(
    {
        "train": train_test_split["train"],
        "validation": train_test_split["test"],
        "predict": ds["predict"],
    }
)

In [6]:
def preprocess_predict(batch):
    """Tokenize a batch of unlabeled examples.

    Passes ``batch["text"]`` to the module-level ``tokenizer`` with truncation
    and padding to a fixed length of 128, and returns the tokenizer output
    as-is (no labels are attached).
    """
    tokenizer_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize the prediction set; only the raw "text" column is removed here.
predict_tokenized = ds["predict"].map(preprocess_predict, batched=True, remove_columns=["text"])
ds["predict"] = predict_tokenized

Map:   0%|          | 0/17943 [00:00<?, ? examples/s]

In [7]:
# Collator that batches examples using the tokenizer's padding.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir="distilbert_finetuned_vc",
    seed=42,
    # Optimisation
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Logging, evaluation and checkpointing all happen every 100 steps
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
    # Best checkpoint = lowest validation loss; it is reloaded after training
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [9]:
# Wire the model, training arguments, train/validation splits and collator
# into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated (it emits a FutureWarning and is scheduled for
# removal), and `processing_class` is its replacement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

  trainer = Trainer(


In [10]:
# Fine-tune, then score the validation split. The training arguments set
# load_best_model_at_end=True, so the evaluation below runs after the best
# checkpoint has been reloaded.
trainer.train()

validation_metrics = trainer.evaluate()
eval_loss = validation_metrics["eval_loss"]
print("Validation Loss:", eval_loss)



Step,Training Loss,Validation Loss
100,0.4709,0.346587
200,0.2831,0.358676
300,0.1688,0.373016




Validation Loss: 0.34658700227737427


In [None]:
# Run inference on the prediction split and take the highest-scoring class
# along the last axis of the returned predictions.
preds = trainer.predict(ds["predict"])
pred_labels = preds.predictions.argmax(axis=-1)

In [None]:
# Re-read the prediction CSV, attach the predicted class as "pred_label", and
# write the result to a separate file.
# NOTE(review): this assumes the CSV rows are in the same order as
# ds["predict"] — confirm.
df = pd.read_csv("data/business_orientation/companies_business_focus.csv").assign(
    pred_label=pred_labels
)
df.to_csv("data/business_orientation/companies_business_focus_llm.csv", index=False)
print("✅ Done — predictions saved to companies_business_focus_llm.csv")

✅ Done — predictions saved to industry_focus_llm.csv


In [None]:
# Build the list of companies that still need a manual B2B/B2C label and write
# it to the CSV that the prediction step reads.
# NOTE(review): `companies` is not defined in the visible part of this
# notebook; it must already exist in the kernel — confirm where it is loaded.
focus_path = 'data/business_orientation/companies_business_focus_save.csv'
existing = pd.read_csv(focus_path)

merge_keys = ['Organization Name', 'Organization Name URL']

# If `companies` already carries a 'B2B Binary' column (e.g. because the cell
# that merges labels into `companies` was run first), a plain merge would
# produce 'B2B Binary_x' / 'B2B Binary_y' and the column selection below would
# raise a KeyError. Dropping the stale column first keeps this cell re-runnable;
# errors='ignore' makes the drop a no-op when the column is absent.
merged = companies.drop(columns=['B2B Binary'], errors='ignore').merge(
    existing[merge_keys + ['B2B Binary']],
    on=merge_keys,
    how='left'
)

annot_df = merged[[
    'Organization Name',
    'Description',
    'Organization Name URL',
    'B2B Binary'
]].copy()

# Existing labels become the strings '1' / '0'; everything else (including
# rows with no match in the labeled file) becomes the empty string.
annot_df['B2B Binary'] = (
    annot_df['B2B Binary']
    .map({1: '1', 0: '0'})
    .fillna('')
)

# Keep only the rows without a label.
to_annotate = annot_df[annot_df['B2B Binary'] == '']

to_annotate.to_csv(
    'data/business_orientation/companies_business_focus.csv',
    index=False
)

In [None]:
# Attach the B2B label from the labeled CSV to `companies`.
# NOTE(review): `companies` is not defined in the visible part of this
# notebook; it must already exist in the kernel — confirm where it is loaded.
company_focus = pd.read_csv('data/business_orientation/companies_business_focus_save.csv')

merge_keys = ['Organization Name', 'Organization Name URL']

# Drop any 'B2B Binary' column left over from a previous run of this cell.
# Without this, re-running the cell merges a second 'B2B Binary' column, pandas
# renames the pair to 'B2B Binary_x' / 'B2B Binary_y', and the lookup below
# raises a KeyError. errors='ignore' makes the drop a no-op on the first run.
companies = companies.drop(columns=['B2B Binary'], errors='ignore').merge(
    company_focus[merge_keys + ['B2B Binary']],
    on=merge_keys,
    how='left'
)

# Companies without a label are filled with 0 before the cast, so they end up
# as False in the resulting nullable-boolean column.
companies['B2B Binary'] = (
    companies['B2B Binary']
    .fillna(0)
    .astype(int)
    .astype('boolean')
)