# Investor Focus Classification

In [None]:
import os

os.chdir('/Users/janlinzner/Projects/Master-Thesis-Spatial-Proximity-Venture-Capital')

In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import pandas as pd

In [None]:
data_files = {
    "train":   "data/industry-focus/industry_focus_save.csv",   # has Description & Industry-Specific VC Binary
    "predict": "data/industry-focus/industry_focus.csv"        # has Description only
}

ds = load_dataset("csv", data_files=data_files)

ds["train"] = ds["train"] \
    .rename_column("Description", "text") \
    .rename_column("Industry-Specific VC Binary", "label")

ds["predict"] = ds["predict"] \
    .rename_column("Description", "text")

In [None]:
model_name = "distilbert-base-uncased" 
tokenizer  = AutoTokenizer.from_pretrained(model_name)
model      = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2                     
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def preprocess_train(batch):
    toks = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    toks["labels"] = batch["label"] 
    return toks

ds["train"] = ds["train"].map(
    preprocess_train,
    batched=True,
    remove_columns=[
        "Organization/Person Name",
        "Organization/Person Name URL",
        "text",
        "label"
    ]
)

In [None]:
def preprocess_predict(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

ds["predict"] = ds["predict"].map(
    preprocess_predict,
    batched=True,
    remove_columns=["text"]
)

In [7]:
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir="distilbert_finetuned_vc",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    logging_steps=50,
    save_total_limit=1
)

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [10]:
trainer.train()



Step,Training Loss
50,0.4915
100,0.3805
150,0.3244
200,0.2615
250,0.2604
300,0.2514
350,0.2394
400,0.2057
450,0.1959
500,0.2045




TrainOutput(global_step=987, training_loss=0.20840139422856324, metrics={'train_runtime': 465.4918, 'train_samples_per_second': 33.964, 'train_steps_per_second': 2.12, 'total_flos': 522318952900608.0, 'train_loss': 0.20840139422856324, 'epoch': 2.992412746585736})

In [11]:
preds = trainer.predict(ds["predict"])
pred_labels = preds.predictions.argmax(-1)  # array of 0s and 1s



In [15]:
df = pd.read_csv("data/industry-focus/industry_focus.csv")
df["pred_label"] = pred_labels
df.to_csv("data/industry-focus/industry_focus_llm.csv", index=False)
print("✅ Done — predictions saved to industry_focus_llm.csv")

✅ Done — predictions saved to industry_focus_llm.csv
