In [4]:
import pandas as pd

# Load the hackathon dataset from the Excel workbook into a DataFrame.
# pd.read_excel is called with its defaults: no encoding, delimiter, or
# quoting options are passed here.
# NOTE(review): the path is an absolute, machine-specific location.
file_path = r'C:\Users\anton\OneDrive\Desktop\Hackathon\AI EarthHack Dataset (1).xlsx'
df = pd.read_excel(file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,id,problem,solution,Industry,Business Model,Quadrant
0,1,The construction industry is indubitably one o...,"Herein, we propose an innovative approach to m...",Construction,Design for Recycling (DFR),Easy to access and easy to process
1,20,The fast fashion industry promotes high consum...,I propose an innovative digital platform that ...,"Fashion, Technology",Design for Recycling (DFR),Easy to access and easy to process
2,9,One major global issue we face today is the su...,"My solution is an innovative Reloop - System, ...",Packaging and Logistics,Design for Recycling (DFR),Easy to access and easy to process
3,7,more than 130 Billon plastic bottles waste ann...,Bariq factory to recyle plastic bottels,Recycling and Waste Management,Design for Recycling (DFR),Hard to access but easy to process
4,2,"I'm sure you, like me, are feeling the heat - ...","Imagine standing on a green hill, not a single...",Renewable Energy,Design for Recycling (DFR),Hard to access but easy to process


In [74]:
from transformers import BartForConditionalGeneration, BartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
import os

class CustomDataset(Dataset):
    """Text-to-text dataset that tokenizes (source, target) string pairs on access.

    Args:
        texts: Sequence of source strings (model inputs).
        labels: Sequence of target strings, aligned index-by-index with ``texts``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            (and optionally ``"attention_mask"``) as ``(1, max_length)`` tensors
            when called with ``return_tensors="pt"``.
        max_input_length: Pad/truncate length for the source text (default 64).
        max_label_length: Pad/truncate length for the target text (default 16).

    Each item is a dict with:
        ``input_ids``: 1-D tensor of length ``max_input_length``.
        ``attention_mask``: 1-D tensor marking real tokens (present when the
            tokenizer provides one), so padding is not attended to in training.
        ``labels``: 1-D tensor of length ``max_label_length`` whose padding
            positions are set to -100 so the loss ignores them.
    """

    # Label value skipped by the cross-entropy loss used for seq2seq training.
    IGNORE_INDEX = -100

    def __init__(self, texts, labels, tokenizer, max_input_length=64, max_label_length=16):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_label_length = max_label_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_input_length,
            return_tensors="pt",
        )
        target_ids = self.tokenizer(
            self.labels[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_label_length,
            return_tensors="pt",
        )["input_ids"].squeeze(0)  # drop only the batch axis

        # Mask padding in the targets; otherwise the model is trained to emit
        # pad tokens and the reported loss is dominated by padding positions.
        labels = target_ids.clone()
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels[labels == pad_id] = self.IGNORE_INDEX

        item = {"input_ids": encoding["input_ids"].squeeze(0), "labels": labels}
        # Forward the attention mask so padded source positions are not attended to.
        if "attention_mask" in encoding:
            item["attention_mask"] = encoding["attention_mask"].squeeze(0)
        return item

# Load the pretrained BART checkpoint and its tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Directory that receives the fine-tuned model; it is also the Trainer output_dir,
# so intermediate checkpoints land in checkpoint-* subfolders of it.
model_path = r"C:\Users\anton\OneDrive\Desktop\Hackathon\bart_finetuned"
os.makedirs(model_path, exist_ok=True)  # Ensure the directory exists
new_config_file = os.path.join(model_path, "config.json")  # config location once the model is saved

# Prepare the training data (80/20 split, fixed seed for a repeatable split)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# NOTE(review): the label column is addressed as 'Industry ' WITH a trailing
# space — presumably that is how the header appears in the workbook; confirm.
train_dataset = CustomDataset(train_df['solution'].tolist(), train_df['Industry '].tolist(), tokenizer)
val_dataset = CustomDataset(val_df['solution'].tolist(), val_df['Industry '].tolist(), tokenizer)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_path,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=2,
    save_steps=500,
    num_train_epochs=3,
    logging_steps=200,
    logging_dir="./logs",
)

# Create a Seq2Seq Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()

# Persist the FINE-TUNED weights, config and tokenizer. Saving before training
# (as this cell used to) stores the untouched base checkpoint, and periodic
# checkpoints are only written every `save_steps` steps, so a short run would
# otherwise leave nothing trained for later cells to load from `model_path`.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

# Generate a prediction for every validation example, in dataset order
model.eval()
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
generation_options = {"max_length": 16, "num_beams": 5, "length_penalty": 0.6}
all_predictions = []

with torch.no_grad():
    for batch in val_dataloader:
        # Inputs must sit on the same device the trainer placed the model on
        generated = model.generate(batch["input_ids"].to(trainer.args.device), **generation_options)
        for sequence in generated:
            all_predictions.append(tokenizer.decode(sequence, skip_special_tokens=True))

# Show each validation input next to its actual and predicted label
validation_pairs = zip(val_df['solution'], val_df['Industry '])
for idx, (solution, actual) in enumerate(validation_pairs):
    print(f"Example {idx + 1}:\nInput: {solution}\nActual: {actual}\nPredicted: {all_predictions[idx]}\n")

Step,Training Loss




Example 1:
Input: Herein, we propose an innovative approach to mitigate this problem: Modular Construction. This method embraces recycling and reuse, taking a significant stride towards a circular economy.   Modular construction involves utilizing engineered components in a manufacturing facility that are later assembled on-site. These components are designed for easy disassembling, enabling them to be reused in diverse projects, thus significantly reducing waste and conserving resources.  Not only does this method decrease construction waste by up to 90%, but it also decreases construction time by 30-50%, optimizing both environmental and financial efficiency. This reduction in time corresponds to substantial financial savings for businesses. Moreover, the modular approach allows greater flexibility, adapting to changing needs over time.  We believe, by adopting modular construction, the industry can transit from a 'take, make and dispose' model to a more sustainable 'reduce, reuse, a

In [75]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the tokenizer and BART model stored in the fine-tuned model directory.
# The tokenizer load is restricted to local files (local_files_only=True);
# the model load does not pass that flag.
# NOTE(review): what these weights contain depends on what the training cell
# last wrote to this directory — verify it holds the trained model.
industry_model_path = r"C:\Users\anton\OneDrive\Desktop\Hackathon\bart_finetuned"
industry_tokenizer = BartTokenizer.from_pretrained(industry_model_path, local_files_only=True)
industry_model = BartForConditionalGeneration.from_pretrained(industry_model_path)

def generate_prediction(prompt):
    """Generate text for ``prompt`` with the module-level industry model.

    The prompt is tokenized with ``industry_tokenizer`` (truncated to 64
    tokens), passed to ``industry_model.generate`` using beam search
    (5 beams, at most 16 output tokens, length penalty 0.6), and the first
    returned sequence is decoded with special tokens removed.

    Args:
        prompt: Text to run through the model.

    Returns:
        The decoded string for the first generated sequence.
    """
    encoded_prompt = industry_tokenizer(prompt, return_tensors='pt', max_length=64, truncation=True)
    generated_sequences = industry_model.generate(
        **encoded_prompt,
        max_length=16,
        num_beams=5,
        length_penalty=0.6,
    )
    first_sequence = generated_sequences[0]
    return industry_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Define the prompt.
# The quotes around "try on" are escaped: the previous `""try on""` form is
# CSV-style quote doubling, which Python reads as adjacent string literals and
# silently drops the quote characters from the text.
prompt = "I propose an innovative digital platform that uses Augmented Reality (AR) technology for consumers to \"try on\" and virtually own clothing items. This would permit customers to experience wearing various fashion items without the need for physical ownership. Besides reducing waste and pollution associated with production and disposal, such a platform would provide unprecedented variety in virtual wardrobes, thereby catering to the consumer desire for constantly changing styles.  Users could pay a subscription fee to access a vast virtual fashion library, thus generating steady revenue for businesses. The AR technology can be continuously updated and scaled up, enabling expansion into new markets and styles. While upfront investment in technology development and updates may be significant, this will be offset by reduced production costs and increased customer reach. The environmental impact would be vastly reduced as clothing production would drop dramatically, effectively addressing one of the world's largest polluting industries. The novelty lies in the use of AR technology in this field, which still is a largely untapped area. This integrates technology with sustainable fashion practices, creating value for consumers, businesses, and most importantly, our planet."

# Generate prediction
industry_prediction = generate_prediction(prompt)

# Print the result
print(f"Prompt: {prompt}\nPrediction: {industry_prediction}")

Prompt: I propose an innovative digital platform that uses Augmented Reality (AR) technology for consumers to try on and virtually own clothing items. This would permit customers to experience wearing various fashion items without the need for physical ownership. Besides reducing waste and pollution associated with production and disposal, such a platform would provide unprecedented variety in virtual wardrobes, thereby catering to the consumer desire for constantly changing styles.  Users could pay a subscription fee to access a vast virtual fashion library, thus generating steady revenue for businesses. The AR technology can be continuously updated and scaled up, enabling expansion into new markets and styles. While upfront investment in technology development and updates may be significant, this will be offset by reduced production costs and increased customer reach. The environmental impact would be vastly reduced as clothing production would drop dramatically, effectively addressi

In [62]:
from transformers import BartForConditionalGeneration, BartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
import os

class CustomDataset(Dataset):
    """Text-to-text dataset that tokenizes (source, target) string pairs on access.

    Args:
        texts: Sequence of source strings (model inputs).
        labels: Sequence of target strings, aligned index-by-index with ``texts``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            (and optionally ``"attention_mask"``) as ``(1, max_length)`` tensors
            when called with ``return_tensors="pt"``.
        max_input_length: Pad/truncate length for the source text (default 64).
        max_label_length: Pad/truncate length for the target text (default 16).

    Each item is a dict with:
        ``input_ids``: 1-D tensor of length ``max_input_length``.
        ``attention_mask``: 1-D tensor marking real tokens (present when the
            tokenizer provides one), so padding is not attended to in training.
        ``labels``: 1-D tensor of length ``max_label_length`` whose padding
            positions are set to -100 so the loss ignores them.
    """

    # Label value skipped by the cross-entropy loss used for seq2seq training.
    IGNORE_INDEX = -100

    def __init__(self, texts, labels, tokenizer, max_input_length=64, max_label_length=16):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_label_length = max_label_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_input_length,
            return_tensors="pt",
        )
        target_ids = self.tokenizer(
            self.labels[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_label_length,
            return_tensors="pt",
        )["input_ids"].squeeze(0)  # drop only the batch axis

        # Mask padding in the targets; otherwise the model is trained to emit
        # pad tokens and the reported loss is dominated by padding positions.
        labels = target_ids.clone()
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels[labels == pad_id] = self.IGNORE_INDEX

        item = {"input_ids": encoding["input_ids"].squeeze(0), "labels": labels}
        # Forward the attention mask so padded source positions are not attended to.
        if "attention_mask" in encoding:
            item["attention_mask"] = encoding["attention_mask"].squeeze(0)
        return item

# Load the pretrained BART checkpoint and its tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Save the configuration into the fine-tuned model directory.
# save_pretrained() takes a DIRECTORY and writes config.json inside it; passing
# the ".../config.json" file path instead either creates a folder with that
# name or fails when the file already exists.
model_path = r"C:\Users\anton\OneDrive\Desktop\Hackathon\bart_finetuned"  # Update with the correct path to your fine-tuned model
os.makedirs(model_path, exist_ok=True)  # Ensure the directory exists
new_config_file = os.path.join(model_path, "config.json")  # file produced by the call below
model.config.save_pretrained(model_path)

# Prepare the training data
# Assuming `df` is your DataFrame containing 'solution' and 'Business Model' columns
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_df['solution'].tolist(), train_df['Business Model'].tolist(), tokenizer)
val_dataset = CustomDataset(val_df['solution'].tolist(), val_df['Business Model'].tolist(), tokenizer)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=2,
    save_steps=500,
    num_train_epochs=3,
    logging_steps=200,
    logging_dir="./logs",
)

# Create a Seq2Seq Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()

# Persist the fine-tuned weights and tokenizer. Checkpoints are only written
# every `save_steps` steps, so a short run can finish without saving anything.
# A task-specific subfolder keeps this model from overwriting models trained
# for other label columns that share the same output directory.
final_model_dir = os.path.join(training_args.output_dir, "business_model")
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Generate a prediction for every validation example, in dataset order
model.eval()
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
generation_options = {"max_length": 16, "num_beams": 5, "length_penalty": 0.6}
all_predictions = []

with torch.no_grad():
    for batch in val_dataloader:
        # Inputs must sit on the same device the trainer placed the model on
        generated = model.generate(batch["input_ids"].to(trainer.args.device), **generation_options)
        for sequence in generated:
            all_predictions.append(tokenizer.decode(sequence, skip_special_tokens=True))

# Show each validation input next to its actual and predicted label
validation_pairs = zip(val_df['solution'], val_df['Business Model'])
for idx, (solution, actual) in enumerate(validation_pairs):
    print(f"Example {idx + 1}:\nInput: {solution}\nActual: {actual}\nPredicted: {all_predictions[idx]}\n")

Step,Training Loss


Example 1:
Input: Herein, we propose an innovative approach to mitigate this problem: Modular Construction. This method embraces recycling and reuse, taking a significant stride towards a circular economy.   Modular construction involves utilizing engineered components in a manufacturing facility that are later assembled on-site. These components are designed for easy disassembling, enabling them to be reused in diverse projects, thus significantly reducing waste and conserving resources.  Not only does this method decrease construction waste by up to 90%, but it also decreases construction time by 30-50%, optimizing both environmental and financial efficiency. This reduction in time corresponds to substantial financial savings for businesses. Moreover, the modular approach allows greater flexibility, adapting to changing needs over time.  We believe, by adopting modular construction, the industry can transit from a 'take, make and dispose' model to a more sustainable 'reduce, reuse, a

In [63]:
from transformers import BartForConditionalGeneration, BartTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
import os

class CustomDataset(Dataset):
    """Text-to-text dataset that tokenizes (source, target) string pairs on access.

    Args:
        texts: Sequence of source strings (model inputs).
        labels: Sequence of target strings, aligned index-by-index with ``texts``.
        tokenizer: Callable tokenizer returning a mapping with ``"input_ids"``
            (and optionally ``"attention_mask"``) as ``(1, max_length)`` tensors
            when called with ``return_tensors="pt"``.
        max_input_length: Pad/truncate length for the source text (default 64).
        max_label_length: Pad/truncate length for the target text (default 16).

    Each item is a dict with:
        ``input_ids``: 1-D tensor of length ``max_input_length``.
        ``attention_mask``: 1-D tensor marking real tokens (present when the
            tokenizer provides one), so padding is not attended to in training.
        ``labels``: 1-D tensor of length ``max_label_length`` whose padding
            positions are set to -100 so the loss ignores them.
    """

    # Label value skipped by the cross-entropy loss used for seq2seq training.
    IGNORE_INDEX = -100

    def __init__(self, texts, labels, tokenizer, max_input_length=64, max_label_length=16):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_label_length = max_label_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_input_length,
            return_tensors="pt",
        )
        target_ids = self.tokenizer(
            self.labels[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_label_length,
            return_tensors="pt",
        )["input_ids"].squeeze(0)  # drop only the batch axis

        # Mask padding in the targets; otherwise the model is trained to emit
        # pad tokens and the reported loss is dominated by padding positions.
        labels = target_ids.clone()
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is not None:
            labels[labels == pad_id] = self.IGNORE_INDEX

        item = {"input_ids": encoding["input_ids"].squeeze(0), "labels": labels}
        # Forward the attention mask so padded source positions are not attended to.
        if "attention_mask" in encoding:
            item["attention_mask"] = encoding["attention_mask"].squeeze(0)
        return item

# Load the pretrained BART checkpoint and its tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Save the configuration into the fine-tuned model directory.
# save_pretrained() takes a DIRECTORY and writes config.json inside it; passing
# the ".../config.json" file path instead either creates a folder with that
# name or fails when the file already exists.
model_path = r"C:\Users\anton\OneDrive\Desktop\Hackathon\bart_finetuned"  # Update with the correct path to your fine-tuned model
os.makedirs(model_path, exist_ok=True)  # Ensure the directory exists
new_config_file = os.path.join(model_path, "config.json")  # file produced by the call below
model.config.save_pretrained(model_path)

# Prepare the training data
# Assuming `df` is your DataFrame containing 'solution' and 'Quadrant' columns
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_df['solution'].tolist(), train_df['Quadrant'].tolist(), tokenizer)
val_dataset = CustomDataset(val_df['solution'].tolist(), val_df['Quadrant'].tolist(), tokenizer)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./bart_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=2,
    save_steps=500,
    num_train_epochs=3,
    logging_steps=200,
    logging_dir="./logs",
)

# Create a Seq2Seq Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Fine-tune the model
trainer.train()

# Persist the fine-tuned weights and tokenizer. Checkpoints are only written
# every `save_steps` steps, so a short run can finish without saving anything.
# A task-specific subfolder keeps this model from overwriting models trained
# for other label columns that share the same output directory.
final_model_dir = os.path.join(training_args.output_dir, "quadrant")
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Generate a prediction for every validation example, in dataset order
model.eval()
val_dataloader = DataLoader(val_dataset, batch_size=4, shuffle=False)
generation_options = {"max_length": 16, "num_beams": 5, "length_penalty": 0.6}
all_predictions = []

with torch.no_grad():
    for batch in val_dataloader:
        # Inputs must sit on the same device the trainer placed the model on
        generated = model.generate(batch["input_ids"].to(trainer.args.device), **generation_options)
        for sequence in generated:
            all_predictions.append(tokenizer.decode(sequence, skip_special_tokens=True))

# Show each validation input next to its actual and predicted label
validation_pairs = zip(val_df['solution'], val_df['Quadrant'])
for idx, (solution, actual) in enumerate(validation_pairs):
    print(f"Example {idx + 1}:\nInput: {solution}\nActual: {actual}\nPredicted: {all_predictions[idx]}\n")

Step,Training Loss


Example 1:
Input: Herein, we propose an innovative approach to mitigate this problem: Modular Construction. This method embraces recycling and reuse, taking a significant stride towards a circular economy.   Modular construction involves utilizing engineered components in a manufacturing facility that are later assembled on-site. These components are designed for easy disassembling, enabling them to be reused in diverse projects, thus significantly reducing waste and conserving resources.  Not only does this method decrease construction waste by up to 90%, but it also decreases construction time by 30-50%, optimizing both environmental and financial efficiency. This reduction in time corresponds to substantial financial savings for businesses. Moreover, the modular approach allows greater flexibility, adapting to changing needs over time.  We believe, by adopting modular construction, the industry can transit from a 'take, make and dispose' model to a more sustainable 'reduce, reuse, a