<a href="https://colab.research.google.com/github/Hemankit/Drawing_with_llms/blob/main/Drawing_LLMs_Kaggle_comp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd


In [None]:
# Read the training data from the absolute path /content/train.csv into a
# pandas DataFrame, then show its first rows as the cell output.
train_df = pd.read_csv('/content/train.csv')
train_df.head()

Unnamed: 0,id,description
0,04c411,a starlit night over snow-covered peaks
1,215136,black and white checkered pants
2,3e2bc6,crimson rectangles forming a chaotic grid
3,61d7a8,burgundy corduroy pants with patch pockets and...
4,6f2ca7,orange corduroy overalls


In [None]:
# Show the last rows of the training data as the cell output.
train_df.tail()

Unnamed: 0,id,description
10,bf3306,magenta trapezoids layered on a transluscent s...
11,e2240f,gray wool coat with a faux fur collar
12,f02e39,a purple forest at dusk
13,f6790a,purple pyramids spiraling around a bronze cone
14,f9edd5,khaki triangles and azure crescents


In [None]:
# Read the test data from the absolute path /content/test.csv into a
# pandas DataFrame, then show its first rows as the cell output.
test_df = pd.read_csv('/content/test.csv')
test_df.head()

Unnamed: 0,id,description
0,011af1,tan polygons and sky-blue arcs
1,147070,ginger ribbed dungarees
2,2fbf07,a beacon tower facing the sea
3,30f0fc,an expanse of white desert
4,4874c8,a violet wood as evening falls


In [None]:
# Show the last rows of the test data as the cell output.
test_df.tail()

Unnamed: 0,id,description
10,afdff1,fuchsia parallelograms over a shimmering tin s...
11,c584bb,chestnut ribbed pants with cargo pockets and p...
12,eb4aea,scarlet squares in a disordered array
13,ee3364,an emerald lake beneath an overcast sky
14,f1fdcc,ivory and ebony harlequin trousers


In [None]:
!pip install nltk
!pip install inflect



In [None]:
# text preprocessing
import re
import nltk
import inflect
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

p = inflect.engine()
# converting all numbers in text to words
def convert_num(text):
  """Replace every standalone run of digits in ``text`` with its spelled-out words.

  Args:
    text (str): Input string. Digit runs delimited by word boundaries
      (``\\b\\d+\\b``) are converted; everything else is left untouched.

  Returns:
    str: ``text`` with each matched number replaced by the result of
    ``p.number_to_words`` for that number.
  """
  # `text` is the third argument of re.sub (the string to search). It was
  # previously passed to number_to_words by a misplaced parenthesis, which made
  # re.sub fail for every input.
  return re.sub(r'\b\d+\b', lambda match: p.number_to_words(match.group()), text)

# Spell out numbers in both splits; missing descriptions become "" first.
for _split_df in (train_df, test_df):
  _split_df['description'] = _split_df['description'].fillna("").apply(convert_num)

# tokenization and stop word removal
# The pattern keeps hyphen-joined words (e.g. "snow-covered") as a single token.
tokenizer = RegexpTokenizer(r"\w+(?:-\w+)*")
stop_words = set(stopwords.words('english'))


def _content_tokens(description):
  """Tokenize one description and drop tokens whose lowercase form is a stop word."""
  return [token for token in tokenizer.tokenize(description) if token.lower() not in stop_words]


train_text_desc = list(map(_content_tokens, train_df['description']))
test_text_desc = list(map(_content_tokens, test_df['description']))





In [None]:
# text encodings of train_text_desc and test_text_desc
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# convert tokenized sentences to string
train_descriptions = [" ".join(tokens) for tokens in train_text_desc]
test_descriptions = [" ".join(tokens) for tokens in test_text_desc]

# creating bag-of-words model (word counts); the vocabulary is learned from the
# training descriptions only
count_vectorizer = CountVectorizer()
count_train_matrix = count_vectorizer.fit_transform(train_descriptions)

# applying tfidf transformation
# TfidfTransformer re-weights a count matrix, so it is fitted on
# count_train_matrix, not on the raw description strings.
tfidf_transformer = TfidfTransformer()
tfidf_train_matrix = tfidf_transformer.fit_transform(count_train_matrix)

# Transform test descriptions using the same (already fitted) CountVectorizer
# and TfidfTransformer. The TF-IDF step must consume the *test* count matrix;
# feeding it the train counts would silently produce train features.
test_word_count_matrix = count_vectorizer.transform(test_descriptions)
test_tfidf_matrix = tfidf_transformer.transform(test_word_count_matrix)

# convert to dataframe (one column per vocabulary term)
import pandas as pd
feature_names = count_vectorizer.get_feature_names_out()
test_tfidf_df = pd.DataFrame(test_tfidf_matrix.toarray(), columns=feature_names)

# Last expression of the cell, so the notebook renders the frame richly.
test_tfidf_df.head()





In [None]:
# text embeddings for train_descriptions and test_descriptions using pretrained word2vec
# Loads vectors stored in binary word2vec format (binary=True). The path is
# relative, so the .bin file must already be present in the current working
# directory when this cell runs.
from gensim.models import KeyedVectors
google_w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

import numpy as np

# Function to get sentence embedding by averaging word embeddings
def get_sentence_embedding(sentence, model):
  """Average the vectors of the in-vocabulary words of a tokenized sentence.

  Args:
    sentence: Iterable of word tokens.
    model: Either a vectors object supporting ``word in model`` and
      ``model[word]`` (such as the object returned by
      ``KeyedVectors.load_word2vec_format``), or a full model that exposes
      such an object as ``model.wv``.

  Returns:
    numpy.ndarray: Element-wise mean of the vectors of the tokens found in the
    vocabulary, or a zero vector of length ``vector_size`` when none is found.
  """
  # A bare KeyedVectors has no `.wv` attribute in gensim 4; fall back to the
  # object itself so both kinds of model are accepted.
  vectors = getattr(model, "wv", model)
  word_vectors = [vectors[word] for word in sentence if word in vectors]
  if not word_vectors:
    # No known token: return a zero vector instead of the mean of an empty list.
    return np.zeros(vectors.vector_size)
  return np.mean(word_vectors, axis=0)


# Generate embeddings for train descriptions
# (the loaded vectors are bound to `google_w2v_model`; `google_w2v` was never defined)
train_embeddings = np.array([get_sentence_embedding(sentence, google_w2v_model) for sentence in train_text_desc])

# Generate embeddings for test descriptions
# (the helper is `get_sentence_embedding`, singular)
test_embeddings = np.array([get_sentence_embedding(sentence, google_w2v_model) for sentence in test_text_desc])

print(f"Train Embeddings Shape: {train_embeddings.shape}")
print(f"test embeddings shape: {test_embeddings.shape}")




In [1]:
#| default_exp core

In [2]:
import kagglehub
import polars as pl

# Download train.csv from the 'drawing-with-llms' competition and read the
# returned path with polars.
# NOTE(review): the recorded output of this cell is
# "UnauthenticatedError: User is not authenticated" — presumably Kaggle
# credentials must be configured before this cell can run; confirm.
train_path = kagglehub.competition_download('drawing-with-llms', 'train.csv')
train = pl.read_csv(train_path)

train.head()

UnauthenticatedError: User is not authenticated

In [None]:
#| export

class Model:
    '''Baseline model: maps any text prompt to one fixed SVG drawing.'''

    def __init__(self):
        '''Optional constructor; this baseline needs no setup and keeps no state.'''

    def predict(self, prompt: str) -> str:
        '''Return SVG markup for the image described by ``prompt``.

        Args:
            prompt (str): A prompt describing an image. It is not inspected here.
        Returns:
            String of valid SVG code.
        '''
        # The same red circle on a 100x100 canvas is produced for every prompt.
        placeholder_svg = (
            '<svg width="100" height="100" viewBox="0 0 100 100">'
            '<circle cx="50" cy="50" r="40" fill="red" />'
            '</svg>'
        )
        return placeholder_svg

In [None]:
from IPython.display import SVG

# Smoke-test the model: generate SVG for a sample prompt, print the raw
# markup, then render it inline via IPython's SVG display object.
model = Model()
svg = model.predict('a goose winning a gold medal')

print(svg)
display(SVG(svg))

In [None]:
import kaggle_evaluation

# Passes the Model class itself (not an instance) to kaggle_evaluation.test.
# NOTE(review): presumably this runs the competition's local evaluation
# harness against Model — confirm against the kaggle_evaluation package.
kaggle_evaluation.test(Model)