In [None]:
# Visualise Repo files
import requests
from IPython.display import display,Markdown
from github_pat import pat
from repo_visualizer import visualize_repo
from repo_visualizer import fetch_file_url

# GitHub REST API "contents" endpoint for the root of the QwenLM/Qwen2.5-VL
# repository. Later cells pass this same URL to fetch_file_url when looking
# up an individual file.
repo_url = "https://api.github.com/repos/QwenLM/Qwen2.5-VL/contents"

# Display the repository structure starting from repo_url; `pat` (imported
# from github_pat) is handed to the helper for authenticating the requests.
visualize_repo(repo_url, pat)

In [38]:
# Get file from repo and decode from base64 to utf-8
import os
import json
import base64

# Ask which file to fetch and collect the contents-API URL(s) that match it.
file_name = input("Input file name: ")
fetched_file_urls = fetch_file_url(repo_url, file_name, pat)
if not fetched_file_urls:
    # Fail here with a clear message instead of an opaque IndexError below.
    raise FileNotFoundError(f"No file named {file_name!r} was found under {repo_url}")

# With several matches, let the user choose one by list position;
# with a single match, index 0 is used without prompting.
fetched_file_urls_index = 0
if len(fetched_file_urls) > 1:
    print(fetched_file_urls)
    raw_index = input("Input URL index:").strip()
    try:
        # int() is the real validator: it rejects inputs such as "1.5" that a
        # digit-only pre-check with one '.' removed would have let through.
        fetched_file_urls_index = int(raw_index)
    except ValueError:
        raise ValueError("Error: Input should be a number.") from None
    if not 0 <= fetched_file_urls_index < len(fetched_file_urls):
        raise IndexError(
            f"URL index must be between 0 and {len(fetched_file_urls) - 1}, "
            f"got {fetched_file_urls_index}."
        )

# Download the selected file's metadata/content JSON from the GitHub API.
headers = {"Authorization": f"Bearer {pat}"}
response = requests.get(
    fetched_file_urls[fetched_file_urls_index],
    headers=headers,
    timeout=30,  # do not hang the kernel forever on a stalled connection
)
# Surface HTTP errors (bad token, rate limit, 404) here rather than as a
# confusing KeyError on "content" further down.
response.raise_for_status()
data = response.json()

# Keep a pretty-printed copy of the raw API response next to the notebook,
# named after the requested file (e.g. README.md -> README.json).
name, extension = os.path.splitext(file_name)
with open(f"{name}.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

# The API response carries the file body base64-encoded under "content".
decoded_file = base64.b64decode(data["content"]).decode('utf-8')

['https://api.github.com/repos/QwenLM/Qwen2.5-VL/contents/README.md?ref=main', 'https://api.github.com/repos/QwenLM/Qwen2.5-VL/contents/qwen-vl-utils/README.md?ref=main']


In [39]:
# Generate prompt for gpt-4o-mini
from openai import OpenAI
from apikey import gpt_api

# NOTE(review): guideline 5 in the prompt below ("- l.") looks truncated, and
# "**file **" has a stray space inside the bold markers. Both are left exactly
# as found because the intended wording is unknown -- confirm and fix.
def prompt_template(decoded_file):
    """Build the summarization prompt for a single repository file.

    Args:
        decoded_file: The file's text content; it is inserted verbatim into
            the "Input" section of the prompt.

    Returns:
        The complete prompt string: role description, writing guidelines,
        the file content, and the expected output format.
    """
    return f"""
You are a professional technical writer and summarization expert specializing in transforming complex documentation into concise, clear, and well-structured summaries. Your goal is to analyze the provided file from a repository and generate a professional summary.

### Guidelines:
1. **Relevance**:  
   - Focus on the **core purpose** of the file.  
   - Highlight key **features**, **functionalities**, and **intended use cases**.  
   - Identify the **target audience** or **users** of the file.

2. **Clarity**:  
   - Use **simple and precise language** to explain the file's purpose and usage.  
   - Avoid overly technical jargon unless necessary for understanding.  

3. **Organization**:  
   - Structure the summary with headings if appropriate (e.g., "Overview", "Key Features", "Usage").  
   - Ensure the output is **well-formatted** and easy to read.  

4. **Formatting**:  
   - Output the summary in **Markdown format**.  

5. **Important**:
   - l.
---

### Input:
- **file **:  
{decoded_file}

---

### Output:  
1. **Summary**:  
   - A **concise and well-structured summary** of the file.  
   - Highlights the file's **purpose**, **features**, and **usage**.  
   - Organized with headings for clarity and readability.

"""

In [None]:
# Calculate prompt tokens

# Build the final prompt from the decoded file; the API-call cell sends this.
prompt = prompt_template(decoded_file)
from tiktoken import encoding_for_model
import numpy as np

# Tokenize the full prompt (instructions + file content), not just the raw
# file, so the printed count reflects what is actually sent as the user message.
tokens = encoding_for_model('gpt-4o-mini').encode(prompt)
print(len(tokens))


In [40]:

# Create the OpenAI client with the key imported from apikey.
client = OpenAI(api_key=gpt_api)

# Conversation sent to the model: a short system role followed by the
# generated summarization prompt as the user turn.
chat_messages = [
    {"role": "system", "content": "Expert software engineer summarization for files"},
    {"role": "user", "content": prompt},
]

# Request a single chat completion.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=chat_messages,
    temperature=0.7,
)

# Pull the generated text out of the first returned choice.
first_choice = response.choices[0]
response_string = first_choice.message.content

In [42]:
# Render the text in response_string as formatted Markdown in the cell output.
display(Markdown(response_string))

# Qwen2.5-VL Summary

## Overview
Qwen2.5-VL is an advanced vision-language model designed to enhance interaction with visual and textual data. Building on the previous Qwen2-VL model, it introduces significant improvements in document parsing, video understanding, and object grounding, making it suitable for various applications involving multimodal data processing.

## Key Features
- **Document Parsing**: Enhanced capabilities for omnidocument parsing, allowing for effective processing of multi-scene, multilingual documents, including handwriting, tables, charts, and more.
- **Object Grounding**: Improved accuracy in detecting and counting objects with support for absolute coordinates and JSON formats for advanced spatial reasoning.
- **Video Understanding**: Capable of comprehending ultra-long videos with fine-grained event extraction, employing dynamic resolution in the temporal dimension.
- **Agent Functionality**: Enhanced decision-making and reasoning capabilities for computer and mobile applications, optimizing interaction with visual data.

## Model Architecture Updates
- **Dynamic Resolution**: Introduces dynamic FPS sampling for video understanding, allowing the model to learn temporal sequences and pinpoint specific moments effectively.
- **Efficient Vision Encoder**: Optimizations in the Vision Transformer (ViT) architecture improve training and inference speeds.

## Performance
Qwen2.5-VL models have shown superior performance across various benchmarks, outperforming previous models in several tasks, including document understanding and visual reasoning.

## Quickstart Guide
Users can easily integrate Qwen2.5-VL into their projects using the provided installation commands and example code snippets. Support for various input formats (images, video URLs, base64 encodings) facilitates flexible usage in applications.

## Target Audience
The Qwen2.5-VL model is intended for developers and researchers in the fields of machine learning, computer vision, and natural language processing looking to leverage multimodal capabilities in their applications.

## Additional Resources
- [Demo](https://huggingface.co/spaces/Qwen/Qwen2.5-VL)
- [Cookbooks](https://github.com/QwenLM/Qwen2.5-VL/tree/main/cookbooks) for practical examples and use cases.
- [API Documentation](https://help.aliyun.com/zh/model-studio/developer-reference/qwen-vl-api) for integrating the model into applications.

For detailed performance metrics and further insights into the model's capabilities, please refer to the [official blog](https://qwenlm.github.io/blog/qwen2.5-vl/) and the provided GitHub repository.