<a href="https://colab.research.google.com/github/Ganuginni/Webmyne_Systems_Internship/blob/main/C_code_analysis_with_evaluation_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install -q transformers datasets evaluate

# ---- 1. Load sample C# code ----
# The C# program under analysis, kept as a plain string. It de-duplicates a
# list by checking `unique.Contains(num)` before every `unique.Add(num)`,
# which is the pattern the "Duplicate Check" feature below looks for.
sample_code = """
using System;
using System.Collections.Generic;

class Program {
    static void Main() {
        List<int> numbers = new List<int> { 1, 2, 2, 3, 4, 4 };
        List<int> unique = new List<int>();
        foreach (int num in numbers) {
            if (!unique.Contains(num)) {
                unique.Add(num);
            }
        }
        Console.WriteLine(string.Join(",", unique));
    }
}
"""

# ---- 2. Feature Extraction ----
import re

def extract_features(code):
    """Detect coarse structural features in a C# source string.

    Detection is purely lexical: regular expressions are run over the raw
    text, so keywords inside comments or string literals still count.
    Keywords are matched as whole words and method calls as ``.Name(`` so
    that identifiers which merely contain a keyword (``entry``, ``Address``,
    ``selected``) do not trigger a feature.

    Args:
        code: C# source code as a single string.

    Returns:
        A dict mapping each feature name ("Nested Loops", "LINQ Usage",
        "Async Usage", "Try-Catch", "Duplicate Check") to a bool.
    """
    def found(pattern, flags=0):
        # True when `pattern` occurs anywhere in the analysed source.
        return re.search(pattern, code, flags) is not None

    # `foreach` is listed before `for` only for readability; the trailing
    # `\s*\(` already prevents `for` from matching the start of `foreach`.
    loop_header = r"\b(?:foreach|for|while)\s*\("

    # Heuristic, not a parser: a loop header, an opening brace, then a second
    # loop header before the first closing brace. An inner loop that follows
    # an earlier `{ ... }` block inside the outer body is therefore missed.
    nested_loops = found(
        loop_header + r".*?\)\s*\{[^}]*" + loop_header,
        re.DOTALL,
    )

    return {
        "Nested Loops": nested_loops,
        # LINQ query syntax: both `from` and `select` as standalone keywords.
        "LINQ Usage": found(r"\bfrom\b") and found(r"\bselect\b"),
        "Async Usage": found(r"\basync\b") or found(r"\bawait\b"),
        "Try-Catch": found(r"\btry\b") and found(r"\bcatch\b"),
        # Manual de-duplication pattern: a `.Contains(...)` membership test
        # together with an `.Add(...)` call.
        "Duplicate Check": found(r"\.Contains\s*\(") and found(r"\.Add\s*\("),
    }

# Run the lexical feature detection on the sample program and show the
# resulting flag dict; `features` is reused when building the model prompt.
features = extract_features(sample_code)
print("✅ Features Detected:", features)

# ---- 3. Generate Suggestions using Transformers (Code Optimizer Prompt) ----
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# Use open model
# NOTE(review): the captured output of this notebook shows the model
# continuing the prompt with Python code rather than C# — consider a
# checkpoint trained on C# and/or instruction following. TODO confirm.
model_id = "Salesforce/codegen-350M-mono"
# Tokenizer and causal-LM weights are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
# Text-generation pipeline wrapping the model and tokenizer loaded above;
# `generate_suggestion` calls it through this module-level name.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_suggestion(code, features):
    """Ask the text-generation pipeline for an optimization suggestion.

    Builds a prompt from the C# source and the detected feature flags, runs
    the module-level ``generator`` pipeline on it, and returns only the text
    the model produced. The prompt itself is not echoed back, so the result
    can be shown directly as the suggestion.

    Args:
        code: C# source code to embed in the prompt.
        features: Feature flags (as returned by ``extract_features``); the
            value is interpolated into the prompt via its string form.

    Returns:
        The generated continuation as a string (at most 256 new tokens).
    """
    prompt = f"""
    Given this C# code:
    {code}

    Features: {features}

    Please optimize the code. Explain reasoning, and show improved version if possible.
    """
    outputs = generator(
        prompt,
        max_new_tokens=256,
        # The EOS id is passed as the padding id — presumably because the
        # tokenizer defines no dedicated pad token; verify if the model changes.
        pad_token_id=tokenizer.eos_token_id,
        # Without this the pipeline returns prompt + continuation, and the
        # "suggestion" starts with a verbatim copy of the prompt.
        return_full_text=False,
    )
    # The pipeline returns one dict per generated sequence; only one is requested.
    return outputs[0]["generated_text"]

# Generate a suggestion for the sample program from its detected features
# and print the text returned by the model.
suggestion = generate_suggestion(sample_code, features)
print("\n🧠 Suggested Optimization:\n", suggestion)

# ---- 4. Evaluation Metrics ----
import evaluate
!pip install rouge_score
# Simulate two versions of the code
# Both snippets are hand-written stand-ins: they are not derived from
# `sample_code` or from the model's `suggestion`, and they refer to a
# collection named `nums` rather than `numbers`.
code_before = "List<int> unique = new List<int>(); foreach (int n in nums) { if (!unique.Contains(n)) unique.Add(n); }"
code_after  = "HashSet<int> unique = new HashSet<int>(nums);"

def evaluate_improvement(before, after):
    """Print ROUGE scores for `after` measured against `before`.

    Loads the "rouge" metric through the `evaluate` library, scores `after`
    as the single prediction with `before` as its single reference, and
    prints every returned score with four decimal places.

    Args:
        before: Original code snippet, used as the reference text.
        after: Rewritten code snippet, used as the prediction text.

    Returns:
        None. The scores are only printed.
    """
    metric = evaluate.load("rouge")
    scores = metric.compute(predictions=[after], references=[before])

    print("\n📊 Evaluation Metrics (ROUGE similarity):")
    for name in scores:
        print(f"{name}: {scores[name]:.4f}")

# Print the ROUGE scores of the rewritten snippet against the original one.
evaluate_improvement(code_before, code_after)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m440.3/491.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/797M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/797M [00:00<?, ?B/s]

Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e


🧠 Suggested Optimization:
 
    Given this C# code:
    
using System;
using System.Collections.Generic;

class Program {
    static void Main() {
        List<int> numbers = new List<int> { 1, 2, 2, 3, 4, 4 };
        List<int> unique = new List<int>();
        foreach (int num in numbers) {
            if (!unique.Contains(num)) {
                unique.Add(num);
            }
        }
        Console.WriteLine(string.Join(",", unique));
    }
}


    Features: {'Nested Loops': False, 'LINQ Usage': False, 'Async Usage': False, 'Try-Catch': False, 'Duplicate Check': True}

    Please optimize the code. Explain reasoning, and show improved version if possible.
    """
    
from collections import namedtuple

def convert (string):
    """Convert the given string into a List<int>"""
    return [int(i) for i in string.split()]
    
def run (string = None):
    """Run the given input string"""
    if string is not None:
        assert string.split()[0] == "-"
        
    inputString = S

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]


📊 Evaluation Metrics (ROUGE similarity):
rouge1: 0.4000
rouge2: 0.1739
rougeL: 0.4000
rougeLsum: 0.4000
