In [107]:
import numpy as np
from collections import Counter, defaultdict
import re
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')

print("üîß FINAL SOLUTION SETUP")
print("=" * 30)
print("üìã Plan: PPMI+SVD deterministic approach")
print("üéØ Target: SIGMOID_ENGINEER_ATHLETE")

üî¨ MEGA-ADVANCED WORD2VEC PIPELINE INITIALIZED


In [131]:
# Citire corpus
with open('corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("üìñ CORPUS LOADED")
print(f"üìä Original length: {len(text)} characters")

# Preprocessare "de fier"
# 1. Segmentare √Æn propozi»õii
sentences_raw = []
for segment in text.split('.'):
    segment = segment.strip()
    if segment:
        sentences_raw.append(segment)

for segment in text.split('\n'):
    segment = segment.strip()
    if segment and segment not in sentences_raw:
        sentences_raw.append(segment)

print(f"üìù Raw segments: {len(sentences_raw)}")

# 2. Tokenizare strict [a-zA-Z]+ »ôi lowercase
processed_sentences = []
all_words = []

for sentence in sentences_raw:
    # Extrage doar cuvinte alfabetice
    words = re.findall(r'[a-zA-Z]+', sentence.lower())
    if len(words) >= 3:  # pƒÉstreazƒÉ doar propozi»õii substan»õiale
        processed_sentences.append(words)
        all_words.extend(words)

print(f"‚úÖ PREPROCESSING COMPLETE:")
print(f"  üìù Processed sentences: {len(processed_sentences)}")
print(f"  üî§ Total words: {len(all_words)}")
print(f"  üìö Unique words: {len(set(all_words))}")

# Afi»ôez primele propozi»õii
print(f"\nüîç First 3 processed sentences:")
for i, sent in enumerate(processed_sentences[:3], 1):
    print(f"  {i}. {sent[:8]}...")

üìñ CORPUS LOADED
üìä Original length: 5933 characters
üìù Raw segments: 130
‚úÖ PREPROCESSING COMPLETE:
  üìù Processed sentences: 130
  üî§ Total words: 1522
  üìö Unique words: 229

üîç First 3 processed sentences:
  1. ['doctors', 'practice', 'medicine', 'daily', 'relying', 'on', 'medical', 'knowledge']...
  2. ['medicine', 'forms', 'the', 'foundation', 'of', 'every', 'doctor', 's']...
  3. ['doctors', 'dedicate', 'years', 'to', 'studying', 'medicine', 'before', 'practicing']...


In [133]:
# Construire vocabular
vocab = sorted(set(all_words))
word_to_idx = {word: i for i, word in enumerate(vocab)}
vocab_size = len(vocab)

print(f"üìö VOCABULARY CONSTRUCTION")
print(f"  Size: {vocab_size} unique words")

# Verificare cuvinte cheie
key_words = ['doctors', 'medicine', 'law', 'engineers', 'teachers', 'schools', 'athletes', 'hospitals']
print(f"\nüéØ Key words in vocabulary:")
for word in key_words:
    status = "‚úÖ" if word in word_to_idx else "‚ùå"
    count = all_words.count(word)
    print(f"  {status} {word}: {count} occurrences")

# Construire matrice co-ocuren»õƒÉ
print(f"\nüîó BUILDING CO-OCCURRENCE MATRIX")
print(f"  Matrix size: {vocab_size} x {vocab_size}")

window = 4  # fereastrƒÉ simetricƒÉ
cooc_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)

for sentence in processed_sentences:
    for i, word1 in enumerate(sentence):
        if word1 in word_to_idx:
            idx1 = word_to_idx[word1]
            
            # FereastrƒÉ simetricƒÉ
            start = max(0, i - window)
            end = min(len(sentence), i + window + 1)
            
            for j in range(start, end):
                if i != j and j < len(sentence):
                    word2 = sentence[j]
                    if word2 in word_to_idx:
                        idx2 = word_to_idx[word2]
                        cooc_matrix[idx1, idx2] += 1

print(f"‚úÖ Co-occurrence matrix built")
print(f"  Non-zero entries: {np.count_nonzero(cooc_matrix)}")

üìö VOCABULARY CONSTRUCTION
  Size: 229 unique words

üéØ Key words in vocabulary:
  ‚úÖ doctors: 42 occurrences
  ‚úÖ medicine: 44 occurrences
  ‚úÖ law: 44 occurrences
  ‚úÖ engineers: 42 occurrences
  ‚úÖ teachers: 44 occurrences
  ‚úÖ schools: 34 occurrences
  ‚úÖ athletes: 44 occurrences
  ‚úÖ hospitals: 36 occurrences

üîó BUILDING CO-OCCURRENCE MATRIX
  Matrix size: 229 x 229
‚úÖ Co-occurrence matrix built
  Non-zero entries: 3592


In [135]:
print("üßÆ PPMI TRANSFORMATION")
print("=" * 25)

# Calculez probabilitƒÉ»õi
total_count = np.sum(cooc_matrix)
word_counts = np.sum(cooc_matrix, axis=1)

print(f"üìä Statistics:")
print(f"  Total co-occurrences: {int(total_count)}")
print(f"  Average word frequency: {np.mean(word_counts):.2f}")

# PPMI transformation
ppmi_matrix = np.zeros_like(cooc_matrix)

for i in range(vocab_size):
    for j in range(vocab_size):
        if cooc_matrix[i, j] > 0:
            # p(i,j) = count(i,j) / total
            p_ij = cooc_matrix[i, j] / total_count
            
            # p(i) = count(i) / total, p(j) = count(j) / total
            p_i = word_counts[i] / total_count
            p_j = word_counts[j] / total_count
            
            if p_i > 0 and p_j > 0:
                # PMI = log(p(i,j) / (p(i) * p(j)))
                pmi = np.log(p_ij / (p_i * p_j))
                # PPMI = max(0, PMI)
                ppmi_matrix[i, j] = max(0, pmi)

print(f"‚úÖ PPMI transformation complete")
print(f"  PPMI non-zero entries: {np.count_nonzero(ppmi_matrix)}")
print(f"  Max PPMI value: {np.max(ppmi_matrix):.4f}")

üßÆ PPMI TRANSFORMATION
üìä Statistics:
  Total co-occurrences: 9576
  Average word frequency: 41.82
‚úÖ PPMI transformation complete
  PPMI non-zero entries: 3398
  Max PPMI value: 5.4781


In [137]:
print("üî¢ SVD DECOMPOSITION")
print("=" * 25)

# SVD cu 100 componente
n_components = min(100, vocab_size - 1)
svd = TruncatedSVD(n_components=n_components, random_state=42)

print(f"  Running SVD with {n_components} components...")
word_vectors = svd.fit_transform(ppmi_matrix)

print(f"  Explained variance ratio: {svd.explained_variance_ratio_[:5]}")
print(f"  Cumulative variance: {np.sum(svd.explained_variance_ratio_):.4f}")

# Post-procesare vectori
print(f"\nüîß POST-PROCESSING VECTORS")

# 1. Mean centering
mean_vector = np.mean(word_vectors, axis=0)
word_vectors_centered = word_vectors - mean_vector

# 2. L2 normalization
norms = np.linalg.norm(word_vectors_centered, axis=1, keepdims=True)
word_vectors_normalized = word_vectors_centered / (norms + 1e-8)

print(f"‚úÖ Embeddings ready:")
print(f"  Shape: {word_vectors_normalized.shape}")
print(f"  Vector norm range: [{np.min(norms):.4f}, {np.max(norms):.4f}]")

# Test similaritate
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2) + 1e-8)

# Test pe cuvinte cheie
if 'doctors' in word_to_idx and 'medicine' in word_to_idx:
    idx1 = word_to_idx['doctors']
    idx2 = word_to_idx['medicine']
    sim = cosine_similarity(word_vectors_normalized[idx1], word_vectors_normalized[idx2])
    print(f"  Test similarity doctors-medicine: {sim:.4f}")

üî¢ SVD DECOMPOSITION
  Running SVD with 100 components...
  Explained variance ratio: [0.00929592 0.04178572 0.03290763 0.03006608 0.02894421]
  Cumulative variance: 0.8788

üîß POST-PROCESSING VECTORS
‚úÖ Embeddings ready:
  Shape: (229, 100)
  Vector norm range: [3.0358, 11.0192]
  Test similarity doctors-medicine: 0.6260


In [139]:
def solve_analogy_ppmi(word_a, word_b, word_c, topn=10):
    """RezolvƒÉ analogia A - B + C = D cu PPMI embeddings"""
    
    print(f"\nüî¨ SOLVING: {word_a} - {word_b} + {word_c} = ?")
    
    # Verificare cuvinte √Æn vocabular
    missing = []
    for word in [word_a, word_b, word_c]:
        if word not in word_to_idx:
            missing.append(word)
    
    if missing:
        print(f"‚ùå Missing words: {missing}")
        return []
    
    # Calculez vectorul »õintƒÉ: t = v(C) + (v(A) - v(B))
    idx_a = word_to_idx[word_a]
    idx_b = word_to_idx[word_b]
    idx_c = word_to_idx[word_c]
    
    vec_a = word_vectors_normalized[idx_a]
    vec_b = word_vectors_normalized[idx_b]
    vec_c = word_vectors_normalized[idx_c]
    
    target_vector = vec_c + (vec_a - vec_b)
    target_vector = target_vector / (np.linalg.norm(target_vector) + 1e-8)
    
    # Calculez similaritƒÉ»õi cu toate cuvintele
    scores = []
    exclude_words = {word_a, word_b, word_c}
    
    for word, idx in word_to_idx.items():
        if word not in exclude_words:
            vec = word_vectors_normalized[idx]
            similarity = np.dot(vec, target_vector)
            scores.append((word, similarity))
    
    # Sortez descrescƒÉtor
    scores.sort(key=lambda x: x[1], reverse=True)
    
    print(f"üìä Top {topn} candidates:")
    for i, (word, score) in enumerate(scores[:topn], 1):
        print(f"  {i}. {word:<12} ({score:.4f})")
    
    return scores[:topn]

# Rezolvare Analogia 1: doctors - medicine + law = ?
print("üéØ ANALOGIA 1: doctors - medicine + law = ?")
results1 = solve_analogy_ppmi('doctors', 'medicine', 'law')
answer1 = results1[0][0] if results1 else None

# Rezolvare Analogia 2: teachers - schools + hospitals = ?
print(f"\nüéØ ANALOGIA 2: teachers - schools + hospitals = ?")
results2 = solve_analogy_ppmi('teachers', 'schools', 'hospitals')
answer2 = results2[0][0] if results2 else None

print(f"\nüìã RAW RESULTS:")
print(f"  Analogy 1: {answer1}")
print(f"  Analogy 2: {answer2}")

üéØ ANALOGIA 1: doctors - medicine + law = ?

üî¨ SOLVING: doctors - medicine + law = ?
üìä Top 10 candidates:
  1. engineers    (0.5875)
  2. compliance   (0.2951)
  3. requires     (0.2759)
  4. strictly     (0.2433)
  5. and          (0.2433)
  6. ensuring     (0.2340)
  7. understand   (0.2145)
  8. professional (0.2109)
  9. helps        (0.2092)
  10. effective    (0.2015)

üéØ ANALOGIA 2: teachers - schools + hospitals = ?

üî¨ SOLVING: teachers - schools + hospitals = ?
üìä Top 10 candidates:
  1. athletes     (0.3773)
  2. center       (0.2877)
  3. injuries     (0.2621)
  4. recover      (0.2523)
  5. help         (0.2372)
  6. serve        (0.2300)
  7. receiving    (0.2088)
  8. as           (0.2007)
  9. primary      (0.1959)
  10. completely   (0.1954)

üìã RAW RESULTS:
  Analogy 1: engineers
  Analogy 2: athletes


In [141]:
def canonicalize_word(word):
    """Mapare la formƒÉ canonicƒÉ pentru FLAG"""
    
    canonical_map = {
        'doctors': 'DOCTOR',
        'doctor': 'DOCTOR',
        'engineers': 'ENGINEER', 
        'engineer': 'ENGINEER',
        'teachers': 'TEACHER',
        'teacher': 'TEACHER',
        'athletes': 'ATHLETE',
        'athlete': 'ATHLETE',
        'lawyers': 'LAWYER',
        'lawyer': 'LAWYER',
        'nurses': 'NURSE',
        'nurse': 'NURSE'
    }
    
    if word.lower() in canonical_map:
        return canonical_map[word.lower()]
    
    # Fallback: eliminƒÉ 's' final »ôi uppercase
    if word.endswith('s') and len(word) > 3:
        return word[:-1].upper()
    
    return word.upper()

print("üö© FLAG GENERATION")
print("=" * 25)

if answer1 and answer2:
    # Canonicalizare
    canonical1 = canonicalize_word(answer1)
    canonical2 = canonicalize_word(answer2)
    
    # Construire FLAG
    flag = f"SIGMOID_{canonical1}_{canonical2}"
    
    print(f"üìä CANONICALIZATION:")
    print(f"  {answer1} ‚Üí {canonical1}")
    print(f"  {answer2} ‚Üí {canonical2}")
    
    print(f"\nüèÜ FINAL FLAG: {flag}")
    
    # VerificƒÉri finale
    checks = {
        'Format': flag.startswith('SIGMOID_') and flag.count('_') == 2,
        'Uppercase': flag == flag.upper(),
        'No spaces': ' ' not in flag,
        'Length': 10 <= len(flag) <= 25
    }
    
    print(f"\n‚úÖ QUALITY CHECKS:")
    for check, passed in checks.items():
        status = "‚úÖ" if passed else "‚ùå"
        print(f"  {status} {check}")
    
    # Salvare
    with open('final_flag.txt', 'w') as f:
        f.write(flag)
    
    print(f"\nüíæ Flag saved to final_flag.txt")
    print(f"üéØ SUBMIT THIS FLAG: {flag}")
    
else:
    print("‚ùå Failed to solve analogies")

print(f"\nüéâ DETERMINISTIC SOLUTION COMPLETE!")

üö© FLAG GENERATION
üìä CANONICALIZATION:
  engineers ‚Üí ENGINEER
  athletes ‚Üí ATHLETE

üèÜ FINAL FLAG: SIGMOID_ENGINEER_ATHLETE

‚úÖ QUALITY CHECKS:
  ‚úÖ Format
  ‚úÖ Uppercase
  ‚úÖ No spaces
  ‚úÖ Length

üíæ Flag saved to final_flag.txt
üéØ SUBMIT THIS FLAG: SIGMOID_ENGINEER_ATHLETE

üéâ DETERMINISTIC SOLUTION COMPLETE!
