In [1]:
import gradio as gr
import pandas as pd
import numpy as np
import re
import joblib
import matplotlib.pyplot as plt
from pathlib import Path
from nltk.tokenize import word_tokenize
import nltk
import chardet
import grpc
import zemberek_grpc.morphology_pb2 as z_morphology
import zemberek_grpc.morphology_pb2_grpc as z_morphology_g
from functools import lru_cache

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the NLTK resources in quiet mode (no progress output).
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)

from nltk.corpus import stopwords

# Turkish stopwords shipped with NLTK
turkish_stopwords = set(stopwords.words('turkish'))

# Extended Turkish stopword list (same as in the first code)
extended_turkish_stopwords = {
    'a', 'acaba', 'acep', 'adamakÄ±llÄ±', 'adeta', 'ait', 'altmÄ±ÅŸ', 'altÄ±',
    'ama', 'amma', 'anca', 'ancak', 'arada', 'artÄ±k', 'aslÄ±nda', 'aynen', 'ayrÄ±ca',
    'az', 'aÃ§Ä±kÃ§a', 'aÃ§Ä±kÃ§asÄ±', 'bana', 'bari', 'bazen', 'bazÄ±', 'baÅŸkasÄ±',
    'belki', 'ben', 'benden', 'beni', 'benim', 'beri', 'beriki', 'beÅŸ',
    'bilcÃ¼mle', 'bile', 'bin', 'binaen', 'binaenaleyh', 'bir', 'biraz',
    'birazdan', 'birbiri', 'birden', 'birdenbire', 'biri', 'birice', 'birileri',
    'birisi', 'birkaÃ§', 'birkaÃ§Ä±', 'birkez', 'birlikte', 'birÃ§ok', 'birÃ§oÄŸu',
    'bir ÅŸey', 'bir ÅŸeyi', 'birÅŸey', 'birÅŸeyi', 'bitevi', 'biteviye',
    'bittabi', 'biz', 'bizatihi', 'bizce', 'bizcileyin', 'bizden', 'bize', 'bizi',
    'bizim', 'bizimki', 'bizzat', 'boÅŸuna', 'bu', 'buna', 'bunda', 'bundan',
    'bunlar', 'bunlarÄ±', 'bunlarÄ±n', 'bunu', 'bunun', 'buracÄ±kta', 'burada',
    'buradan', 'burasÄ±', 'bÃ¶yle', 'bÃ¶ylece', 'bÃ¶ylecene', 'bÃ¶ylelikle',
    'bÃ¶ylemesine', 'bÃ¶ylesine', 'bÃ¼sbÃ¼tÃ¼n', 'bÃ¼tÃ¼n', 'cuk', 'cÃ¼mlesi', 'da',
    'daha', 'dahi', 'dahil', 'dahilen', 'daima', 'dair', 'dayanarak', 'de', 'defa',
    'dek', 'demin', 'demincek', 'deminden', 'denli', 'derakap', 'derhal', 'derken',
    'deÄŸil', 'deÄŸin', 'diye', 'diÄŸer', 'diÄŸeri', 'doksan', 'dokuz',
    'dolayÄ±', 'dolayÄ±sÄ±yla', 'doÄŸru', 'dÃ¶rt', 'edecek', 'eden', 'ederek', 'edilecek',
    'ediliyor', 'edilmesi', 'ediyor', 'elbet', 'elbette', 'elli', 'emme', 'en',
    'enikonu', 'epey', 'epeyce', 'epeyi', 'esasen', 'esnasÄ±nda', 'etmesi', 'etraflÄ±',
    'etraflÃ­ca', 'etti', 'ettiÄŸi', 'ettiÄŸini', 'evleviyetle', 'evvel', 'evvela',
    'evvelce', 'evvelden', 'evvelemirde', 'evveli', 'eÄ‘er', 'eÄŸer', 'fakat',
    'filanca', 'gah', 'gayet', 'gayetle', 'gayri', 'gayrÄ±', 'gelgelelim', 'gene',
    'gerek', 'gerÃ§i', 'geÃ§ende', 'geÃ§enlerde', 'gibi', 'gibilerden', 'gibisinden',
    'gine', 'gÃ¶re', 'gÄ±rla', 'hakeza', 'halbuki', 'halen', 'halihazÄ±rda', 'haliyle',
    'handiyse', 'hangi', 'hangisi', 'hani', 'hariÃ§', 'hasebiyle', 'hasÄ±lÄ±', 'hatta',
    'hele', 'hem', 'henÃ¼z', 'hep', 'hepsi', 'her', 'herhangi', 'herkes', 'herkesin',
    'hiÃ§', 'hiÃ§bir', 'hiÃ§biri', 'hoÅŸ', 'hulasaten', 'iken', 'iki', 'ila', 'ile',
    'ilen', 'ilgili', 'ilk', 'illa', 'illaki', 'imdi', 'indinde', 'inen', 'insermi',
    'ise', 'ister', 'itibaren', 'itibariyle', 'itibarÄ±yla', 'iyi', 'iyice', 'iyicene',
    'iÃ§in', 'iÅŸ', 'iÅŸte', 'kadar', 'kaffesi', 'kah', 'kala', 'kannÄ±mca',
    'karÅŸÄ±n', 'katrilyon', 'kaynak', 'kaÃ§Ä±', 'kelli', 'kendi', 'kendilerine',
    'kendini', 'kendisi', 'kendisine', 'kendisini', 'kere', 'kez', 'keza',
    'kezalik', 'keÅŸke', 'keÅ£ke', 'ki', 'kim', 'kimden', 'kime', 'kimi', 'kimisi',
    'kimse', 'kimsecik', 'kimsecikler', 'kÃ¼lliyen', 'kÄ±rk',
    'kÄ±saca', 'lakin', 'leh', 'lÃ¼tfen', 'maada', 'madem', 'mademki', 'mamafih',
    'mebni', 'meÄŸer', 'meÄŸerki', 'meÄŸerse', 'milyar', 'milyon', 'mu',
    'mÃ¼', 'mi', 'mÄ±', 'nasÄ±l', 'nasÄ±lsa', 'nazaran', 'naÅŸi', 'ne', 'neden',
    'nedeniyle', 'nedenle', 'nedense', 'nerde', 'nerden', 'nerdeyse', 'nere',
    'nerede', 'nereden', 'neredeyse', 'neresi', 'nereye', 'netekim', 'neye', 'neyi',
    'neyse', 'nice', 'nihayet', 'nihayetinde', 'nitekim', 'niye', 'niÃ§in', 'o',
    'olan', 'olarak', 'oldu', 'olduklarÄ±nÄ±', 'oldukÃ§a', 'olduÄŸu', 'olduÄŸunu',
    'olmadÄ±', 'olmadÄ±ÄŸÄ±', 'olmak', 'olmasÄ±', 'olmayan', 'olmaz', 'olsa', 'olsun',
    'olup', 'olur', 'olursa', 'oluyor', 'on', 'ona', 'onca', 'onculayÄ±n', 'onda',
    'ondan', 'onlar', 'onlardan', 'onlarÄ±', 'onlarÄ±n', 'onu',
    'onun', 'oracÄ±k', 'oracÄ±kta', 'orada', 'oradan', 'oranca', 'oranla', 'oraya',
    'otuz', 'oysa', 'oysaki', 'pek', 'pekala', 'peki', 'pekÃ§e', 'peyderpey', 'raÄŸmen',
    'sadece', 'sahi', 'sahiden', 'sana', 'sanki', 'sekiz', 'seksen', 'sen', 'senden',
    'seni', 'senin', 'siz', 'sizden', 'sizi', 'sizin', 'sonra', 'sonradan',
    'sonralarÄ±', 'sonunda', 'tabii', 'tam', 'tamam', 'tamamen', 'tamamÄ±yla',
    'tarafÄ±ndan', 'tek', 'trilyon', 'tÃ¼m', 'var', 'vardÄ±', 'vasÄ±tasÄ±yla', 've',
    'velev', 'velhasÄ±l', 'velhasÄ±lÄ±kelam', 'veya', 'veyahut', 'ya', 'yahut',
    'yakinen', 'yakÄ±nda', 'yakÄ±ndan', 'yakÄ±nlarda', 'yalnÄ±z', 'yalnÄ±zca', 'yani',
    'yapacak', 'yapmak', 'yaptÄ±', 'yaptÄ±klarÄ±', 'yaptÄ±ÄŸÄ±', 'yaptÄ±ÄŸÄ±nÄ±', 'yapÄ±lan',
    'yapÄ±lmasÄ±', 'yapÄ±yor', 'yedi', 'yeniden', 'yenilerde', 'yerine',
    'yetmiÅŸ', 'yine', 'yirmi', 'yok', 'yoksa', 'yoluyla', 'yÃ¼z', 'yÃ¼zÃ¼nden',
    'zarfÄ±nda', 'zaten', 'zati', 'zira', 'Ã§abuk', 'Ã§abukÃ§a', 'Ã§eÅŸitli', 'Ã§ok',
    'Ã§oklarÄ±', 'Ã§oklarÄ±nca', 'Ã§okluk', 'Ã§oklukla', 'Ã§okÃ§a', 'Ã§oÄŸu', 'Ã§oÄŸun',
    'Ã§oÄŸunca', 'Ã§oÄŸunlukla', 'Ã§Ã¼nkÃ¼', 'Ã¶bÃ¼r', 'Ã¶bÃ¼rkÃ¼', 'Ã¶bÃ¼rÃ¼', 'Ã¶nce', 'Ã¶nceden',
    'Ã¶nceleri', 'Ã¶ncelikle', 'Ã¶teki', 'Ã¶tekisi', 'Ã¶yle', 'Ã¶ylece', 'Ã¶ylelikle',
    'Ã¶ylemesine', 'Ã¶z', 'Ã¼zere', 'Ã¼Ã§', 'ÅŸayet', 'ÅŸey', 'ÅŸeyden', 'ÅŸeyi', 'ÅŸeyler',
    'ÅŸu', 'ÅŸuna', 'ÅŸuncacÄ±k', 'ÅŸunda', 'ÅŸundan', 'ÅŸunlar', 'ÅŸunlarÄ±', 'ÅŸunu',
    'ÅŸunun', 'ÅŸura', 'ÅŸuracÄ±k', 'ÅŸuracÄ±kta', 'ÅŸurasÄ±', 'ÅŸÃ¶yle', 'ÅŸimdi', 'ÅŸÃ¶yle'
}

# Final stopword set: NLTK's Turkish list merged with the extended list above.
all_stopwords = turkish_stopwords | extended_turkish_stopwords

In [3]:
# Zemberek gRPC connection: an insecure (plaintext) channel to a service on localhost:6789.
_zemberek_target = 'localhost:6789'
channel = grpc.insecure_channel(_zemberek_target)
morphology_stub = z_morphology_g.MorphologyServiceStub(channel)

# Lemma and stem lookup helpers
def get_lemmas(word):
    """Return the lemmas of the first Zemberek analysis of ``word``.

    Sends a ``WordAnalysisRequest`` for ``word`` through the module-level
    ``morphology_stub``.

    Args:
        word: The token to analyse.

    Returns:
        list: The lemmas of the first analysis, or an empty list when the
        response contains no analyses or the lookup fails.
    """
    try:
        response = morphology_stub.AnalyzeWord(z_morphology.WordAnalysisRequest(input=word))
        if response.analyses:
            # Copy into a plain list so every return path yields the same type.
            return list(response.analyses[0].lemmas)
        return []
    except Exception:
        # Best-effort lookup: any failure degrades to "no lemmas".
        # Catching ``Exception`` instead of using a bare ``except`` keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        return []

def get_stem(word):
    """Return the first lemma reported by ``get_lemmas`` for ``word``.

    Falls back to ``word`` itself when no lemma is available.
    """
    candidates = get_lemmas(word)
    if not candidates:
        return word
    return candidates[0]
# Stem lookup for a collection of words (each distinct word is resolved once)
def get_stems_batch(words):
    """Map every distinct word in ``words`` to its stem.

    ``get_stem`` is called once per distinct word, in order of first
    appearance; repeated words reuse the first result.

    Returns:
        dict: ``{word: stem}``.
    """
    return {token: get_stem(token) for token in dict.fromkeys(words)}


In [4]:
# Model loading
# NOTE(review): the path below is an absolute, machine-specific location; on any
# other machine loading fails and the notebook falls back to demo mode.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only point this at a model file from a trusted source.
model_path = Path(r"C:\Users\Melek\yapayZeka\FilmTemaAnaliziProje\Models\tam_veri_seti_model4.pkl")
model = None  # stays None when loading fails, so the name always exists
try:
    model = joblib.load(model_path)
    model_loaded = True
except Exception as load_error:
    # Report why loading failed instead of silently hiding the cause.
    print(f"Model {model_path} konumundan yÃ¼klenemedi. Tahmin yapÄ±lmadan demo olarak Ã§alÄ±ÅŸacaktÄ±r.")
    print(f"({type(load_error).__name__}: {load_error})")
    model_loaded = False


In [5]:
# Themes (in the same order as in the first code).
# These names seed the per-theme counters in predict_theme_distribution.
# NOTE(review): presumably they must match the labels the model predicts — confirm.
themes = [
    "romantik", "savaÅŸ", "bilim kurgu", "aksiyon", "dram", "fantastik", "gerilim", "suÃ§",
    "tarih", "mÃ¼zik", "komedi", "korku", "animasyon", "spor", "distoptik", "polisiye"
]

In [6]:
def clean_srt_file(file_path):
    """Read an SRT file and return its subtitle text, one string per cue.

    Cue index lines (digits only) and timestamp lines (containing ``-->``)
    are dropped. Consecutive text lines are joined with a single space, and a
    blank line ends the current cue.

    Args:
        file_path: Path of the UTF-8 encoded SRT file (with or without a BOM).

    Returns:
        list[str]: The joined text of each cue, in file order.
    """
    # 'utf-8-sig' strips a leading byte-order mark if present. With plain
    # 'utf-8' the BOM stays glued to the first cue index, which then fails the
    # digits-only check below and leaks into the first subtitle's text.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    cleaned_lines = []
    pending = []  # text lines collected for the cue currently being read

    for raw_line in lines:
        line = raw_line.strip()

        # Skip cue numbers and timestamp lines.
        # Note: a subtitle line consisting only of digits is skipped as well.
        if re.match(r'^\d+$', line) or '-->' in line:
            continue

        # A blank line closes the cue collected so far.
        if not line:
            if pending:
                cleaned_lines.append(' '.join(pending))
                pending = []
            continue

        # Subtitle sentences are often split across several lines; collect them.
        pending.append(line)

    # Flush the last cue when the file does not end with a blank line.
    if pending:
        cleaned_lines.append(' '.join(pending))

    return cleaned_lines

In [7]:
from nltk import sent_tokenize


def split_into_sentences(lines):
    """Split each entry of ``lines`` into sentences with NLTK's ``sent_tokenize``.

    Every sentence is stripped of surrounding whitespace; sentences that end
    up empty are discarded.

    Returns:
        list[str]: All sentences, in input order.
    """
    stripped = (piece.strip() for text in lines for piece in sent_tokenize(text))
    return [piece for piece in stripped if piece]


In [8]:
def clean_sentence(sentence):
    """Normalise one subtitle sentence.

    Steps, in order: drop bracketed/parenthesised annotations, drop
    ``<...>`` tags, replace every character outside the allowed class with a
    space, lowercase, and collapse runs of whitespace.

    Returns:
        str: The cleaned sentence without leading or trailing whitespace.
    """
    substitutions = (
        # Subtitle annotations such as [laughter] or (whispers)
        (r'\[.*?\]|\(.*?\)', ''),
        # HTML-style tags
        (r'<.*?>', ''),
        # Punctuation: anything that is not a word character, whitespace,
        # or one of the explicitly listed characters becomes a space
        (r'[^\w\sÄŸÃ¼ÅŸÄ±Ã¶Ã§ÄžÃœÅžÄ°Ã–Ã‡0-9]', ' '),
    )

    text = sentence
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Lowercase (digits are unaffected), then squeeze whitespace.
    return re.sub(r'\s+', ' ', text.lower()).strip()

In [27]:
def remove_stopwords_and_stem(sentence):
    """Drop stopwords and short tokens, and stem the remaining words.

    Tokens in ``all_stopwords`` are removed. A token of two characters or
    fewer is kept only when it is all digits or matches the ``word-number``
    pattern. Such numeric tokens are kept verbatim; every other surviving
    token is replaced by ``get_stem(token)``.

    NOTE(review): a later cell defines a function with the same name and an
    extra ``stem_cache`` parameter; when the cells run top to bottom that
    later definition replaces this one.

    Returns:
        str: The processed tokens joined with single spaces.
    """
    literal_pattern = r'^\w+-\d+\.?\d*$'
    kept = []

    for token in word_tokenize(sentence):
        if token in all_stopwords:
            continue
        is_literal = token.isdigit() or bool(re.match(literal_pattern, token))
        if len(token) <= 2 and not is_literal:
            continue
        kept.append(token if is_literal else get_stem(token))

    return ' '.join(kept)

In [9]:
def remove_stopwords_and_stem(sentence, stem_cache=None):
    """Drop stopwords and short tokens, and stem the remaining words with a cache.

    Tokens in ``all_stopwords`` are removed. A token of two characters or
    fewer is kept only when it is all digits or matches the ``word-number``
    pattern. Such numeric tokens are kept verbatim; every other surviving
    token is replaced by its stem.

    Args:
        sentence: The (already cleaned) sentence to process.
        stem_cache: Optional ``{word: stem}`` dict shared between calls. Stems
            found here are reused, and newly resolved stems are added to it.
            A fresh dict is used when omitted.

    Returns:
        str: The processed tokens joined with single spaces.
    """
    if stem_cache is None:
        stem_cache = {}

    literal_pattern = re.compile(r'^\w+-\d+\.?\d*$')

    def is_literal(token):
        # Pure numbers and "word-number" tokens are never stemmed.
        return token.isdigit() or literal_pattern.match(token) is not None

    kept = [
        w for w in word_tokenize(sentence)
        if w not in all_stopwords and (len(w) > 2 or is_literal(w))
    ]

    # Resolve only words that will actually be stemmed and are not cached yet.
    missing = [w for w in kept if not is_literal(w) and w not in stem_cache]
    if missing:
        stem_cache.update(get_stems_batch(missing))

    processed_words = []
    for w in kept:
        if is_literal(w):
            processed_words.append(w)
        elif w in stem_cache:
            processed_words.append(stem_cache[w])
        else:
            # Explicit branch instead of ``stem_cache.get(w, get_stem(w))``:
            # the default argument of ``dict.get`` is evaluated eagerly, which
            # triggered a stem lookup for every word even on a cache hit.
            processed_words.append(get_stem(w))

    return ' '.join(processed_words)

In [23]:
def create_chunks(sentences, chunk_size=25, stride=12):
    """Build overlapping chunks of sentences (intended to mirror the training strategy).

    A window of ``chunk_size`` sentences is taken every ``stride`` sentences
    for as long as a full window fits; each window is joined with spaces.
    If ``len(sentences)`` is not a multiple of ``chunk_size``, one extra chunk
    holding the last ``len(sentences) % chunk_size`` sentences is appended.

    NOTE(review): the tail chunk is sized by ``chunk_size``, not by ``stride``,
    so it can repeat sentences already covered by the last window, and when the
    length is an exact multiple of ``chunk_size`` trailing sentences past the
    last full window are not emitted. Left as-is on the assumption that it
    matches training — confirm.

    Returns:
        list[str]: The chunks, windows first, then the optional tail chunk.
    """
    total = len(sentences)
    chunks = [
        ' '.join(sentences[start:start + chunk_size])
        for start in range(0, total - chunk_size + 1, stride)
    ]

    leftover = total % chunk_size
    if leftover:
        chunks.append(' '.join(sentences[-leftover:]))
    return chunks

In [25]:
def predict_theme_distribution(chunks):
    """Return the share of chunks predicted for each theme, in percent.

    Args:
        chunks: Sequence of text chunks passed to ``model.predict``.

    Returns:
        dict: ``{theme: percentage}``. Every entry of ``themes`` is present
        (0.0 when never predicted); labels predicted by the model that are not
        in ``themes`` are added as extra keys. When there are no chunks, every
        theme maps to 0.0.

        Demo mode (``model_loaded`` is false): each theme gets an independent
        random value drawn from ``np.random.uniform(0, 1)``. These are
        placeholders, not percentages, and do not sum to 100.
    """
    if not model_loaded:
        return {theme: np.random.uniform(0, 1) for theme in themes}

    # Nothing to classify: avoid calling the model and dividing by zero below.
    if len(chunks) == 0:
        return {theme: 0.0 for theme in themes}

    predictions = model.predict(chunks)

    # Start every known theme at zero so it appears in the result even if unseen.
    theme_counts = {theme: 0 for theme in themes}
    for pred in predictions:
        theme_counts[pred] = theme_counts.get(pred, 0) + 1

    total = len(predictions)
    if total == 0:
        return {theme: 0.0 for theme in theme_counts}

    return {theme: (count / total) * 100 for theme, count in theme_counts.items()}


In [26]:
def plot_theme_distribution(theme_percentages):
    """Draw a bar chart of the theme values, largest first.

    Args:
        theme_percentages: ``{theme: value}`` mapping to plot.

    Returns:
        The ``matplotlib.pyplot`` module itself, with the new figure current.

    NOTE(review): each call opens a new pyplot figure that is never closed
    here; in a long-running app the caller may need to close figures after
    rendering — confirm.
    """
    ranked = sorted(theme_percentages.items(), key=lambda pair: pair[1], reverse=True)
    labels = [name for name, _ in ranked]
    values = [share for _, share in ranked]

    plt.figure(figsize=(12, 8))
    bars = plt.bar(labels, values, color='skyblue')
    plt.xlabel('Film TemalarÄ±')
    plt.ylabel('YÃ¼zdelik (%)')
    plt.title('Film Tema DaÄŸÄ±lÄ±mÄ±')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Write each bar's value just above it, centred horizontally.
    for bar in bars:
        top = bar.get_height()
        center = bar.get_x() + bar.get_width() / 2.
        plt.text(center, top + 0.5, f'{top:.1f}%',
                 ha='center', va='bottom', rotation=0)

    return plt


In [27]:
from Frontend.film_analiz_arayuzu import process_subtitle_file


def read_file_with_auto_encoding(file_path):
    """Read a text file, working out its character encoding automatically.

    The raw bytes are decoded with chardet's guess first, then with a fixed
    list of fallbacks; the first encoding that decodes without error wins and
    its name is printed.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: The decoded text, without a leading byte-order mark.

    Raises:
        ValueError: If none of the candidate encodings can decode the file.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read()

    # chardet reports None as the encoding when it cannot make a guess.
    detected = chardet.detect(raw_data)['encoding']

    encodings_to_try = [
        detected,
        'utf-8',
        'cp1254',
        'iso-8859-9',
        'latin-5',
        'iso-8859-1',
        'windows-1252'
    ]

    for enc in encodings_to_try:
        if enc is None:
            continue
        try:
            text = raw_data.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet can name an encoding for which this Python
            # build has no codec; treat it like a failed decode and move on.
            continue
        print(f"{enc} kodlamasÄ±yla baÅŸarÄ±yla Ã§Ã¶zÃ¼ldÃ¼.")
        # Plain 'utf-8' keeps a byte-order mark as U+FEFF; drop it so it does
        # not end up glued to the first line of the subtitle.
        if text.startswith('\ufeff'):
            text = text[1:]
        return text

    raise ValueError("Dosya bilinen hiÃ§bir karakter kodlamasÄ±yla Ã§Ã¶zÃ¼lemedi.")

def _decode_uploaded_bytes(raw):
    """Decode raw bytes: chardet's guess first, then a fixed list of fallbacks."""
    # chardet reports None as the encoding when it cannot make a guess.
    detected = chardet.detect(raw)['encoding']
    for enc in [detected, 'utf-8', 'cp1254', 'iso-8859-9', 'latin-5', 'iso-8859-1']:
        if enc is None:
            continue
        try:
            return raw.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # LookupError: chardet named an encoding Python has no codec for.
            continue
    raise ValueError("Dosya hiÃ§bir yaygÄ±n kodlamayla Ã§Ã¶zÃ¼mlenemedi.")


def process_file(file):
    """Analyse an uploaded subtitle file and build the report and chart.

    Args:
        file: One of
            * an object with a ``name`` attribute holding a file path
              (read via ``read_file_with_auto_encoding``),
            * raw ``bytes`` of the file (encoding is detected), or
            * a ``str`` holding the subtitle text itself.

    Returns:
        tuple: ``(report, figure)`` on success. On failure or when no usable
        text is found, ``(message, None)``; exceptions are reported through the
        message rather than raised.
    """
    try:
        if hasattr(file, 'name'):
            file_content = read_file_with_auto_encoding(file.name)
        elif isinstance(file, bytes):
            # Previously a None or unknown encoding from chardet raised
            # TypeError/LookupError here and skipped the fallback encodings.
            file_content = _decode_uploaded_bytes(file)
        elif isinstance(file, str):
            file_content = file
        else:
            raise ValueError("Desteklenmeyen dosya formatÄ±. LÃ¼tfen metin tabanlÄ± bir SRT dosyasÄ± yÃ¼kleyin.")

        processed_sentences = process_subtitle_file(file_content)

        if not processed_sentences:
            return "Dosyada geÃ§erli bir metin bulunamadÄ±. LÃ¼tfen dosya formatÄ±nÄ± kontrol edin.", None

        chunks = create_chunks(processed_sentences)
        theme_percentages = predict_theme_distribution(chunks)
        fig = plot_theme_distribution(theme_percentages)

        # One line per theme, highest value first.
        report = "ðŸŽ¬ Film Tema Analizi SonuÃ§larÄ±:\n\n"
        for theme, percentage in sorted(theme_percentages.items(), key=lambda x: x[1], reverse=True):
            report += f"â€¢ {theme.capitalize()}: %{percentage:.1f}\n"

        return report, fig
    except Exception as e:
        # The UI shows the message in the result box instead of crashing.
        return f"Bir hata oluÅŸtu: {str(e)}", None

In [28]:
# Gradio interface (same as the original code).
# Layout: a title, then one row with two columns (upload + button on the left,
# text report on the right), and the chart underneath the row.
with gr.Blocks(title="Film Tema Analizi") as app:
    gr.Markdown("# ðŸŽ¬ Film Tema Analizi AracÄ±")

    with gr.Row():
        with gr.Column():
            # Upload widget restricted to .srt / .txt files, plus the trigger button.
            file_input = gr.File(label="AltyazÄ± DosyasÄ± YÃ¼kle (.srt veya .txt)", file_types=[".srt", ".txt"])
            analyze_btn = gr.Button("TemalarÄ± Analiz Et", variant="primary")

        with gr.Column():
            # Read-only text box that receives the report string.
            result_text = gr.Textbox(label="Analiz SonuÃ§larÄ±", lines=12, interactive=False)

    chart_output = gr.Plot(label="Tema DaÄŸÄ±lÄ±mÄ± GrafiÄŸi")

    # process_file returns (report, figure), mapped onto the two outputs in order.
    analyze_btn.click(
        fn=process_file,
        inputs=[file_input],
        outputs=[result_text, chart_output]
    )

In [29]:
# Start the application.
# The guard keeps the server from starting if this code is imported as a module.
if __name__ == "__main__":
    app.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.
